1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
24#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/CallingConv.h"
42#include "llvm/IR/Constants.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalAlias.h"
48#include "llvm/IR/IRBuilder.h"
50#include "llvm/IR/Intrinsics.h"
52#include "llvm/MC/MCAsmInfo.h"
53#include "llvm/MC/MCContext.h"
54#include "llvm/MC/MCExpr.h"
55#include "llvm/MC/MCSymbol.h"
57#include "llvm/Support/Debug.h"
62#include <algorithm>
63#include <bitset>
64#include <cctype>
65#include <numeric>
66using namespace llvm;
67
68#define DEBUG_TYPE "x86-isel"
69
71 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
73 "Sets the preferable loop alignment for experiments (as log2 bytes) "
74 "for innermost loops only. If specified, this option overrides "
75 "alignment set by x86-experimental-pref-loop-alignment."),
77
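  // For experiments only, e.g. "llc -x86-experimental-pref-innermost-loop-alignment=5"
  // requests 2^5 = 32-byte alignment for innermost loops, overriding the value
  // set by x86-experimental-pref-loop-alignment.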
79 "x86-br-merging-base-cost", cl::init(2),
81 "Sets the cost threshold for when multiple conditionals will be merged "
82 "into one branch versus be split in multiple branches. Merging "
83 "conditionals saves branches at the cost of additional instructions. "
84 "This value sets the instruction cost limit, below which conditionals "
85 "will be merged, and above which conditionals will be split. Set to -1 "
86 "to never merge branches."),
88
90 "x86-br-merging-ccmp-bias", cl::init(6),
91 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
92 "supports conditional compare instructions."),
94
95static cl::opt<bool>
96 WidenShift("x86-widen-shift", cl::init(true),
97 cl::desc("Replace narrow shifts with wider shifts."),
99
101 "x86-br-merging-likely-bias", cl::init(0),
102 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
103 "that all conditionals will be executed. For example for merging "
104 "the conditionals (a == b && c > d), if its known that a == b is "
105 "likely, then it is likely that if the conditionals are split "
106 "both sides will be executed, so it may be desirable to increase "
107 "the instruction cost threshold. Set to -1 to never merge likely "
108 "branches."),
109 cl::Hidden);
110
112 "x86-br-merging-unlikely-bias", cl::init(-1),
113 cl::desc(
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
115 "that all conditionals will be executed. For example for merging "
116 "the conditionals (a == b && c > d), if its known that a == b is "
117 "unlikely, then it is unlikely that if the conditionals are split "
118 "both sides will be executed, so it may be desirable to decrease "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
120 "branches."),
121 cl::Hidden);
122
124 "mul-constant-optimization", cl::init(true),
125 cl::desc("Replace 'mul x, Const' with more effective instructions like "
126 "SHIFT, LEA, etc."),
127 cl::Hidden);
128
130 const X86Subtarget &STI)
131 : TargetLowering(TM), Subtarget(STI) {
132 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
133 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
134
135 // Set up the TargetLowering object.
136
137 // X86 is weird. It always uses i8 for shift amounts and setcc results.
139 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
141
142 // X86 instruction cache is coherent with its data cache so we can use the
143 // default expansion to a no-op.
145
146 // For 64-bit, since we have so many registers, use the ILP scheduler.
147 // For 32-bit, use the register pressure specific scheduling.
148 // For Atom, always use ILP scheduling.
149 if (Subtarget.isAtom())
151 else if (Subtarget.is64Bit())
153 else
155 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
157
158 // Bypass expensive divides and use cheaper ones.
159 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
160 if (Subtarget.hasSlowDivide32())
161 addBypassSlowDiv(32, 8);
162 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
163 addBypassSlowDiv(64, 32);
164 }
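  // Roughly, addBypassSlowDiv(32, 8) guards a 32-bit divide with a cheap
  // range check and uses the much faster 8-bit DIV when both operands fit in
  // a byte; likewise the 64-bit case falls back to a 32-bit divide.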
165
166 // Setup Windows compiler runtime calls.
167 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
168 static const struct {
169 const RTLIB::Libcall Op;
170 const char * const Name;
171 const CallingConv::ID CC;
172 } LibraryCalls[] = {
173 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
174 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
175 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
176 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
177 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
178 };
179
180 for (const auto &LC : LibraryCalls) {
181 setLibcallName(LC.Op, LC.Name);
182 setLibcallCallingConv(LC.Op, LC.CC);
183 }
184 }
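  // For example, an i64 sdiv on 32-bit Windows is emitted as a call to the
  // CRT helper _alldiv with the stdcall convention, per the table above.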
185
186 if (Subtarget.canUseCMPXCHG16B())
188 else if (Subtarget.canUseCMPXCHG8B())
190 else
192
193 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
194
196
197 // Set up the register classes.
198 addRegisterClass(MVT::i8, &X86::GR8RegClass);
199 addRegisterClass(MVT::i16, &X86::GR16RegClass);
200 addRegisterClass(MVT::i32, &X86::GR32RegClass);
201 if (Subtarget.is64Bit())
202 addRegisterClass(MVT::i64, &X86::GR64RegClass);
203
204 for (MVT VT : MVT::integer_valuetypes())
206
207 // We don't accept any truncstore of integer registers.
208 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
209 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
211 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
212 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
213 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
214
215 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
216
217 // SETOEQ and SETUNE require checking two conditions.
218 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
221 }
222
223 // Integer absolute.
224 if (Subtarget.canUseCMOV()) {
225 setOperationAction(ISD::ABS , MVT::i16 , Custom);
226 setOperationAction(ISD::ABS , MVT::i32 , Custom);
227 if (Subtarget.is64Bit())
228 setOperationAction(ISD::ABS , MVT::i64 , Custom);
229 }
230
231 // Absolute difference.
232 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
233 setOperationAction(Op , MVT::i8 , Custom);
234 setOperationAction(Op , MVT::i16 , Custom);
235 setOperationAction(Op , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(Op , MVT::i64 , Custom);
238 }
239
240 // Signed saturation subtraction.
244 if (Subtarget.is64Bit())
246
247 // Funnel shifts.
248 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
249 // For slow shld targets we only lower for code size.
250 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
251
252 setOperationAction(ShiftOp , MVT::i8 , Custom);
253 setOperationAction(ShiftOp , MVT::i16 , Custom);
254 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
255 if (Subtarget.is64Bit())
256 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
257 }
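  // Roughly: on fast-SHLD targets an i32 fshl(a, b, c) can become a single
  // "shld eax, ebx, cl"; on slow-SHLD CPUs the custom lowering only forms
  // SHLD/SHRD when optimizing for size.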
258
259 if (!Subtarget.useSoftFloat()) {
260 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
261 // operation.
266 // We have an algorithm for SSE2, and we turn this into a 64-bit
267 // FILD or VCVTUSI2SS/SD for other targets.
270 // We have an algorithm for SSE2->double, and we turn this into a
271 // 64-bit FILD followed by conditional FADD for other targets.
274
275 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
276 // this operation.
279 // SSE has no i16 to fp conversion, only i32. We promote in the handler
280 // to allow f80 to use i16 and f64 to use i16 with sse1 only
283 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
287 // are Legal, f80 is custom lowered.
290
291 // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
292 // this operation.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
301 // are Legal, f80 is custom lowered.
304
305 // Handle FP_TO_UINT by promoting the destination to a larger signed
306 // conversion.
308 // FIXME: This doesn't generate invalid exception when it should. PR44019.
311 // FIXME: This doesn't generate invalid exception when it should. PR44019.
317
322
323 if (!Subtarget.is64Bit()) {
326 }
327 }
328
329 if (Subtarget.hasSSE2()) {
330 // Custom lowering for saturating float to int conversions.
331 // We handle promotion to larger result types manually.
332 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
335 }
338 if (Subtarget.is64Bit()) {
341 }
342 }
343 if (Subtarget.hasAVX10_2()) {
346 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
347 MVT::v4i64}) {
350 }
351 if (Subtarget.hasAVX10_2_512()) {
354 }
355 if (Subtarget.is64Bit()) {
358 }
359 }
360
361 // Handle address space casts between mixed sized pointers.
364
365 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
366 if (!Subtarget.hasSSE2()) {
371 if (Subtarget.is64Bit()) {
373 // Without SSE, i64->f64 goes through memory.
375 }
376 } else if (!Subtarget.is64Bit())
378
379 // Scalar integer divide and remainder are lowered to use operations that
380 // produce two results, to match the available instructions. This exposes
381 // the two-result form to trivial CSE, which is able to combine x/y and x%y
382 // into a single instruction.
383 //
384 // Scalar integer multiply-high is also lowered to use two-result
385 // operations, to match the available instructions. However, plain multiply
386 // (low) operations are left as Legal, as there are single-result
387 // instructions for this in x86. Using the two-result multiply instructions
388 // when both high and low results are needed must be arranged by dagcombine.
389 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
396 }
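  // E.g. when both x / y and x % y are needed, CSE leaves a single two-result
  // divrem node that selects to one DIV/IDIV, with the quotient in EAX/RAX and
  // the remainder in EDX/RDX.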
397
398 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
400 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
401 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
404 }
405 if (Subtarget.is64Bit())
410
411 setOperationAction(ISD::FREM , MVT::f32 , Expand);
412 setOperationAction(ISD::FREM , MVT::f64 , Expand);
413 setOperationAction(ISD::FREM , MVT::f80 , Expand);
414 setOperationAction(ISD::FREM , MVT::f128 , Expand);
415
416 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
422 }
423
424 // Promote the i8 variants and force them on up to i32 which has a shorter
425 // encoding.
426 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
428 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
429 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
430 // promote that too.
431 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
433
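  // I.e. an i8/i16 cttz is widened and counted in a 32-bit register, so it
  // encodes as tzcnt / rep-bsf on the full register and sidesteps the tzcntw
  // false dependency mentioned above.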
434 if (!Subtarget.hasBMI()) {
435 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
437 if (Subtarget.is64Bit()) {
438 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
440 }
441 }
442
443 if (Subtarget.hasLZCNT()) {
444 // When promoting the i8 variants, force them to i32 for a shorter
445 // encoding.
446 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
448 } else {
449 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
450 if (VT == MVT::i64 && !Subtarget.is64Bit())
451 continue;
454 }
455 }
456
459 // Special handling for half-precision floating point conversions.
460 // If we don't have F16C support, then lower half float conversions
461 // into library calls.
463 Op, MVT::f32,
464 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
465 // There's never any support for operations beyond MVT::f32.
466 setOperationAction(Op, MVT::f64, Expand);
467 setOperationAction(Op, MVT::f80, Expand);
468 setOperationAction(Op, MVT::f128, Expand);
469 }
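  // Roughly: with F16C the f16<->f32 conversions map to VCVTPH2PS/VCVTPS2PH;
  // without it they become the __extendhfsf2/__truncsfhf2 libcalls, and there
  // is never direct support for conversions involving types wider than f32.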
470
471 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
474 }
475
476 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
477 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
478 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
479 setTruncStoreAction(VT, MVT::f16, Expand);
480 setTruncStoreAction(VT, MVT::bf16, Expand);
481
484 }
485
489 if (Subtarget.is64Bit())
491 if (Subtarget.hasPOPCNT()) {
492 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
493 // popcntw is longer to encode than popcntl and also has a false dependency
494 // on the dest that popcntl hasn't had since Cannon Lake.
495 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
496 } else {
501 }
502
504
505 if (!Subtarget.hasMOVBE())
507
508 // X86 wants to expand cmov itself.
509 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
514 }
515 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
520 }
521
522 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
525
527 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
528 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
532 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
533 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
534
535 // Darwin ABI issue.
536 for (auto VT : { MVT::i32, MVT::i64 }) {
537 if (VT == MVT::i64 && !Subtarget.is64Bit())
538 continue;
545 }
546
547 // 64-bit shl, sra, srl (iff 32-bit x86)
548 for (auto VT : { MVT::i32, MVT::i64 }) {
549 if (VT == MVT::i64 && !Subtarget.is64Bit())
550 continue;
554 }
555
556 if (Subtarget.hasSSEPrefetch())
558
560
561 // Expand certain atomics
562 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
570 }
571
572 if (!Subtarget.is64Bit())
574
575 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
576 // All CPUs supporting AVX will atomically load/store aligned 128-bit
577 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
580 }
581
582 if (Subtarget.canUseCMPXCHG16B())
584
585 // FIXME - use subtarget debug flags
586 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
587 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
588 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
590 }
591
594
597
598 setOperationAction(ISD::TRAP, MVT::Other, Legal);
600 if (Subtarget.isTargetPS())
602 else
604
605 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
607 setOperationAction(ISD::VAEND , MVT::Other, Expand);
608 bool Is64Bit = Subtarget.is64Bit();
609 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
610 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
611
614
616
617 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
620
622
623 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
624 setOperationAction(ISD::FABS, VT, Action);
625 setOperationAction(ISD::FNEG, VT, Action);
627 setOperationAction(ISD::FREM, VT, Action);
628 setOperationAction(ISD::FMA, VT, Action);
629 setOperationAction(ISD::FMINNUM, VT, Action);
630 setOperationAction(ISD::FMAXNUM, VT, Action);
635 setOperationAction(ISD::FSIN, VT, Action);
636 setOperationAction(ISD::FCOS, VT, Action);
637 setOperationAction(ISD::FSINCOS, VT, Action);
638 setOperationAction(ISD::FTAN, VT, Action);
639 setOperationAction(ISD::FSQRT, VT, Action);
640 setOperationAction(ISD::FPOW, VT, Action);
641 setOperationAction(ISD::FPOWI, VT, Action);
642 setOperationAction(ISD::FLOG, VT, Action);
643 setOperationAction(ISD::FLOG2, VT, Action);
644 setOperationAction(ISD::FLOG10, VT, Action);
645 setOperationAction(ISD::FEXP, VT, Action);
646 setOperationAction(ISD::FEXP2, VT, Action);
647 setOperationAction(ISD::FEXP10, VT, Action);
648 setOperationAction(ISD::FCEIL, VT, Action);
649 setOperationAction(ISD::FFLOOR, VT, Action);
651 setOperationAction(ISD::FRINT, VT, Action);
652 setOperationAction(ISD::BR_CC, VT, Action);
653 setOperationAction(ISD::SETCC, VT, Action);
656 setOperationAction(ISD::FROUND, VT, Action);
658 setOperationAction(ISD::FTRUNC, VT, Action);
659 setOperationAction(ISD::FLDEXP, VT, Action);
660 };
661
662 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
663 // f16, f32 and f64 use SSE.
664 // Set up the FP register classes.
665 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
666 : &X86::FR16RegClass);
667 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
668 : &X86::FR32RegClass);
669 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
670 : &X86::FR64RegClass);
671
672 // Disable f32->f64 extload as we can only generate this in one instruction
673 // under optsize. So it's easier to pattern match (fpext (load)) for that
674 // case instead of needing to emit 2 instructions for extload in the
675 // non-optsize case.
676 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
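  // I.e. (fpext (load f32)) may fold the load into CVTSS2SD when optimizing
  // for size; otherwise it is emitted as a separate load plus a
  // register-to-register conversion.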
677
678 for (auto VT : { MVT::f32, MVT::f64 }) {
679 // Use ANDPD to simulate FABS.
681
682 // Use XORP to simulate FNEG.
684
685 // Use ANDPD and ORPD to simulate FCOPYSIGN.
687
688 // These might be better off as horizontal vector ops.
691
692 // We don't support sin/cos/fmod
696 }
697
698 // Half type will be promoted by default.
699 setF16Action(MVT::f16, Promote);
707
738
739 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
740 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
741
742 // Lower this to MOVMSK plus an AND.
745
746 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
747 (UseX87 || Is64Bit)) {
748 // Use SSE for f32, x87 for f64.
749 // Set up the FP register classes.
750 addRegisterClass(MVT::f32, &X86::FR32RegClass);
751 if (UseX87)
752 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
753
754 // Use ANDPS to simulate FABS.
756
757 // Use XORP to simulate FNEG.
759
760 if (UseX87)
762
763 // Use ANDPS and ORPS to simulate FCOPYSIGN.
764 if (UseX87)
767
768 // We don't support sin/cos/fmod
772
773 if (UseX87) {
774 // Always expand sin/cos functions even though x87 has an instruction.
778 }
779 } else if (UseX87) {
780 // f32 and f64 in x87.
781 // Set up the FP register classes.
782 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
783 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
784
785 for (auto VT : { MVT::f32, MVT::f64 }) {
788
789 // Always expand sin/cos functions even though x87 has an instruction.
793 }
794 }
795
796 // Expand FP32 immediates into loads from the stack, save special cases.
797 if (isTypeLegal(MVT::f32)) {
798 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
799 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
800 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
801 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
802 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
803 } else // SSE immediates.
804 addLegalFPImmediate(APFloat(+0.0f)); // xorps
805 }
806 // Expand FP64 immediates into loads from the stack, save special cases.
807 if (isTypeLegal(MVT::f64)) {
808 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
809 addLegalFPImmediate(APFloat(+0.0)); // FLD0
810 addLegalFPImmediate(APFloat(+1.0)); // FLD1
811 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
812 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
813 } else // SSE immediates.
814 addLegalFPImmediate(APFloat(+0.0)); // xorpd
815 }
816 // Support fp16 0 immediate.
817 if (isTypeLegal(MVT::f16))
818 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
819
820 // Handle constrained floating-point operations of scalar.
833
834 // We don't support FMA.
837
838 // f80 always uses X87.
839 if (UseX87) {
840 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
843 {
845 addLegalFPImmediate(TmpFlt); // FLD0
846 TmpFlt.changeSign();
847 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
848
849 bool ignored;
850 APFloat TmpFlt2(+1.0);
852 &ignored);
853 addLegalFPImmediate(TmpFlt2); // FLD1
854 TmpFlt2.changeSign();
855 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
856 }
857
858 // Always expand sin/cos functions even though x87 has an instruction.
859 // clang-format off
871 // clang-format on
872
884
885 // Handle constrained floating-point operations of scalar.
892 if (isTypeLegal(MVT::f16)) {
895 } else {
897 }
898 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
899 // as Custom.
901 }
902
903 // f128 uses xmm registers, but most operations require libcalls.
904 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
905 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
906 : &X86::VR128RegClass);
907
908 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
909
920
924
925 // clang-format off
933 // clang-format on
934 // No STRICT_FSINCOS
937
940 // We need to custom handle any FP_ROUND with an f128 input, but
941 // LegalizeDAG uses the result type to know when to run a custom handler.
942 // So we have to list all legal floating point result types here.
943 if (isTypeLegal(MVT::f32)) {
946 }
947 if (isTypeLegal(MVT::f64)) {
950 }
951 if (isTypeLegal(MVT::f80)) {
955 }
956
958
959 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
960 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
961 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
962 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
963 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
964 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
965 }
966
967 // Always use a library call for pow.
968 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
969 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
970 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
971 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
972
981
982 // Some FP actions are always expanded for vector types.
983 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
984 MVT::v4f32, MVT::v8f32, MVT::v16f32,
985 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
986 // clang-format off
1000 // clang-format on
1001 }
1002
1003 // First set operation action for all vector types to either promote
1004 // (for widening) or expand (for scalarization). Then we will selectively
1005 // turn on ones that can be effectively codegen'd.
1045 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1046 setTruncStoreAction(InnerVT, VT, Expand);
1047
1048 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1049 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1050
1051 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1052 // types, we have to deal with them whether we ask for Expansion or not.
1053 // Setting Expand causes its own optimisation problems though, so leave
1054 // them legal.
1055 if (VT.getVectorElementType() == MVT::i1)
1056 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1057
1058 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1059 // split/scalarized right now.
1060 if (VT.getVectorElementType() == MVT::f16 ||
1061 VT.getVectorElementType() == MVT::bf16)
1062 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1063 }
1064 }
1065
1066 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1067 // with -msoft-float, disable use of MMX as well.
1068 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1069 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1070 // No operations on x86mmx supported, everything uses intrinsics.
1071 }
1072
1073 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1074 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1075 : &X86::VR128RegClass);
1076
1081
1082 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1083 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1091
1092 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1093 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1095
1101 }
1102
1103 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1104 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1105 : &X86::VR128RegClass);
1106
1107 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1108 // registers cannot be used even for integer operations.
1109 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1110 : &X86::VR128RegClass);
1111 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1112 : &X86::VR128RegClass);
1113 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1114 : &X86::VR128RegClass);
1115 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1116 : &X86::VR128RegClass);
1117 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1118 : &X86::VR128RegClass);
1119
1120 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1125 }
1126
1127 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1128 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1133 }
1134
1135 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1136 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1137 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1138
1139 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1140 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1141 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1142 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1143 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1144 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1145 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1146 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1147 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1148 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1151
1152 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1153 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1154 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1155
1156 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1158 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1160
1161 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1162
1163 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1164 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1165 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1166 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1167 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1168 }
1169
1180
1185
1186 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1192
1193 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1194 // setcc all the way to isel and prefer SETGT in some isel patterns.
1197 }
1198
1199 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1200 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1205
1206 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1212 }
1213
1214 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1218
1219 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1220 continue;
1221
1224 }
1225 setF16Action(MVT::v8f16, Expand);
1226 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1227 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1228 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1229 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1230 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1231 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1233
1234 // Custom lower v2i64 and v2f64 selects.
1241
1248
1249 // Custom legalize these to avoid over promotion or custom promotion.
1250 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1255 }
1256
1261
1264
1267
1268 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1273
1278
1279 // We want to legalize this to an f64 load rather than an i64 load on
1280 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1281 // store.
1282 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1283 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1284 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1285 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1286 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1288
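  // E.g. a v2i32 load then becomes a single 64-bit (f64-typed) load on x86-64
  // and a pair of 32-bit loads on 32-bit targets, as described above.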
1289 // Add 32-bit vector stores to help vectorization opportunities.
1290 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1292
1296 if (!Subtarget.hasAVX512())
1298
1302
1304
1321
1322 // In the customized shift lowering, the legal v4i32/v2i64 cases
1323 // in AVX2 will be recognized.
1324 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1328 if (VT == MVT::v2i64) continue;
1333 }
1334
1340 }
1341
1342 if (Subtarget.hasGFNI()) {
1347 }
1348
1349 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1350 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1351 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1352 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1353
1354 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1357 }
1358
1359 // These might be better off as horizontal vector ops.
1364 }
1365
1366 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1367 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1370 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1374 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1380
1382 }
1383
1384 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1385 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1386 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1387 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1388 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1389 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1390 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1391 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1392
1396
1397 // FIXME: Do we need to handle scalar-to-vector here?
1398 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1399 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1400
1401 // We directly match byte blends in the backend as they match the VSELECT
1402 // condition form.
1404
1405 // SSE41 brings specific instructions for doing vector sign extend even in
1406 // cases where we don't have SRA.
1407 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1410 }
1411
1412 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1413 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1414 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1415 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1416 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1417 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1418 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1419 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1420 }
1421
1422 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1423 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1424 // do the pre and post work in the vector domain.
1427 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1428 // so that DAG combine doesn't try to turn it into uint_to_fp.
1431 }
1432 }
1433
1434 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1436 }
1437
1438 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1439 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1440 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1443 }
1444
1445 // XOP can efficiently perform BITREVERSE with VPPERM.
1446 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1448 }
1449
1450 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1451 bool HasInt256 = Subtarget.hasInt256();
1452
1453 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1454 : &X86::VR256RegClass);
1455 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1456 : &X86::VR256RegClass);
1457 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1458 : &X86::VR256RegClass);
1459 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1460 : &X86::VR256RegClass);
1461 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1462 : &X86::VR256RegClass);
1463 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1464 : &X86::VR256RegClass);
1465 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1466 : &X86::VR256RegClass);
1467
1468 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1481
1483
1487
1493 }
1494
1495 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1496 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1497
1498 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1499 // even though v8i16 is a legal type.
1500 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1501 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1502 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1503 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1507
1514
1526
1527 if (!Subtarget.hasAVX512())
1529
1530 // In the customized shift lowering, the legal v8i32/v4i64 cases
1531 // in AVX2 will be recognized.
1532 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1538 if (VT == MVT::v4i64) continue;
1543 }
1544
1545 // These types need custom splitting if their input is a 128-bit vector.
1550
1554 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1555 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1558
1559 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1563 }
1564
1569
1570 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1575
1576 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1577 // setcc all the way to isel and prefer SETGT in some isel patterns.
1580 }
1581
1582 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1583 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1588
1589 if (Subtarget.hasAnyFMA()) {
1590 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1591 MVT::v2f64, MVT::v4f64 }) {
1594 }
1595 }
1596
1597 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1598 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1599 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1600 }
1601
1602 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1603 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1604 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1605 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1606
1607 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1608 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1609 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1610 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1611 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1612 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1613 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1615
1616 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1617 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1618
1619 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1620 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1621 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1622 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1623 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1624
1625 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1630 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1631 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1632 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1637
1638 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1639 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1640 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1641 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1642 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1643 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1644 }
1645
1646 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1649 }
1650
1651 if (HasInt256) {
1652 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1653 // when we have a 256bit-wide blend with immediate.
1656
1657 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1658 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1659 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1660 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1661 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1662 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1663 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1664 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1665 }
1666 }
1667
1668 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1669 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1670 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1672 }
1673
1674 // Extract subvector is special because the value type
1675 // (result) is 128-bit but the source is 256-bit wide.
1676 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1677 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1679 }
1680
1681 // Custom lower several nodes for 256-bit types.
1682 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1683 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1693 }
1694 setF16Action(MVT::v16f16, Expand);
1695 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1696 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1698 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1699 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1700 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1701 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1702
1703 if (HasInt256) {
1705
1706 // Custom legalize 2x32 to get a little better code.
1709
1710 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1711 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1713 }
1714 }
1715
1716 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1717 Subtarget.hasF16C()) {
1718 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1721 }
1722 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1725 }
1726 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1727 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1728 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1729 }
1730 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1731 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1732 }
1733
1734 // This block controls legalization of the mask vector sizes that are
1735 // available with AVX512. 512-bit vectors are in a separate block controlled
1736 // by useAVX512Regs.
1737 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1738 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1739 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1740 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1741 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1742 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1743
1747
1748 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1749 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1750 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1751 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1752 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1753 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1754 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1755 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1763
1764 // There is no byte sized k-register load or store without AVX512DQ.
1765 if (!Subtarget.hasDQI()) {
1766 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1767 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1768 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1769 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1770
1775 }
1776
1777 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1778 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1782 }
1783
1784 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1786
1787 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1791
1798 }
1799
1800 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1802 }
1803 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1804 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1807 }
1808 }
1809
1810 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1811 // elements. 512-bits can be disabled based on prefer-vector-width and
1812 // required-vector-width function attributes.
1813 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1814 bool HasBWI = Subtarget.hasBWI();
1815
1816 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1817 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1818 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1819 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1820 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1821 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1822 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1823
1824 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1825 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1826 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1827 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1828 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1829 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1830 if (HasBWI)
1831 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1832 }
1833
1834 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1845 }
1846 setOperationAction(ISD::LRINT, MVT::v16f32,
1847 Subtarget.hasDQI() ? Legal : Custom);
1848 setOperationAction(ISD::LRINT, MVT::v8f64,
1849 Subtarget.hasDQI() ? Legal : Custom);
1850 if (Subtarget.hasDQI())
1851 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1852
1853 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1858 }
1859
1860 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1865 }
1866
1873
1885
1886 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1887 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1888 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1889 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1890 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1891 if (HasBWI)
1892 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1893
1894 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1895 // to 512-bit rather than use the AVX2 instructions so that we can use
1896 // k-masks.
1897 if (!Subtarget.hasVLX()) {
1898 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1899 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1902 }
1903 }
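  // E.g. a masked v8f32 load on AVX512F without VLX is widened to a 512-bit
  // masked load under a k-register mask with the extra lanes disabled, rather
  // than using the AVX2 VMASKMOV form.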
1904
1906 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1907 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1917
1918 if (HasBWI) {
1919 // Extends from v64i1 masks to 512-bit vectors.
1923 }
1924
1925 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1938
1940 }
1941
1942 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1945 }
1946
1947 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1948 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1949 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1953 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1954 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1955 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1956
1957 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1958 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1959 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1960 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1961 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1962 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1963 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1964 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1965
1966 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1967 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1968
1969 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1979
1980 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1981 // setcc all the way to isel and prefer SETGT in some isel patterns.
1984 }
1985
1986 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1987 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1992
1993 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2000 }
2001
2002 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2003 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2004 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2006 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2007 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2008 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2009 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2014 }
2015
2016 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2017 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2018 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2019 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2020 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2021 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2022
2023 if (Subtarget.hasDQI()) {
2027 setOperationAction(Opc, MVT::v8i64, Custom);
2028 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2029 }
2030
2031 if (Subtarget.hasCDI()) {
2032 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2033 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2035 }
2036 } // Subtarget.hasCDI()
2037
2038 if (Subtarget.hasVPOPCNTDQ()) {
2039 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2041 }
2042
2043 // Extract subvector is special because the value type
2044 // (result) is 256-bit but the source is 512-bit wide.
2045 // 128-bit was made Legal under AVX1.
2046 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2047 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2049
2050 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2051 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2061 }
2062 setF16Action(MVT::v32f16, Expand);
2067 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2068 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2069 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2070
2071 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2076 }
2077 if (HasBWI) {
2078 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2081 }
2082 } else {
2083 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2084 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2085 }
2086
2087 if (Subtarget.hasVBMI2()) {
2088 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2091 }
2092
2093 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2094 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2095 }
2096
2097 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2098 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2100 }// useAVX512Regs
2101
2102 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2103 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2104 MVT::v4i64}) {
2107 }
2108 }
2109
2110 // This block controls legalization for operations that don't have
2111 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2112 // narrower widths.
2113 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2114 // These operations are handled on non-VLX by artificially widening in
2115 // isel patterns.
2116
2120
2121 if (Subtarget.hasDQI()) {
2122 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2123 // v2f32 UINT_TO_FP is already custom under SSE2.
2126 "Unexpected operation action!");
2127 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2132 }
2133
2134 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2140 }
2141
2142 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2145 }
2146
2147 // Custom legalize 2x32 to get a little better code.
2150
2151 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2152 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2154
2155 if (Subtarget.hasDQI()) {
2159 setOperationAction(Opc, MVT::v2i64, Custom);
2160 setOperationAction(Opc, MVT::v4i64, Custom);
2161 }
2162 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2163 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2164 }
2165
2166 if (Subtarget.hasCDI()) {
2167 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2169 }
2170 } // Subtarget.hasCDI()
2171
2172 if (Subtarget.hasVPOPCNTDQ()) {
2173 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2175 }
2176
2177 // We can try to convert vectors to different sizes to leverage legal
2178 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2179 // then specialize to Legal below.
2180 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2181 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2182 MVT::v16i16, MVT::v8i8})
2184
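  // E.g. a compress of a type with no native VPCOMPRESS, such as v8i16, can be
  // converted to one of the Legal cases listed below and compressed there.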
2185 // Legal vpcompress depends on various AVX512 extensions.
2186 // Legal in AVX512F
2187 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2189
2190 // Legal in AVX512F + AVX512VL
2191 if (Subtarget.hasVLX())
2192 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2193 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2195
2196 // Legal in AVX512F + AVX512VBMI2
2197 if (Subtarget.hasVBMI2())
2198 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2200
2201 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2202 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2203 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2205 }
2206
2207 // This block controls legalization of v32i1/v64i1, which are available with
2208 // AVX512BW.
2209 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2210 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2211 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2212
2213 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2224 }
2225
2226 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2228
2229 // Extends from v32i1 masks to 256-bit vectors.
2233
2234 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2235 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2236 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2237 }
2238
2239 // These operations are handled on non-VLX by artificially widening in
2240 // isel patterns.
2241 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2242
2243 if (Subtarget.hasBITALG()) {
2244 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2246 }
2247 }
2248
2249 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2250 auto setGroup = [&] (MVT VT) {
2261
2274
2276
2279
2285
2291
2295 };
2296
2297 // AVX512_FP16 scalar operations
2298 setGroup(MVT::f16);
2314
2317
2318 if (Subtarget.useAVX512Regs()) {
2319 setGroup(MVT::v32f16);
2325 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2332
2337 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2339 MVT::v32i16);
2340 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2342 MVT::v32i16);
2343 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2345 MVT::v32i16);
2346 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2348 MVT::v32i16);
2349
2353
2354 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2355 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2356
2361 }
2362
2363 if (Subtarget.hasVLX()) {
2364 setGroup(MVT::v8f16);
2365 setGroup(MVT::v16f16);
2366
2377
2388
2389 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2392
2396
2397 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2398 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2399 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2400 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2401
2402 // Need to custom widen these to prevent scalarization.
2403 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2404 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2405
2410
2415 }
2416 }
2417
2418 if (!Subtarget.useSoftFloat() &&
2419 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2420 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2421 : &X86::VR128RegClass);
2422 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2423 : &X86::VR256RegClass);
2424 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2425 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2426 // Set the operation action Custom to do the customization later.
2429 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2430 setF16Action(VT, Expand);
2431 if (!Subtarget.hasBF16())
2437 }
2438 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2439 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2440 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2441 }
2442 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2443 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2445 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2446 }
2447
2448 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2449 Subtarget.useAVX512Regs()) {
2450 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2451 setF16Action(MVT::v32bf16, Expand);
2452 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2453 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2454 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2456 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2460 }
2461
2462 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2463 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2475 }
2476 if (Subtarget.hasAVX10_2_512()) {
2477 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2478 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2479 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2480 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2481 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2482 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2483 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2484 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2485 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2488 }
2489 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2492 }
2493 }
2494
2495 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2496 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2497 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2498 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2499 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2500 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2501
2502 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2503 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2504 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2505 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2506 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2507
2508 if (Subtarget.hasBWI()) {
2509 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2510 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2511 }
2512
2513 if (Subtarget.hasFP16()) {
2514 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2523 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2532 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2537 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2542 }
2543 }
2544
2545 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2546 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2547 }
2548
2549 // We want to custom lower some of our intrinsics.
2553 if (!Subtarget.is64Bit()) {
2555 }
2556
2557 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2558 // handle type legalization for these operations here.
2559 //
2560 // FIXME: We really should do custom legalization for addition and
2561 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2562 // than generic legalization for 64-bit multiplication-with-overflow, though.
2563 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2564 if (VT == MVT::i64 && !Subtarget.is64Bit())
2565 continue;
2566 // Add/Sub/Mul with overflow operations are custom lowered.
2573
2574 // Support carry in as value rather than glue.
2580 }
2581
2582 // Combine sin / cos into _sincos_stret if it is available.
2583 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2584 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2587 }
2588
2589 if (Subtarget.isTargetWin64()) {
2590 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2591 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2592 setOperationAction(ISD::SREM, MVT::i128, Custom);
2593 setOperationAction(ISD::UREM, MVT::i128, Custom);
2602 }
2603
2604 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2605 // is. We should promote the value to 64-bits to solve this.
2606 // This is what the CRT headers do - `fmodf` is an inline header
2607 // function casting to f64 and calling `fmod`.
2608 if (Subtarget.is32Bit() &&
2609 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2610 // clang-format off
2611 for (ISD::NodeType Op :
2629 if (isOperationExpand(Op, MVT::f32))
2630 setOperationAction(Op, MVT::f32, Promote);
2631 // clang-format on
2632
2633 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2634 // it, but it's just a wrapper around ldexp.
2635 if (Subtarget.isOSWindows()) {
2637 if (isOperationExpand(Op, MVT::f32))
2638 setOperationAction(Op, MVT::f32, Promote);
2639 }
2640
2641 // We have target-specific dag combine patterns for the following nodes:
2652 ISD::SHL,
2653 ISD::SRA,
2654 ISD::SRL,
2655 ISD::OR,
2656 ISD::AND,
2662 ISD::ADD,
2663 ISD::FADD,
2664 ISD::FSUB,
2665 ISD::FNEG,
2666 ISD::FMA,
2670 ISD::SUB,
2671 ISD::LOAD,
2672 ISD::LRINT,
2674 ISD::MLOAD,
2675 ISD::STORE,
2691 ISD::SETCC,
2692 ISD::MUL,
2693 ISD::XOR,
2704
2706
2707 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2709 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2711 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2713
2714 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2715 // that needs to be benchmarked and balanced with the potential use of vector
2716 // load/store types (PR33329, PR33914).
2719
2720 // Default loop alignment, which can be overridden by -align-loops.
2722
2723 // An out-of-order CPU can speculatively execute past a predictable branch,
2724 // but a conditional move could be stalled by an expensive earlier operation.
2725 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2726 EnableExtLdPromotion = true;
2728
2730
2731 // Default to having -disable-strictnode-mutation on
2732 IsStrictFPEnabled = true;
2733}
2734
2735// This has so far only been implemented for 64-bit MachO.
2737 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2738}
2739
2741 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2742 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2743}
2744
2746 const SDLoc &DL) const {
2747 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2748 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2749 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2750 return SDValue(Node, 0);
2751}
2752
2755 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2756 !Subtarget.hasBWI())
2757 return TypeSplitVector;
2758
2759 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2760 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2761 return TypeSplitVector;
2762
2763 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2764 VT.getVectorElementType() != MVT::i1)
2765 return TypeWidenVector;
2766
2768}
2769
2770FastISel *
2772 const TargetLibraryInfo *libInfo) const {
2773 return X86::createFastISel(funcInfo, libInfo);
2774}
2775
2776//===----------------------------------------------------------------------===//
2777// Other Lowering Hooks
2778//===----------------------------------------------------------------------===//
2779
2781 bool AssumeSingleUse) {
2782 if (!AssumeSingleUse && !Op.hasOneUse())
2783 return false;
2784 if (!ISD::isNormalLoad(Op.getNode()))
2785 return false;
2786
2787 // If this is an unaligned vector, make sure the target supports folding it.
2788 auto *Ld = cast<LoadSDNode>(Op.getNode());
2789 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2790 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2791 return false;
2792
2793 // TODO: If this is a non-temporal load and the target has an instruction
2794 // for it, it should not be folded. See "useNonTemporalLoad()".
2795
2796 return true;
2797}
2798
2800 const X86Subtarget &Subtarget,
2801 bool AssumeSingleUse) {
2802 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2803 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2804 return false;
2805
2806 // We cannot replace a wide volatile load with a broadcast-from-memory,
2807 // because that would narrow the load, which isn't legal for volatiles.
2808 auto *Ld = cast<LoadSDNode>(Op.getNode());
2809 return !Ld->isVolatile() ||
2810 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2811}
2812
2814 if (!Op.hasOneUse())
2815 return false;
2816 // Peek through (oneuse) bitcast users
2817 SDNode *User = *Op->user_begin();
2818 while (User->getOpcode() == ISD::BITCAST) {
2819 if (!User->hasOneUse())
2820 return false;
2821 User = *User->user_begin();
2822 }
2823 return ISD::isNormalStore(User);
2824}
2825
2827 if (Op.hasOneUse()) {
2828 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2829 return (ISD::ZERO_EXTEND == Opcode);
2830 }
2831 return false;
2832}
2833
2834static bool isLogicOp(unsigned Opcode) {
2835 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2836 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2837}
2838
2839static bool isTargetShuffle(unsigned Opcode) {
2840 switch(Opcode) {
2841 default: return false;
2842 case X86ISD::BLENDI:
2843 case X86ISD::PSHUFB:
2844 case X86ISD::PSHUFD:
2845 case X86ISD::PSHUFHW:
2846 case X86ISD::PSHUFLW:
2847 case X86ISD::SHUFP:
2848 case X86ISD::INSERTPS:
2849 case X86ISD::EXTRQI:
2850 case X86ISD::INSERTQI:
2851 case X86ISD::VALIGN:
2852 case X86ISD::PALIGNR:
2853 case X86ISD::VSHLDQ:
2854 case X86ISD::VSRLDQ:
2855 case X86ISD::MOVLHPS:
2856 case X86ISD::MOVHLPS:
2857 case X86ISD::MOVSHDUP:
2858 case X86ISD::MOVSLDUP:
2859 case X86ISD::MOVDDUP:
2860 case X86ISD::MOVSS:
2861 case X86ISD::MOVSD:
2862 case X86ISD::MOVSH:
2863 case X86ISD::UNPCKL:
2864 case X86ISD::UNPCKH:
2865 case X86ISD::VBROADCAST:
2866 case X86ISD::VPERMILPI:
2867 case X86ISD::VPERMILPV:
2868 case X86ISD::VPERM2X128:
2869 case X86ISD::SHUF128:
2870 case X86ISD::VPERMIL2:
2871 case X86ISD::VPERMI:
2872 case X86ISD::VPPERM:
2873 case X86ISD::VPERMV:
2874 case X86ISD::VPERMV3:
2875 case X86ISD::VZEXT_MOVL:
2876 return true;
2877 }
2878}
2879
2880static bool isTargetShuffleVariableMask(unsigned Opcode) {
2881 switch (Opcode) {
2882 default: return false;
2883 // Target Shuffles.
2884 case X86ISD::PSHUFB:
2885 case X86ISD::VPERMILPV:
2886 case X86ISD::VPERMIL2:
2887 case X86ISD::VPPERM:
2888 case X86ISD::VPERMV:
2889 case X86ISD::VPERMV3:
2890 return true;
2891 // 'Faux' Target Shuffles.
2892 case ISD::OR:
2893 case ISD::AND:
2894 case X86ISD::ANDNP:
2895 return true;
2896 }
2897}
2898
2901 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2903 int ReturnAddrIndex = FuncInfo->getRAIndex();
2904
2905 if (ReturnAddrIndex == 0) {
2906 // Set up a frame object for the return address.
2907 unsigned SlotSize = RegInfo->getSlotSize();
2908 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2909 -(int64_t)SlotSize,
2910 false);
2911 FuncInfo->setRAIndex(ReturnAddrIndex);
2912 }
2913
2914 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2915}
2916
2918 bool HasSymbolicDisplacement) {
2919 // Offset should fit into a 32-bit immediate field.
2920 if (!isInt<32>(Offset))
2921 return false;
2922
2923 // If we don't have a symbolic displacement - we don't have any extra
2924 // restrictions.
2925 if (!HasSymbolicDisplacement)
2926 return true;
2927
2928 // We can fold large offsets in the large code model because we always use
2929 // 64-bit offsets.
2930 if (CM == CodeModel::Large)
2931 return true;
2932
2933 // For the kernel code model we know that all objects reside in the negative
2934 // half of the 32-bit address space. We may not accept negative offsets, since
2935 // they may be just off, and we may accept pretty large positive ones.
2936 if (CM == CodeModel::Kernel)
2937 return Offset >= 0;
2938
2939 // For other non-large code models we assume that the latest small object is
2940 // 16MB before the end of the 31-bit boundary. We may also accept pretty large
2941 // negative constants, knowing that all objects are in the positive half of the
2942 // address space.
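  // For example, with a symbolic displacement an offset of 16MB or more is
  // rejected here, while moderately large negative offsets are still accepted.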
2943 return Offset < 16 * 1024 * 1024;
2944}
2945
2946 /// Return true if the condition is a signed comparison operation.
2947static bool isX86CCSigned(unsigned X86CC) {
2948 switch (X86CC) {
2949 default:
2950 llvm_unreachable("Invalid integer condition!");
2951 case X86::COND_E:
2952 case X86::COND_NE:
2953 case X86::COND_B:
2954 case X86::COND_A:
2955 case X86::COND_BE:
2956 case X86::COND_AE:
2957 return false;
2958 case X86::COND_G:
2959 case X86::COND_GE:
2960 case X86::COND_L:
2961 case X86::COND_LE:
2962 return true;
2963 }
2964}
2965
2967 switch (SetCCOpcode) {
2968 // clang-format off
2969 default: llvm_unreachable("Invalid integer condition!");
2970 case ISD::SETEQ: return X86::COND_E;
2971 case ISD::SETGT: return X86::COND_G;
2972 case ISD::SETGE: return X86::COND_GE;
2973 case ISD::SETLT: return X86::COND_L;
2974 case ISD::SETLE: return X86::COND_LE;
2975 case ISD::SETNE: return X86::COND_NE;
2976 case ISD::SETULT: return X86::COND_B;
2977 case ISD::SETUGT: return X86::COND_A;
2978 case ISD::SETULE: return X86::COND_BE;
2979 case ISD::SETUGE: return X86::COND_AE;
2980 // clang-format on
2981 }
2982}
2983
2984 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2985/// condition code, returning the condition code and the LHS/RHS of the
2986/// comparison to make.
2988 bool isFP, SDValue &LHS, SDValue &RHS,
2989 SelectionDAG &DAG) {
2990 if (!isFP) {
2991 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2992 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2993 // X > -1 -> X == 0, jump !sign.
2994 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2995 return X86::COND_NS;
2996 }
2997 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2998 // X < 0 -> X == 0, jump on sign.
2999 return X86::COND_S;
3000 }
3001 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3002 // X >= 0 -> X == 0, jump on !sign.
3003 return X86::COND_NS;
3004 }
3005 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3006 // X < 1 -> X <= 0
3007 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3008 return X86::COND_LE;
3009 }
3010 }
3011
3012 return TranslateIntegerX86CC(SetCCOpcode);
3013 }
3014
3015 // First determine if it is required or is profitable to flip the operands.
3016
3017 // If LHS is a foldable load, but RHS is not, flip the condition.
3018 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3019 !ISD::isNON_EXTLoad(RHS.getNode())) {
3020 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3021 std::swap(LHS, RHS);
3022 }
3023
3024 switch (SetCCOpcode) {
3025 default: break;
3026 case ISD::SETOLT:
3027 case ISD::SETOLE:
3028 case ISD::SETUGT:
3029 case ISD::SETUGE:
3030 std::swap(LHS, RHS);
3031 break;
3032 }
3033
3034 // On a floating point condition, the flags are set as follows:
3035 // ZF PF CF op
3036 // 0 | 0 | 0 | X > Y
3037 // 0 | 0 | 1 | X < Y
3038 // 1 | 0 | 0 | X == Y
3039 // 1 | 1 | 1 | unordered
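  // For example, SETOLT had its operands swapped above, so it now tests
  // "greater than" on the swapped operands and maps to COND_A (ZF==0, CF==0).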
3040 switch (SetCCOpcode) {
3041 // clang-format off
3042 default: llvm_unreachable("Condcode should be pre-legalized away");
3043 case ISD::SETUEQ:
3044 case ISD::SETEQ: return X86::COND_E;
3045 case ISD::SETOLT: // flipped
3046 case ISD::SETOGT:
3047 case ISD::SETGT: return X86::COND_A;
3048 case ISD::SETOLE: // flipped
3049 case ISD::SETOGE:
3050 case ISD::SETGE: return X86::COND_AE;
3051 case ISD::SETUGT: // flipped
3052 case ISD::SETULT:
3053 case ISD::SETLT: return X86::COND_B;
3054 case ISD::SETUGE: // flipped
3055 case ISD::SETULE:
3056 case ISD::SETLE: return X86::COND_BE;
3057 case ISD::SETONE:
3058 case ISD::SETNE: return X86::COND_NE;
3059 case ISD::SETUO: return X86::COND_P;
3060 case ISD::SETO: return X86::COND_NP;
3061 case ISD::SETOEQ:
3062 case ISD::SETUNE: return X86::COND_INVALID;
3063 // clang-format on
3064 }
3065}
3066
3067/// Is there a floating point cmov for the specific X86 condition code?
3068 /// The current x86 ISA includes the following FP cmov instructions:
3069 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3070static bool hasFPCMov(unsigned X86CC) {
3071 switch (X86CC) {
3072 default:
3073 return false;
3074 case X86::COND_B:
3075 case X86::COND_BE:
3076 case X86::COND_E:
3077 case X86::COND_P:
3078 case X86::COND_A:
3079 case X86::COND_AE:
3080 case X86::COND_NE:
3081 case X86::COND_NP:
3082 return true;
3083 }
3084}
3085
3086static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3087 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3088 VT.is512BitVector();
3089}
3090
3092 const CallInst &I,
3093 MachineFunction &MF,
3094 unsigned Intrinsic) const {
3096 Info.offset = 0;
3097
3098 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
3099 if (!IntrData) {
3100 switch (Intrinsic) {
3101 case Intrinsic::x86_aesenc128kl:
3102 case Intrinsic::x86_aesdec128kl:
3104 Info.ptrVal = I.getArgOperand(1);
3105 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3106 Info.align = Align(1);
3108 return true;
3109 case Intrinsic::x86_aesenc256kl:
3110 case Intrinsic::x86_aesdec256kl:
3112 Info.ptrVal = I.getArgOperand(1);
3113 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3114 Info.align = Align(1);
3116 return true;
3117 case Intrinsic::x86_aesencwide128kl:
3118 case Intrinsic::x86_aesdecwide128kl:
3120 Info.ptrVal = I.getArgOperand(0);
3121 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3122 Info.align = Align(1);
3124 return true;
3125 case Intrinsic::x86_aesencwide256kl:
3126 case Intrinsic::x86_aesdecwide256kl:
3128 Info.ptrVal = I.getArgOperand(0);
3129 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3130 Info.align = Align(1);
3132 return true;
3133 case Intrinsic::x86_cmpccxadd32:
3134 case Intrinsic::x86_cmpccxadd64:
3135 case Intrinsic::x86_atomic_bts:
3136 case Intrinsic::x86_atomic_btc:
3137 case Intrinsic::x86_atomic_btr: {
3139 Info.ptrVal = I.getArgOperand(0);
3140 unsigned Size = I.getType()->getScalarSizeInBits();
3141 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3142 Info.align = Align(Size);
3145 return true;
3146 }
3147 case Intrinsic::x86_atomic_bts_rm:
3148 case Intrinsic::x86_atomic_btc_rm:
3149 case Intrinsic::x86_atomic_btr_rm: {
3151 Info.ptrVal = I.getArgOperand(0);
3152 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3153 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3154 Info.align = Align(Size);
3157 return true;
3158 }
3159 case Intrinsic::x86_aadd32:
3160 case Intrinsic::x86_aadd64:
3161 case Intrinsic::x86_aand32:
3162 case Intrinsic::x86_aand64:
3163 case Intrinsic::x86_aor32:
3164 case Intrinsic::x86_aor64:
3165 case Intrinsic::x86_axor32:
3166 case Intrinsic::x86_axor64:
3167 case Intrinsic::x86_atomic_add_cc:
3168 case Intrinsic::x86_atomic_sub_cc:
3169 case Intrinsic::x86_atomic_or_cc:
3170 case Intrinsic::x86_atomic_and_cc:
3171 case Intrinsic::x86_atomic_xor_cc: {
3173 Info.ptrVal = I.getArgOperand(0);
3174 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3175 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3176 Info.align = Align(Size);
3179 return true;
3180 }
3181 }
3182 return false;
3183 }
3184
3185 switch (IntrData->Type) {
3188 case TRUNCATE_TO_MEM_VI32: {
3190 Info.ptrVal = I.getArgOperand(0);
3191 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3193 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3194 ScalarVT = MVT::i8;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3196 ScalarVT = MVT::i16;
3197 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3198 ScalarVT = MVT::i32;
3199
3200 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3201 Info.align = Align(1);
3203 break;
3204 }
3205 case GATHER:
3206 case GATHER_AVX2: {
3208 Info.ptrVal = nullptr;
3209 MVT DataVT = MVT::getVT(I.getType());
3210 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3211 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3212 IndexVT.getVectorNumElements());
3213 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3214 Info.align = Align(1);
3216 break;
3217 }
3218 case SCATTER: {
3220 Info.ptrVal = nullptr;
3221 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3222 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3223 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3224 IndexVT.getVectorNumElements());
3225 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3226 Info.align = Align(1);
3228 break;
3229 }
3230 default:
3231 return false;
3232 }
3233
3234 return true;
3235}
3236
3237/// Returns true if the target can instruction select the
3238/// specified FP immediate natively. If false, the legalizer will
3239/// materialize the FP immediate as a load from a constant pool.
3241 bool ForCodeSize) const {
3242 for (const APFloat &FPImm : LegalFPImmediates)
3243 if (Imm.bitwiseIsEqual(FPImm))
3244 return true;
3245 return false;
3246}
3247
3249 ISD::LoadExtType ExtTy,
3250 EVT NewVT) const {
3251 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3252
3253 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3254 // relocations target a movq or addq instruction: don't let the load shrink.
3255 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3256 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3257 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3258 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3259
3260 // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
3261 // those uses are extracted directly into a store, then the extract + store
3262 // can be store-folded. Therefore, it's probably not worth splitting the load.
3263 EVT VT = Load->getValueType(0);
3264 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3265 for (SDUse &Use : Load->uses()) {
3266 // Skip uses of the chain value. Result 0 of the node is the load value.
3267 if (Use.getResNo() != 0)
3268 continue;
3269
3270 SDNode *User = Use.getUser();
3271
3272 // If this use is not an extract + store, it's probably worth splitting.
3273 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
3274 User->user_begin()->getOpcode() != ISD::STORE)
3275 return true;
3276 }
3277 // All non-chain uses are extract + store.
3278 return false;
3279 }
3280
3281 return true;
3282}
3283
3284/// Returns true if it is beneficial to convert a load of a constant
3285/// to just the constant itself.
3287 Type *Ty) const {
3288 assert(Ty->isIntegerTy());
3289
3290 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3291 if (BitSize == 0 || BitSize > 64)
3292 return false;
3293 return true;
3294}
3295
3297 // If we are using XMM registers in the ABI and the condition of the select is
3298 // a floating-point compare and we have blendv or conditional move, then it is
3299 // cheaper to select instead of doing a cross-register move and creating a
3300 // load that depends on the compare result.
3301 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3302 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3303}
3304
3306 // TODO: It might be a win to ease or lift this restriction, but the generic
3307 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3308 if (VT.isVector() && Subtarget.hasAVX512())
3309 return false;
3310
3311 return true;
3312}
3313
3315 SDValue C) const {
3316 // TODO: We handle scalars using custom code, but generic combining could make
3317 // that unnecessary.
3318 APInt MulC;
3319 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3320 return false;
3321
3322 // Find the type this will be legalized to. Otherwise we might prematurely
3323 // convert this to shl+add/sub and then still have to type legalize those ops.
3324 // Another choice would be to defer the decision for illegal types until
3325 // after type legalization. But constant splat vectors of i64 can't make it
3326 // through type legalization on 32-bit targets so we would need to special
3327 // case vXi64.
3328 while (getTypeAction(Context, VT) != TypeLegal)
3329 VT = getTypeToTransformTo(Context, VT);
3330
3331 // If vector multiply is legal, assume that's faster than shl + add/sub.
3332 // Multiply is a complex op with higher latency and lower throughput in
3333 // most implementations, sub-vXi32 vector multiplies are always fast,
3334 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3335 // is always going to be slow.
3336 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3337 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3338 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3339 return false;
3340
3341 // shl+add, shl+sub, shl+add+neg
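  // e.g. x*9 -> (x<<3)+x, x*7 -> (x<<3)-x, x*-9 -> -((x<<3)+x).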
3342 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3343 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3344}
3345
3347 unsigned Index) const {
3349 return false;
3350
3351 // Mask vectors support all subregister combinations and operations that
3352 // extract half of vector.
3353 if (ResVT.getVectorElementType() == MVT::i1)
3354 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3355 (Index == ResVT.getVectorNumElements()));
3356
3357 return (Index % ResVT.getVectorNumElements()) == 0;
3358}
3359
3361 unsigned Opc = VecOp.getOpcode();
3362
3363 // Assume target opcodes can't be scalarized.
3364 // TODO - do we have any exceptions?
3365 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3366 return false;
3367
3368 // If the vector op is not supported, try to convert to scalar.
3369 EVT VecVT = VecOp.getValueType();
3370 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3371 return true;
3372
3373 // If the vector op is supported, but the scalar op is not, the transform may
3374 // not be worthwhile.
3375 EVT ScalarVT = VecVT.getScalarType();
3376 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3377}
3378
3380 bool) const {
3381 // TODO: Allow vectors?
3382 if (VT.isVector())
3383 return false;
3384 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3385}
3386
3388 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3389 // i32/i64 or can rely on BSF passthrough value.
3390 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3391 Subtarget.hasBitScanPassThrough() ||
3392 (!Ty->isVectorTy() &&
3393 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3394}
3395
3397 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3398 // passthrough value.
3399 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3400 Subtarget.hasBitScanPassThrough();
3401}
3402
3404 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3405 // expensive than a straight movsd. On the other hand, it's important to
3406 // shrink long double fp constants since fldt is very slow.
3407 return !Subtarget.hasSSE2() || VT == MVT::f80;
3408}
3409
3411 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3412 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3413}
3414
3416 const SelectionDAG &DAG,
3417 const MachineMemOperand &MMO) const {
3418 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3419 BitcastVT.getVectorElementType() == MVT::i1)
3420 return false;
3421
3422 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3423 return false;
3424
3425 // If both types are legal vectors, it's always ok to convert them.
3426 if (LoadVT.isVector() && BitcastVT.isVector() &&
3427 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3428 return true;
3429
3430 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3431}
3432
3434 const MachineFunction &MF) const {
3435 // Do not merge to float value size (128 bytes) if no implicit
3436 // float attribute is set.
3437 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3438
3439 if (NoFloat) {
3440 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3441 return (MemVT.getSizeInBits() <= MaxIntSize);
3442 }
3443 // Make sure we don't merge greater than our preferred vector
3444 // width.
3445 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3446 return false;
3447
3448 return true;
3449}
3450
3452 return Subtarget.hasFastLZCNT();
3453}
3454
3456 const Instruction &AndI) const {
3457 return true;
3458}
3459
3461 EVT VT = Y.getValueType();
3462
3463 if (VT.isVector())
3464 return false;
3465
3466 if (!Subtarget.hasBMI())
3467 return false;
3468
3469 // There are only 32-bit and 64-bit forms for 'andn'.
3470 if (VT != MVT::i32 && VT != MVT::i64)
3471 return false;
3472
3473 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3474}
3475
3477 EVT VT = Y.getValueType();
3478
3479 if (!VT.isVector())
3480 return hasAndNotCompare(Y);
3481
3482 // Vector.
3483
3484 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3485 return false;
3486
3487 if (VT == MVT::v4i32)
3488 return true;
3489
3490 return Subtarget.hasSSE2();
3491}
3492
3494 return X.getValueType().isScalarInteger(); // 'bt'
3495}
3496
3500 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3501 SelectionDAG &DAG) const {
3502 // Does baseline recommend not to perform the fold by default?
3504 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3505 return false;
3506 // For scalars this transform is always beneficial.
3507 if (X.getValueType().isScalarInteger())
3508 return true;
3509 // If all the shift amounts are identical, then the transform is beneficial even
3510 // with rudimentary SSE2 shifts.
3511 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3512 return true;
3513 // If we have AVX2 with its powerful shift operations, then it's also good.
3514 if (Subtarget.hasAVX2())
3515 return true;
3516 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3517 return NewShiftOpcode == ISD::SHL;
3518}
3519
3521 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3522 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3523 if (!VT.isInteger())
3524 return ShiftOpc;
3525
3526 bool PreferRotate = false;
3527 if (VT.isVector()) {
3528 // For vectors, if we have rotate instruction support, then it's definitely
3529 // best. Otherwise it's not clear what's best, so just don't make changes.
3530 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3531 VT.getScalarType() == MVT::i64);
3532 } else {
3533 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3534 // rotate unless we have a zext mask+shr.
3535 PreferRotate = Subtarget.hasBMI2();
3536 if (!PreferRotate) {
3537 unsigned MaskBits =
3538 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3539 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3540 }
3541 }
3542
3543 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3544 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3545
3546 if (PreferRotate && MayTransformRotate)
3547 return ISD::ROTL;
3548
3549 // For vectors we don't really get much benefit from swapping constants around.
3550 // Maybe we could check if the DAG has the flipped node already in the
3551 // future.
3552 if (VT.isVector())
3553 return ShiftOpc;
3554
3555 // See if it's beneficial to swap the shift type.
3556 if (ShiftOpc == ISD::SHL) {
3557 // If the current setup has an imm64 mask, then the inverse will have
3558 // at least an imm32 mask (or be a zext i32 -> i64).
3559 if (VT == MVT::i64)
3560 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3561 : ShiftOpc;
3562
3563 // We can only benefit if the mask requires at least 7 bits. We
3564 // don't want to replace shl of 1, 2 or 3 as they can be implemented
3565 // with lea/add.
3566 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3567 }
3568
3569 if (VT == MVT::i64)
3570 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3571 // extremely efficient.
3572 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3573
3574 // Keep small shifts as shl so we can generate add/lea.
3575 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3576 }
3577
3578 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3579 // (PreferRotate will be set in the latter case).
3580 if (PreferRotate || !MayTransformRotate || VT.isVector())
3581 return ShiftOpc;
3582
3583 // Non-vector type and we have a zext mask with SRL.
3584 return ISD::SRL;
3585}
3586
3589 const Value *Lhs,
3590 const Value *Rhs) const {
3591 using namespace llvm::PatternMatch;
3592 int BaseCost = BrMergingBaseCostThresh.getValue();
3593 // With CCMP, branches can be merged in a more efficient way.
3594 if (BaseCost >= 0 && Subtarget.hasCCMP())
3595 BaseCost += BrMergingCcmpBias;
3596 // a == b && a == c is a fast pattern on x86.
3597 if (BaseCost >= 0 && Opc == Instruction::And &&
3600 BaseCost += 1;
3601 return {BaseCost, BrMergingLikelyBias.getValue(),
3602 BrMergingUnlikelyBias.getValue()};
3603}
3604
3606 return N->getOpcode() != ISD::FP_EXTEND;
3607}
3608
3610 const SDNode *N, CombineLevel Level) const {
3611 assert(((N->getOpcode() == ISD::SHL &&
3612 N->getOperand(0).getOpcode() == ISD::SRL) ||
3613 (N->getOpcode() == ISD::SRL &&
3614 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3615 "Expected shift-shift mask");
3616 // TODO: Should we always create i64 masks? Or only folded immediates?
3617 EVT VT = N->getValueType(0);
3618 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3619 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3620 // Only fold if the shift values are equal - so it folds to AND.
3621 // TODO - we should fold if either is a non-uniform vector but we don't do
3622 // the fold for non-splats yet.
3623 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3624 }
3626}
3627
3629 EVT VT = Y.getValueType();
3630
3631 // For vectors, we don't have a preference, but we probably want a mask.
3632 if (VT.isVector())
3633 return false;
3634
3635 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3636 if (VT == MVT::i64 && !Subtarget.is64Bit())
3637 return false;
3638
3639 return true;
3640}
3641
3644 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3646 !Subtarget.isOSWindows())
3649 ExpansionFactor);
3650}
3651
3653 // Any legal vector type can be splatted more efficiently than
3654 // loading/spilling from memory.
3655 return isTypeLegal(VT);
3656}
3657
3659 MVT VT = MVT::getIntegerVT(NumBits);
3660 if (isTypeLegal(VT))
3661 return VT;
3662
3663 // PMOVMSKB can handle this.
3664 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3665 return MVT::v16i8;
3666
3667 // VPMOVMSKB can handle this.
3668 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3669 return MVT::v32i8;
3670
3671 // TODO: Allow 64-bit type for 32-bit target.
3672 // TODO: 512-bit types should be allowed, but make sure that those
3673 // cases are handled in combineVectorSizedSetCCEquality().
3674
3676}
3677
3678/// Val is the undef sentinel value or equal to the specified value.
3679static bool isUndefOrEqual(int Val, int CmpVal) {
3680 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3681}
3682
3683/// Return true if every element in Mask is the undef sentinel value or equal to
3684/// the specified value.
3685static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3686 return llvm::all_of(Mask, [CmpVal](int M) {
3687 return (M == SM_SentinelUndef) || (M == CmpVal);
3688 });
3689}
3690
3691/// Return true if every element in Mask, beginning from position Pos and ending
3692/// in Pos+Size is the undef sentinel value or equal to the specified value.
3693static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3694 unsigned Size) {
3695 return llvm::all_of(Mask.slice(Pos, Size),
3696 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3697}
3698
3699/// Val is either the undef or zero sentinel value.
3700static bool isUndefOrZero(int Val) {
3701 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3702}
3703
3704/// Return true if every element in Mask, beginning from position Pos and ending
3705/// in Pos+Size is the undef sentinel value.
3706static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3707 return llvm::all_of(Mask.slice(Pos, Size),
3708 [](int M) { return M == SM_SentinelUndef; });
3709}
3710
3711/// Return true if the mask creates a vector whose lower half is undefined.
3713 unsigned NumElts = Mask.size();
3714 return isUndefInRange(Mask, 0, NumElts / 2);
3715}
3716
3717/// Return true if the mask creates a vector whose upper half is undefined.
3719 unsigned NumElts = Mask.size();
3720 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3721}
3722
3723 /// Return true if Val falls within the specified range [Low, Hi).
3724static bool isInRange(int Val, int Low, int Hi) {
3725 return (Val >= Low && Val < Hi);
3726}
3727
3728/// Return true if the value of any element in Mask falls within the specified
3729 /// range [Low, Hi).
3730static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3731 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3732}
3733
3734/// Return true if the value of any element in Mask is the zero sentinel value.
3735static bool isAnyZero(ArrayRef<int> Mask) {
3736 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3737}
3738
3739/// Return true if Val is undef or if its value falls within the
3740 /// specified range [Low, Hi).
3741static bool isUndefOrInRange(int Val, int Low, int Hi) {
3742 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3743}
3744
3745/// Return true if every element in Mask is undef or if its value
3746 /// falls within the specified range [Low, Hi).
3747static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3748 return llvm::all_of(
3749 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3750}
3751
3752/// Return true if Val is undef, zero or if its value falls within the
3753 /// specified range [Low, Hi).
3754static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3755 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3756}
3757
3758/// Return true if every element in Mask is undef, zero or if its value
3759 /// falls within the specified range [Low, Hi).
3760static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3761 return llvm::all_of(
3762 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3763}
3764
3765 /// Return true if every element in Mask is an in-place blend/select mask or is
3766/// undef.
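/// For example, with 4 elements the mask <0,5,2,7> is an in-place blend,
/// while <1,4,2,6> is not.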
3768 unsigned NumElts = Mask.size();
3769 for (auto [I, M] : enumerate(Mask))
3770 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3771 return false;
3772 return true;
3773}
3774
3775/// Return true if every element in Mask, beginning
3776/// from position Pos and ending in Pos + Size, falls within the specified
3777/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3778static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3779 unsigned Size, int Low, int Step = 1) {
3780 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3781 if (!isUndefOrEqual(Mask[i], Low))
3782 return false;
3783 return true;
3784}
3785
3786/// Return true if every element in Mask, beginning
3787/// from position Pos and ending in Pos+Size, falls within the specified
3788 /// sequential range [Low, Low+Size), or is undef or is zero.
3790 unsigned Size, int Low,
3791 int Step = 1) {
3792 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3793 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3794 return false;
3795 return true;
3796}
3797
3798/// Return true if every element in Mask, beginning
3799/// from position Pos and ending in Pos+Size is undef or is zero.
3800static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3801 unsigned Size) {
3802 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3803}
3804
3805/// Return true if every element of a single input is referenced by the shuffle
3806/// mask. i.e. it just permutes them all.
3808 unsigned NumElts = Mask.size();
3809 APInt DemandedElts = APInt::getZero(NumElts);
3810 for (int M : Mask)
3811 if (isInRange(M, 0, NumElts))
3812 DemandedElts.setBit(M);
3813 return DemandedElts.isAllOnes();
3814}
3815
3816/// Helper function to test whether a shuffle mask could be
3817/// simplified by widening the elements being shuffled.
3818///
3819/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3820/// leaves it in an unspecified state.
3821///
3822/// NOTE: This must handle normal vector shuffle masks and *target* vector
3823/// shuffle masks. The latter have the special property of a '-2' representing
3824/// a zero-ed lane of a vector.
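/// For example, <0,1,2,3> widens to <0,1> and <-1,3,4,5> widens to <1,2>,
/// while <0,2,1,3> cannot be widened.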
3826 SmallVectorImpl<int> &WidenedMask) {
3827 WidenedMask.assign(Mask.size() / 2, 0);
3828 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3829 int M0 = Mask[i];
3830 int M1 = Mask[i + 1];
3831
3832 // If both elements are undef, it's trivial.
3833 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3834 WidenedMask[i / 2] = SM_SentinelUndef;
3835 continue;
3836 }
3837
3838 // Check for an undef mask and a mask value properly aligned to fit with
3839 // a pair of values. If we find such a case, use the non-undef mask's value.
3840 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3841 WidenedMask[i / 2] = M1 / 2;
3842 continue;
3843 }
3844 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3845 WidenedMask[i / 2] = M0 / 2;
3846 continue;
3847 }
3848
3849 // When zeroing, we need to spread the zeroing across both lanes to widen.
3850 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3851 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3853 WidenedMask[i / 2] = SM_SentinelZero;
3854 continue;
3855 }
3856 return false;
3857 }
3858
3859 // Finally check if the two mask values are adjacent and aligned with
3860 // a pair.
3861 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3862 WidenedMask[i / 2] = M0 / 2;
3863 continue;
3864 }
3865
3866 // Otherwise we can't safely widen the elements used in this shuffle.
3867 return false;
3868 }
3869 assert(WidenedMask.size() == Mask.size() / 2 &&
3870 "Incorrect size of mask after widening the elements!");
3871
3872 return true;
3873}
3874
3876 const APInt &Zeroable,
3877 bool V2IsZero,
3878 SmallVectorImpl<int> &WidenedMask) {
3879 // Create an alternative mask with info about zeroable elements.
3880 // Here we do not set undef elements as zeroable.
3881 SmallVector<int, 64> ZeroableMask(Mask);
3882 if (V2IsZero) {
3883 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3884 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3885 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3886 ZeroableMask[i] = SM_SentinelZero;
3887 }
3888 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3889}
3890
3892 SmallVector<int, 32> WidenedMask;
3893 return canWidenShuffleElements(Mask, WidenedMask);
3894}
3895
3896// Attempt to narrow/widen shuffle mask until it matches the target number of
3897// elements.
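// For example, scaling <0,1> to 4 elements yields <0,1,2,3>, and scaling
// <0,1,2,3> back down to 2 elements yields <0,1>.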
3898static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3899 SmallVectorImpl<int> &ScaledMask) {
3900 unsigned NumSrcElts = Mask.size();
3901 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3902 "Illegal shuffle scale factor");
3903
3904 // Narrowing is guaranteed to work.
3905 if (NumDstElts >= NumSrcElts) {
3906 int Scale = NumDstElts / NumSrcElts;
3907 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3908 return true;
3909 }
3910
3911 // We have to repeat the widening until we reach the target size, but we can
3912 // split out the first widening as it sets up ScaledMask for us.
3913 if (canWidenShuffleElements(Mask, ScaledMask)) {
3914 while (ScaledMask.size() > NumDstElts) {
3915 SmallVector<int, 16> WidenedMask;
3916 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3917 return false;
3918 ScaledMask = std::move(WidenedMask);
3919 }
3920 return true;
3921 }
3922
3923 return false;
3924}
3925
3926static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3927 SmallVector<int, 32> ScaledMask;
3928 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3929}
3930
3931/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3933 return isNullConstant(Elt) || isNullFPConstant(Elt);
3934}
3935
3936// Build a vector of constants.
3937// Use an UNDEF node if MaskElt == -1.
3938// Split 64-bit constants in the 32-bit mode.
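// For example, when i64 is not legal (32-bit mode) a v2i64 constant is built
// as a v4i32 build_vector and then bitcast back to v2i64.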
3940 const SDLoc &dl, bool IsMask = false) {
3941
3943 bool Split = false;
3944
3945 MVT ConstVecVT = VT;
3946 unsigned NumElts = VT.getVectorNumElements();
3947 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3948 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3949 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3950 Split = true;
3951 }
3952
3953 MVT EltVT = ConstVecVT.getVectorElementType();
3954 for (unsigned i = 0; i < NumElts; ++i) {
3955 bool IsUndef = Values[i] < 0 && IsMask;
3956 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3957 DAG.getConstant(Values[i], dl, EltVT);
3958 Ops.push_back(OpNode);
3959 if (Split)
3960 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3961 DAG.getConstant(0, dl, EltVT));
3962 }
3963 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3964 if (Split)
3965 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3966 return ConstsNode;
3967}
3968
3969static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3970 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3971 assert(Bits.size() == Undefs.getBitWidth() &&
3972 "Unequal constant and undef arrays");
3974 bool Split = false;
3975
3976 MVT ConstVecVT = VT;
3977 unsigned NumElts = VT.getVectorNumElements();
3978 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3979 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3980 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3981 Split = true;
3982 }
3983
3984 MVT EltVT = ConstVecVT.getVectorElementType();
3985 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3986 if (Undefs[i]) {
3987 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3988 continue;
3989 }
3990 const APInt &V = Bits[i];
3991 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3992 if (Split) {
3993 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3994 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3995 } else if (EltVT == MVT::f32) {
3997 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3998 } else if (EltVT == MVT::f64) {
4000 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4001 } else {
4002 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4003 }
4004 }
4005
4006 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4007 return DAG.getBitcast(VT, ConstsNode);
4008}
4009
4011 SelectionDAG &DAG, const SDLoc &dl) {
4012 APInt Undefs = APInt::getZero(Bits.size());
4013 return getConstVector(Bits, Undefs, VT, DAG, dl);
4014}
4015
4016/// Returns a vector of specified type with all zero elements.
4017static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4018 SelectionDAG &DAG, const SDLoc &dl) {
4019 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4020 VT.getVectorElementType() == MVT::i1) &&
4021 "Unexpected vector type");
4022
4023 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4024 // type. This ensures they get CSE'd. But if the integer type is not
4025 // available, use a floating-point +0.0 instead.
4026 SDValue Vec;
4027 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4028 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4029 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4030 } else if (VT.isFloatingPoint() &&
4032 Vec = DAG.getConstantFP(+0.0, dl, VT);
4033 } else if (VT.getVectorElementType() == MVT::i1) {
4034 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4035 "Unexpected vector type");
4036 Vec = DAG.getConstant(0, dl, VT);
4037 } else {
4038 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4039 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4040 }
4041 return DAG.getBitcast(VT, Vec);
4042}
4043
4044 // Helper to determine whether the ops are all subvectors extracted from a
4045 // single source. If we allow commute, they don't have to be in order (Lo/Hi).
4046static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4047 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4048 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4049 LHS.getValueType() != RHS.getValueType() ||
4050 LHS.getOperand(0) != RHS.getOperand(0))
4051 return SDValue();
4052
4053 SDValue Src = LHS.getOperand(0);
4054 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4055 return SDValue();
4056
4057 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4058 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4059 RHS.getConstantOperandAPInt(1) == NumElts) ||
4060 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4061 LHS.getConstantOperandAPInt(1) == NumElts))
4062 return Src;
4063
4064 return SDValue();
4065}
4066
4067static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4068 const SDLoc &dl, unsigned vectorWidth) {
4069 EVT VT = Vec.getValueType();
4070 EVT ElVT = VT.getVectorElementType();
4071 unsigned Factor = VT.getSizeInBits() / vectorWidth;
4072 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4073 VT.getVectorNumElements() / Factor);
4074
4075 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4076 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4077 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4078
4079 // This is the index of the first element of the vectorWidth-bit chunk
4080 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
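  // For example, extracting a 128-bit chunk of i32 elements gives
  // ElemsPerChunk == 4, so an IdxVal of 6 is rounded down to 4.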
4081 IdxVal &= ~(ElemsPerChunk - 1);
4082
4083 // If the input is a buildvector just emit a smaller one.
4084 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4085 return DAG.getBuildVector(ResultVT, dl,
4086 Vec->ops().slice(IdxVal, ElemsPerChunk));
4087
4088 // Check if we're extracting the upper undef of a widening pattern.
4089 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4090 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4091 isNullConstant(Vec.getOperand(2)))
4092 return DAG.getUNDEF(ResultVT);
4093
4094 SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl);
4095 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4096}
4097
4098/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4099/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4100/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4101/// instructions or a simple subregister reference. Idx is an index in the
4102/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4103/// lowering EXTRACT_VECTOR_ELT operations easier.
4104static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4105 SelectionDAG &DAG, const SDLoc &dl) {
4107 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4108 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4109}
4110
4111/// Generate a DAG to grab 256-bits from a 512-bit vector.
4112static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4113 SelectionDAG &DAG, const SDLoc &dl) {
4114 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4115 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4116}
4117
4118static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4119 SelectionDAG &DAG, const SDLoc &dl,
4120 unsigned vectorWidth) {
4121 assert((vectorWidth == 128 || vectorWidth == 256) &&
4122 "Unsupported vector width");
4123 // Inserting UNDEF just returns Result unchanged.
4124 if (Vec.isUndef())
4125 return Result;
4126 EVT VT = Vec.getValueType();
4127 EVT ElVT = VT.getVectorElementType();
4128 EVT ResultVT = Result.getValueType();
4129
4130 // Insert the relevant vectorWidth bits.
4131 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4132 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4133
4134 // This is the index of the first element of the vectorWidth-bit chunk
4135 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
4136 IdxVal &= ~(ElemsPerChunk - 1);
4137
4138 SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl);
4139 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4140}
4141
4142/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4143/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4144/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4145/// simple superregister reference. Idx is an index in the 128 bits
4146/// we want. It need not be aligned to a 128-bit boundary. That makes
4147/// lowering INSERT_VECTOR_ELT operations easier.
4148static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4149 SelectionDAG &DAG, const SDLoc &dl) {
4150 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4151 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Widen a vector to a larger size with the same scalar type, with the new
4155/// elements either zero or undef.
4156static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4157 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4158 const SDLoc &dl) {
4159 EVT VecVT = Vec.getValueType();
4161 VecVT.getScalarType() == VT.getScalarType() &&
4162 "Unsupported vector widening type");
4163 // If the upper 128-bits of a build vector are already undef/zero, then try to
4164 // widen from the lower 128-bits.
4165 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4166 unsigned NumSrcElts = VecVT.getVectorNumElements();
4167 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4168 if (all_of(Hi, [&](SDValue V) {
4169 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4170 }))
4171 Vec = extract128BitVector(Vec, 0, DAG, dl);
4172 }
4173 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4174 : DAG.getUNDEF(VT);
4175 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
4176 DAG.getVectorIdxConstant(0, dl));
4177}
4178
4179/// Widen a vector to a larger size with the same scalar type, with the new
4180/// elements either zero or undef.
4181static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4182 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4183 const SDLoc &dl, unsigned WideSizeInBits) {
4184 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4185 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4186 "Unsupported vector widening type");
4187 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4188 MVT SVT = Vec.getSimpleValueType().getScalarType();
4189 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4190 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4191}
4192
4193/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4194/// and bitcast with integer types.
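/// For example, v4i1 widens to v8i1 with DQI and to v16i1 without it, while
/// v32i1 is returned unchanged.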
4195static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4196 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4197 unsigned NumElts = VT.getVectorNumElements();
4198 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4199 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4200 return VT;
4201}
4202
4203/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4204/// bitcast with integer types.
4205static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4206 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4207 const SDLoc &dl) {
4208 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4209 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4210}
4211
4212// Helper function to collect subvector ops that are concatenated together,
4213 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4214// The subvectors in Ops are guaranteed to be the same type.
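// For example, insert_subvector(insert_subvector(undef, X, 0), Y, N/2) is
// collected as {X, Y}, and a plain concat_vectors just yields its operands.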
4216 SelectionDAG &DAG) {
4217 assert(Ops.empty() && "Expected an empty ops vector");
4218
4219 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4220 Ops.append(N->op_begin(), N->op_end());
4221 return true;
4222 }
4223
4224 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4225 SDValue Src = N->getOperand(0);
4226 SDValue Sub = N->getOperand(1);
4227 const APInt &Idx = N->getConstantOperandAPInt(2);
4228 EVT VT = Src.getValueType();
4229 EVT SubVT = Sub.getValueType();
4230
4231 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4232 // insert_subvector(undef, x, lo)
4233 if (Idx == 0 && Src.isUndef()) {
4234 Ops.push_back(Sub);
4235 Ops.push_back(DAG.getUNDEF(SubVT));
4236 return true;
4237 }
4238 if (Idx == (VT.getVectorNumElements() / 2)) {
4239 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4240 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4241 Src.getOperand(1).getValueType() == SubVT &&
4242 isNullConstant(Src.getOperand(2))) {
4243 // Attempt to recurse into inner (matching) concats.
4244 SDValue Lo = Src.getOperand(1);
4245 SDValue Hi = Sub;
4246 SmallVector<SDValue, 2> LoOps, HiOps;
4247 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4248 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4249 LoOps.size() == HiOps.size()) {
4250 Ops.append(LoOps);
4251 Ops.append(HiOps);
4252 return true;
4253 }
4254 Ops.push_back(Lo);
4255 Ops.push_back(Hi);
4256 return true;
4257 }
4258 // insert_subvector(x, extract_subvector(x, lo), hi)
4259 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4260 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4261 Ops.append(2, Sub);
4262 return true;
4263 }
4264 // insert_subvector(undef, x, hi)
4265 if (Src.isUndef()) {
4266 Ops.push_back(DAG.getUNDEF(SubVT));
4267 Ops.push_back(Sub);
4268 return true;
4269 }
4270 }
4271 }
4272 }
4273
4274 return false;
4275}
4276
4277// Helper to check if \p V can be split into subvectors and the upper subvectors
4278 // are all undef, in which case return the lower subvector.
4280 SelectionDAG &DAG) {
4281 SmallVector<SDValue> SubOps;
4282 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4283 return SDValue();
4284
4285 unsigned NumSubOps = SubOps.size();
4286 unsigned HalfNumSubOps = NumSubOps / 2;
4287 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4288
4289 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4290 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4291 return SDValue();
4292
4293 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4294 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4295 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4296}
4297
4298// Helper to check if we can access all the constituent subvectors without any
4299// extract ops.
4300static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG) {
4301 SmallVector<SDValue, 4> Ops;
4302 return collectConcatOps(N, Ops, DAG);
4303}
4304
4305static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4306 const SDLoc &dl) {
4307 EVT VT = Op.getValueType();
4308 unsigned NumElems = VT.getVectorNumElements();
4309 unsigned SizeInBits = VT.getSizeInBits();
4310 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4311 "Can't split odd sized vector");
4312
4313 // If this is a splat value (with no undefs) then use the lower subvector,
4314 // which should be a free extraction.
4315 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4316 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4317 return std::make_pair(Lo, Lo);
4318
4319 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4320 return std::make_pair(Lo, Hi);
4321}
4322
4323/// Break an operation into 2 half-sized ops and then concatenate the results.
4324static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4325 unsigned NumOps = Op.getNumOperands();
4326 EVT VT = Op.getValueType();
4327
4328 // Extract the LHS Lo/Hi vectors
4329 SmallVector<SDValue> LoOps(NumOps, SDValue());
4330 SmallVector<SDValue> HiOps(NumOps, SDValue());
4331 for (unsigned I = 0; I != NumOps; ++I) {
4332 SDValue SrcOp = Op.getOperand(I);
4333 if (!SrcOp.getValueType().isVector()) {
4334 LoOps[I] = HiOps[I] = SrcOp;
4335 continue;
4336 }
4337 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4338 }
4339
4340 EVT LoVT, HiVT;
4341 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4342 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4343 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4344 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4345}
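// For example, a 512-bit ISD::ADD on v32i16 is handled here as two v16i16
// adds whose results are concatenated back together; non-vector operands
// (e.g. a scalar shift amount) are passed unchanged to both halves.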
4346
4347/// Break a unary integer operation into 2 half-sized ops and then
4348/// concatenate the result back.
4349static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4350 const SDLoc &dl) {
4351 // Make sure we only try to split 256/512-bit types to avoid creating
4352 // narrow vectors.
4353 [[maybe_unused]] EVT VT = Op.getValueType();
4354 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4355 Op.getOperand(0).getValueType().is512BitVector()) &&
4356 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4357 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4358 VT.getVectorNumElements() &&
4359 "Unexpected VTs!");
4360 return splitVectorOp(Op, DAG, dl);
4361}
4362
4363/// Break a binary integer operation into 2 half-sized ops and then
4364/// concatenate the result back.
4365static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4366 const SDLoc &dl) {
4367 // Assert that all the types match.
4368 [[maybe_unused]] EVT VT = Op.getValueType();
4369 assert(Op.getOperand(0).getValueType() == VT &&
4370 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4371 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4372 return splitVectorOp(Op, DAG, dl);
4373}
4374
4375// Helper for splitting operands of an operation to legal target size and
4376// apply a function on each part.
4377// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4378// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4379// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4380// The argument Builder is a function that will be applied on each split part:
4381// SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue> Ops)
4382template <typename F>
4383SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4384 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4385 F Builder, bool CheckBWI = true) {
4386 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4387 unsigned NumSubs = 1;
4388 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4389 (!CheckBWI && Subtarget.useAVX512Regs())) {
4390 if (VT.getSizeInBits() > 512) {
4391 NumSubs = VT.getSizeInBits() / 512;
4392 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4393 }
4394 } else if (Subtarget.hasAVX2()) {
4395 if (VT.getSizeInBits() > 256) {
4396 NumSubs = VT.getSizeInBits() / 256;
4397 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4398 }
4399 } else {
4400 if (VT.getSizeInBits() > 128) {
4401 NumSubs = VT.getSizeInBits() / 128;
4402 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4403 }
4404 }
4405
4406 if (NumSubs == 1)
4407 return Builder(DAG, DL, Ops);
4408
4409 SmallVector<SDValue, 4> Subs;
4410 for (unsigned i = 0; i != NumSubs; ++i) {
4411 SmallVector<SDValue, 2> SubOps;
4412 for (SDValue Op : Ops) {
4413 EVT OpVT = Op.getValueType();
4414 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4415 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4416 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4417 }
4418 Subs.push_back(Builder(DAG, DL, SubOps));
4419 }
4420 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4421}
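// Illustrative use (a sketch, not a call site from this file): lowering a wide
// binary op in legal-sized pieces, with the Builder lambda emitting one node
// per split part:
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS},
//       [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) {
//         return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//       });
// On an AVX2-only target with VT = v64i8 this invokes the lambda twice with
// v32i8 operands and concatenates the two results.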
4422
4423// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4424// targets.
4425static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4426 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4427 const X86Subtarget &Subtarget) {
4428 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4429 MVT SVT = VT.getScalarType();
4430
4431 // If we have a 32/64 splatted constant, splat it to DstTy to
4432 // encourage a foldable broadcast'd operand.
4433 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4434 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4435 // AVX512 broadcasts 32/64-bit operands.
4436 // TODO: Support float once getAVX512Node is used by fp-ops.
4437 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4439 return SDValue();
4440 // If we're not widening, only bother if the operand is being bitcast.
4441 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4442 return SDValue();
4443 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4444 APInt SplatValue, SplatUndef;
4445 unsigned SplatBitSize;
4446 bool HasAnyUndefs;
4447 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4448 HasAnyUndefs, OpEltSizeInBits) &&
4449 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4450 return DAG.getConstant(SplatValue, DL, DstVT);
4451 }
4452 return SDValue();
4453 };
4454
4455 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4456
4457 MVT DstVT = VT;
4458 if (Widen)
4459 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4460
4461 // Canonicalize src operands.
4462 SmallVector<SDValue> SrcOps(Ops);
4463 for (SDValue &Op : SrcOps) {
4464 MVT OpVT = Op.getSimpleValueType();
4465 // Just pass through scalar operands.
4466 if (!OpVT.isVector())
4467 continue;
4468 assert(OpVT == VT && "Vector type mismatch");
4469
4470 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4471 Op = BroadcastOp;
4472 continue;
4473 }
4474
4475 // Just widen the subvector by inserting into an undef wide vector.
4476 if (Widen)
4477 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4478 }
4479
4480 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4481
4482 // Perform the 512-bit op then extract the bottom subvector.
4483 if (Widen)
4484 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4485 return Res;
4486}
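// For example, on an AVX512F target without VLX a v4i32 node built through
// this helper is widened to v16i32, the 512-bit operation is emitted, and the
// low v4i32 subvector is extracted afterwards; splatted 32/64-bit integer
// constants are rebuilt at the destination type so they can still fold as
// broadcasts.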
4487
4488/// Insert i1-subvector to i1-vector.
4489static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4490 const X86Subtarget &Subtarget) {
4491
4492 SDLoc dl(Op);
4493 SDValue Vec = Op.getOperand(0);
4494 SDValue SubVec = Op.getOperand(1);
4495 SDValue Idx = Op.getOperand(2);
4496 unsigned IdxVal = Op.getConstantOperandVal(2);
4497
4498 // Inserting undef is a nop. We can just return the original vector.
4499 if (SubVec.isUndef())
4500 return Vec;
4501
4502 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4503 return Op;
4504
4505 MVT OpVT = Op.getSimpleValueType();
4506 unsigned NumElems = OpVT.getVectorNumElements();
4507 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4508
4509 // Extend to natively supported kshift.
4510 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4511
4512 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4513 // if necessary.
4514 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4515 // May need to promote to a legal type.
4516 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4517 DAG.getConstant(0, dl, WideOpVT),
4518 SubVec, Idx);
4519 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4520 }
4521
4522 MVT SubVecVT = SubVec.getSimpleValueType();
4523 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4524 assert(IdxVal + SubVecNumElems <= NumElems &&
4525 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4526 "Unexpected index value in INSERT_SUBVECTOR");
4527
4528 SDValue Undef = DAG.getUNDEF(WideOpVT);
4529
4530 if (IdxVal == 0) {
4531 // Zero lower bits of the Vec
4532 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4533 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4534 ZeroIdx);
4535 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4536 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4537 // Merge them together, SubVec should be zero extended.
4538 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4539 DAG.getConstant(0, dl, WideOpVT),
4540 SubVec, ZeroIdx);
4541 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4542 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4543 }
4544
4545 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4546 Undef, SubVec, ZeroIdx);
4547
4548 if (Vec.isUndef()) {
4549 assert(IdxVal != 0 && "Unexpected index");
4550 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4551 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4552 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4553 }
4554
4555 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4556 assert(IdxVal != 0 && "Unexpected index");
4557 // If upper elements of Vec are known undef, then just shift into place.
4558 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4559 [](SDValue V) { return V.isUndef(); })) {
4560 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4561 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4562 } else {
4563 NumElems = WideOpVT.getVectorNumElements();
4564 unsigned ShiftLeft = NumElems - SubVecNumElems;
4565 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4566 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4567 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4568 if (ShiftRight != 0)
4569 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4570 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4571 }
4572 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4573 }
4574
4575 // Simple case when we put the subvector in the upper part.
4576 if (IdxVal + SubVecNumElems == NumElems) {
4577 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4578 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4579 if (SubVecNumElems * 2 == NumElems) {
4580 // Special case, use legal zero extending insert_subvector. This allows
4581 // isel to optimize when bits are known zero.
4582 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4583 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4584 DAG.getConstant(0, dl, WideOpVT),
4585 Vec, ZeroIdx);
4586 } else {
4587 // Otherwise use explicit shifts to zero the bits.
4588 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4589 Undef, Vec, ZeroIdx);
4590 NumElems = WideOpVT.getVectorNumElements();
4591 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4592 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4593 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4594 }
4595 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4596 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4597 }
4598
4599 // Inserting into the middle is more complicated.
4600
4601 NumElems = WideOpVT.getVectorNumElements();
4602
4603 // Widen the vector if needed.
4604 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4605
4606 unsigned ShiftLeft = NumElems - SubVecNumElems;
4607 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4608
4609 // Do an optimization for the most frequently used types.
4610 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4611 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4612 Mask0.flipAllBits();
4613 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4614 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4615 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4616 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4617 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4618 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4619 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4620 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4621
4622 // Reduce to original width if needed.
4623 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4624 }
4625
4626 // Clear the upper bits of the subvector and move it to its insert position.
4627 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4628 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4629 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4630 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4631
4632 // Isolate the bits below the insertion point.
4633 unsigned LowShift = NumElems - IdxVal;
4634 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4635 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4636 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4637 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4638
4639 // Isolate the bits after the last inserted bit.
4640 unsigned HighShift = IdxVal + SubVecNumElems;
4641 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4642 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4643 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4644 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4645
4646 // Now OR all 3 pieces together.
4647 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4648 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4649
4650 // Reduce to original width if needed.
4651 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4652}
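// Worked example of the shift arithmetic above: inserting a v2i1 subvector at
// index 2 of a v8i1 vector (v8i1 already being a legal kshift width on DQI
// targets) gives ShiftLeft = 8 - 2 = 6 and ShiftRight = 8 - 2 - 2 = 4, so
// KSHIFTL by 6 clears the subvector's upper bits and KSHIFTR by 4 lands them
// at bit positions 2 and 3 before the final OR with the masked original
// vector.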
4653
4654static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4655 const SDLoc &dl) {
4656 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4657 EVT SubVT = V1.getValueType();
4658 EVT SubSVT = SubVT.getScalarType();
4659 unsigned SubNumElts = SubVT.getVectorNumElements();
4660 unsigned SubVectorWidth = SubVT.getSizeInBits();
4661 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4662 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4663 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4664}
4665
4666/// Returns a vector of specified type with all bits set.
4667/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4668/// Then bitcast to their original type, ensuring they get CSE'd.
4669static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4670 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4671 "Expected a 128/256/512-bit vector type");
4672 unsigned NumElts = VT.getSizeInBits() / 32;
4673 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4674 return DAG.getBitcast(VT, Vec);
4675}
4676
4677static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4678 SDValue In, SelectionDAG &DAG) {
4679 EVT InVT = In.getValueType();
4680 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4681 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4682 ISD::ZERO_EXTEND == Opcode) &&
4683 "Unknown extension opcode");
4684
4685 // For 256-bit vectors, we only need the lower (128-bit) input half.
4686 // For 512-bit vectors, we only need the lower input half or quarter.
4687 if (InVT.getSizeInBits() > 128) {
4688 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4689 "Expected VTs to be the same size!");
4690 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4691 In = extractSubVector(In, 0, DAG, DL,
4692 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4693 InVT = In.getValueType();
4694 }
4695
4696 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4697 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4698
4699 return DAG.getNode(Opcode, DL, VT, In);
4700}
4701
4702// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4703static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4704 SDValue Mask, SelectionDAG &DAG) {
4705 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4706 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4707 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4708}
4709
4710static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4711 bool Lo, bool Unary) {
4712 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4713 "Illegal vector type to unpack");
4714 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4715 int NumElts = VT.getVectorNumElements();
4716 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4717 for (int i = 0; i < NumElts; ++i) {
4718 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4719 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4720 Pos += (Unary ? 0 : NumElts * (i % 2));
4721 Pos += (Lo ? 0 : NumEltsInLane / 2);
4722 Mask.push_back(Pos);
4723 }
4724}
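// For example, for VT = v8i16 this produces the PUNPCKLWD/PUNPCKHWD patterns:
//   Lo, binary: <0, 8, 1, 9, 2, 10, 3, 11>
//   Hi, binary: <4, 12, 5, 13, 6, 14, 7, 15>
// For 256/512-bit types the same pattern repeats within each 128-bit lane.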
4725
4726/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4727/// imposed by AVX and specific to the unary pattern. Example:
4728/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4729/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4730static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4731 bool Lo) {
4732 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4733 int NumElts = VT.getVectorNumElements();
4734 for (int i = 0; i < NumElts; ++i) {
4735 int Pos = i / 2;
4736 Pos += (Lo ? 0 : NumElts / 2);
4737 Mask.push_back(Pos);
4738 }
4739}
4740
4741// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4742static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4743 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4744 if (ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) &&
4745 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4746 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4747 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4748 int M = Mask[I];
4749 if (M < 0)
4750 continue;
4751 SDValue V = (M < NumElts) ? V1 : V2;
4752 if (V.isUndef())
4753 continue;
4754 Ops[I] = V.getOperand(M % NumElts);
4755 }
4756 return DAG.getBuildVector(VT, dl, Ops);
4757 }
4758
4759 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4760}
4761
4762/// Returns a vector_shuffle node for an unpackl operation.
4763static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4764 SDValue V1, SDValue V2) {
4765 SmallVector<int, 8> Mask;
4766 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4767 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4768}
4769
4770/// Returns a vector_shuffle node for an unpackh operation.
4771static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4772 SDValue V1, SDValue V2) {
4773 SmallVector<int, 8> Mask;
4774 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4775 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4776}
4777
4778/// Returns a node that packs the LHS + RHS nodes together at half width.
4779/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4780/// TODO: Add subvector splitting if/when we have a need for it.
4781static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4782 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4783 bool PackHiHalf = false) {
4784 MVT OpVT = LHS.getSimpleValueType();
4785 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4786 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4787 assert(OpVT == RHS.getSimpleValueType() &&
4788 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4789 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4790 "Unexpected PACK operand types");
4791 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4792 "Unexpected PACK result type");
4793
4794 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4795 if (EltSizeInBits == 32) {
4796 SmallVector<int> PackMask;
4797 int Offset = PackHiHalf ? 1 : 0;
4798 int NumElts = VT.getVectorNumElements();
4799 for (int I = 0; I != NumElts; I += 4) {
4800 PackMask.push_back(I + Offset);
4801 PackMask.push_back(I + Offset + 2);
4802 PackMask.push_back(I + Offset + NumElts);
4803 PackMask.push_back(I + Offset + NumElts + 2);
4804 }
4805 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4806 DAG.getBitcast(VT, RHS), PackMask);
4807 }
4808
4809 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4810 if (!PackHiHalf) {
4811 if (UsePackUS &&
4812 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4813 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4814 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4815
4816 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4817 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4818 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4819 }
4820
4821 // Fallback to sign/zero extending the requested half and pack.
4822 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4823 if (UsePackUS) {
4824 if (PackHiHalf) {
4825 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4826 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4827 } else {
4828 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4829 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4830 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4831 }
4832 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4833 }
4834
4835 if (!PackHiHalf) {
4836 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4837 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4838 }
4839 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4840 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4841 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4842}
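// For example, packing two v8i32 operands into a v16i16 result first tries a
// direct PACKUS/PACKSS of the low halves when the known/significant bits of
// both inputs already fit in 16 bits; otherwise the requested half is isolated
// first (AND mask or VSRLI for the unsigned path, VSHLI+VSRAI sign-extension
// for the signed path) before packing.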
4843
4844/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4845/// This produces a shuffle where the low element of V2 is swizzled into the
4846/// zero/undef vector, landing at element Idx.
4847/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4848static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4849 bool IsZero,
4850 const X86Subtarget &Subtarget,
4851 SelectionDAG &DAG) {
4852 MVT VT = V2.getSimpleValueType();
4853 SDValue V1 = IsZero
4854 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4855 int NumElems = VT.getVectorNumElements();
4856 SmallVector<int, 16> MaskVec(NumElems);
4857 for (int i = 0; i != NumElems; ++i)
4858 // If this is the insertion idx, put the low elt of V2 here.
4859 MaskVec[i] = (i == Idx) ? NumElems : i;
4860 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4861}
4862
4863static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4864 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4865 Ptr.getOpcode() == X86ISD::WrapperRIP)
4866 Ptr = Ptr.getOperand(0);
4867 return dyn_cast<ConstantPoolSDNode>(Ptr);
4868}
4869
4870// TODO: Add support for non-zero offsets.
4871static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4872 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4873 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4874 return nullptr;
4875 return CNode->getConstVal();
4876}
4877
4878static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4879 if (!Load || !ISD::isNormalLoad(Load))
4880 return nullptr;
4881 return getTargetConstantFromBasePtr(Load->getBasePtr());
4882}
4883
4884static const Constant *getTargetConstantFromNode(SDValue Op) {
4885 Op = peekThroughBitcasts(Op);
4886 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4887}
4888
4889const Constant *
4890X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4891 assert(LD && "Unexpected null LoadSDNode");
4892 return getTargetConstantFromNode(LD);
4893}
4894
4895// Extract raw constant bits from constant pools.
4896static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4897 APInt &UndefElts,
4898 SmallVectorImpl<APInt> &EltBits,
4899 bool AllowWholeUndefs = true,
4900 bool AllowPartialUndefs = false) {
4901 assert(EltBits.empty() && "Expected an empty EltBits vector");
4902
4903 Op = peekThroughBitcasts(Op);
4904
4905 EVT VT = Op.getValueType();
4906 unsigned SizeInBits = VT.getSizeInBits();
4907 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4908 unsigned NumElts = SizeInBits / EltSizeInBits;
4909
4910 // Bitcast a source array of element bits to the target size.
4911 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4912 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4913 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4914 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4915 "Constant bit sizes don't match");
4916
4917 // Don't split if we don't allow undef bits.
4918 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4919 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4920 return false;
4921
4922 // If we're already the right size, don't bother bitcasting.
4923 if (NumSrcElts == NumElts) {
4924 UndefElts = UndefSrcElts;
4925 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4926 return true;
4927 }
4928
4929 // Extract all the undef/constant element data and pack into single bitsets.
4930 APInt UndefBits(SizeInBits, 0);
4931 APInt MaskBits(SizeInBits, 0);
4932
4933 for (unsigned i = 0; i != NumSrcElts; ++i) {
4934 unsigned BitOffset = i * SrcEltSizeInBits;
4935 if (UndefSrcElts[i])
4936 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4937 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4938 }
4939
4940 // Split the undef/constant single bitset data into the target elements.
4941 UndefElts = APInt(NumElts, 0);
4942 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4943
4944 for (unsigned i = 0; i != NumElts; ++i) {
4945 unsigned BitOffset = i * EltSizeInBits;
4946 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4947
4948 // Only treat an element as UNDEF if all bits are UNDEF.
4949 if (UndefEltBits.isAllOnes()) {
4950 if (!AllowWholeUndefs)
4951 return false;
4952 UndefElts.setBit(i);
4953 continue;
4954 }
4955
4956 // If only some bits are UNDEF then treat them as zero (or bail if not
4957 // supported).
4958 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4959 return false;
4960
4961 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4962 }
4963 return true;
4964 };
4965
4966 // Collect constant bits and insert into mask/undef bit masks.
4967 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4968 unsigned UndefBitIndex) {
4969 if (!Cst)
4970 return false;
4971 if (isa<UndefValue>(Cst)) {
4972 Undefs.setBit(UndefBitIndex);
4973 return true;
4974 }
4975 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4976 Mask = CInt->getValue();
4977 return true;
4978 }
4979 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4980 Mask = CFP->getValueAPF().bitcastToAPInt();
4981 return true;
4982 }
4983 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4984 Type *Ty = CDS->getType();
4985 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4986 Type *EltTy = CDS->getElementType();
4987 bool IsInteger = EltTy->isIntegerTy();
4988 bool IsFP =
4989 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4990 if (!IsInteger && !IsFP)
4991 return false;
4992 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4993 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4994 if (IsInteger)
4995 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4996 else
4997 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4998 I * EltBits);
4999 return true;
5000 }
5001 return false;
5002 };
5003
5004 // Handle UNDEFs.
5005 if (Op.isUndef()) {
5006 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5007 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5008 return CastBitData(UndefSrcElts, SrcEltBits);
5009 }
5010
5011 // Extract scalar constant bits.
5012 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5013 APInt UndefSrcElts = APInt::getZero(1);
5014 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5015 return CastBitData(UndefSrcElts, SrcEltBits);
5016 }
5017 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5018 APInt UndefSrcElts = APInt::getZero(1);
5019 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5020 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5021 return CastBitData(UndefSrcElts, SrcEltBits);
5022 }
5023
5024 // Extract constant bits from build vector.
5025 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5026 BitVector Undefs;
5027 SmallVector<APInt> SrcEltBits;
5028 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5029 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5030 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5031 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5032 if (Undefs[I])
5033 UndefSrcElts.setBit(I);
5034 return CastBitData(UndefSrcElts, SrcEltBits);
5035 }
5036 }
5037
5038 // Extract constant bits from constant pool vector.
5039 if (auto *Cst = getTargetConstantFromNode(Op)) {
5040 Type *CstTy = Cst->getType();
5041 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5042 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5043 return false;
5044
5045 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5046 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5047 if ((SizeInBits % SrcEltSizeInBits) != 0)
5048 return false;
5049
5050 APInt UndefSrcElts(NumSrcElts, 0);
5051 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5052 for (unsigned i = 0; i != NumSrcElts; ++i)
5053 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5054 UndefSrcElts, i))
5055 return false;
5056
5057 return CastBitData(UndefSrcElts, SrcEltBits);
5058 }
5059
5060 // Extract constant bits from a broadcasted constant pool scalar.
5061 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5062 EltSizeInBits <= VT.getScalarSizeInBits()) {
5063 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5064 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5065 return false;
5066
5067 SDValue Ptr = MemIntr->getBasePtr();
5068 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5069 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5070 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5071
5072 APInt UndefSrcElts(NumSrcElts, 0);
5073 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5074 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5075 if (UndefSrcElts[0])
5076 UndefSrcElts.setBits(0, NumSrcElts);
5077 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5078 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5079 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5080 return CastBitData(UndefSrcElts, SrcEltBits);
5081 }
5082 }
5083 }
5084
5085 // Extract constant bits from a subvector broadcast.
5086 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5087 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5088 SDValue Ptr = MemIntr->getBasePtr();
5089 // The source constant may be larger than the subvector broadcast,
5090 // ensure we extract the correct subvector constants.
5091 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5092 Type *CstTy = Cst->getType();
5093 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5094 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5095 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5096 (SizeInBits % SubVecSizeInBits) != 0)
5097 return false;
5098 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5099 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5100 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5101 APInt UndefSubElts(NumSubElts, 0);
5102 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5103 APInt(CstEltSizeInBits, 0));
5104 for (unsigned i = 0; i != NumSubElts; ++i) {
5105 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5106 UndefSubElts, i))
5107 return false;
5108 for (unsigned j = 1; j != NumSubVecs; ++j)
5109 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5110 }
5111 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5112 UndefSubElts);
5113 return CastBitData(UndefSubElts, SubEltBits);
5114 }
5115 }
5116
5117 // Extract a rematerialized scalar constant insertion.
5118 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5119 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5120 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5121 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5122 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5123
5124 APInt UndefSrcElts(NumSrcElts, 0);
5125 SmallVector<APInt, 64> SrcEltBits;
5126 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5127 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5128 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5129 return CastBitData(UndefSrcElts, SrcEltBits);
5130 }
5131
5132 // Insert constant bits from a base and sub vector sources.
5133 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5134 // If bitcasting to larger elements we might lose track of undefs, so to
5135 // be safe don't allow any.
5136 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5137 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5138
5139 APInt UndefSrcElts, UndefSubElts;
5140 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5141 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5142 UndefSubElts, EltSubBits,
5143 AllowWholeUndefs && AllowUndefs,
5144 AllowPartialUndefs && AllowUndefs) &&
5145 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5146 UndefSrcElts, EltSrcBits,
5147 AllowWholeUndefs && AllowUndefs,
5148 AllowPartialUndefs && AllowUndefs)) {
5149 unsigned BaseIdx = Op.getConstantOperandVal(2);
5150 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5151 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5152 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5153 return CastBitData(UndefSrcElts, EltSrcBits);
5154 }
5155 }
5156
5157 // Extract constant bits from a subvector's source.
5158 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
5159 // TODO - support extract_subvector through bitcasts.
5160 if (EltSizeInBits != VT.getScalarSizeInBits())
5161 return false;
5162
5163 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5164 UndefElts, EltBits, AllowWholeUndefs,
5165 AllowPartialUndefs)) {
5166 EVT SrcVT = Op.getOperand(0).getValueType();
5167 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5168 unsigned NumSubElts = VT.getVectorNumElements();
5169 unsigned BaseIdx = Op.getConstantOperandVal(1);
5170 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5171 if ((BaseIdx + NumSubElts) != NumSrcElts)
5172 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5173 if (BaseIdx != 0)
5174 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5175 return true;
5176 }
5177 }
5178
5179 // Extract constant bits from shuffle node sources.
5180 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5181 // TODO - support shuffle through bitcasts.
5182 if (EltSizeInBits != VT.getScalarSizeInBits())
5183 return false;
5184
5185 ArrayRef<int> Mask = SVN->getMask();
5186 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5187 llvm::any_of(Mask, [](int M) { return M < 0; }))
5188 return false;
5189
5190 APInt UndefElts0, UndefElts1;
5191 SmallVector<APInt, 32> EltBits0, EltBits1;
5192 if (isAnyInRange(Mask, 0, NumElts) &&
5193 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5194 UndefElts0, EltBits0, AllowWholeUndefs,
5195 AllowPartialUndefs))
5196 return false;
5197 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5198 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5199 UndefElts1, EltBits1, AllowWholeUndefs,
5200 AllowPartialUndefs))
5201 return false;
5202
5203 UndefElts = APInt::getZero(NumElts);
5204 for (int i = 0; i != (int)NumElts; ++i) {
5205 int M = Mask[i];
5206 if (M < 0) {
5207 UndefElts.setBit(i);
5208 EltBits.push_back(APInt::getZero(EltSizeInBits));
5209 } else if (M < (int)NumElts) {
5210 if (UndefElts0[M])
5211 UndefElts.setBit(i);
5212 EltBits.push_back(EltBits0[M]);
5213 } else {
5214 if (UndefElts1[M - NumElts])
5215 UndefElts.setBit(i);
5216 EltBits.push_back(EltBits1[M - NumElts]);
5217 }
5218 }
5219 return true;
5220 }
5221
5222 return false;
5223}
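// For example, querying a v4i32 build vector of {1, 2, 3, 4} with
// EltSizeInBits = 64 repacks the raw bits into two 64-bit elements,
// 0x0000000200000001 and 0x0000000400000003. An element is reported as undef
// only when every source bit covering it is undef; partially-undef elements
// are treated as zero when AllowPartialUndefs is set and rejected otherwise.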
5224
5225namespace llvm {
5226namespace X86 {
5227bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5228 APInt UndefElts;
5229 SmallVector<APInt, 16> EltBits;
5230 if (getTargetConstantBitsFromNode(
5231 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5232 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5233 int SplatIndex = -1;
5234 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5235 if (UndefElts[i])
5236 continue;
5237 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5238 SplatIndex = -1;
5239 break;
5240 }
5241 SplatIndex = i;
5242 }
5243 if (0 <= SplatIndex) {
5244 SplatVal = EltBits[SplatIndex];
5245 return true;
5246 }
5247 }
5248
5249 return false;
5250}
5251} // namespace X86
5252} // namespace llvm
5253
5254static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5255 unsigned MaskEltSizeInBits,
5256 SmallVectorImpl<uint64_t> &RawMask,
5257 APInt &UndefElts) {
5258 // Extract the raw target constant bits.
5259 SmallVector<APInt, 64> EltBits;
5260 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5261 EltBits, /* AllowWholeUndefs */ true,
5262 /* AllowPartialUndefs */ false))
5263 return false;
5264
5265 // Insert the extracted elements into the mask.
5266 for (const APInt &Elt : EltBits)
5267 RawMask.push_back(Elt.getZExtValue());
5268
5269 return true;
5270}
5271
5272static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5273 bool AllowUndefs) {
5274 APInt UndefElts;
5275 SmallVector<APInt, 64> EltBits;
5276 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5277 /*AllowWholeUndefs*/ AllowUndefs,
5278 /*AllowPartialUndefs*/ false))
5279 return false;
5280
5281 bool IsPow2OrUndef = true;
5282 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5283 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5284 return IsPow2OrUndef;
5285}
5286
5287// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5288static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5289 // TODO: don't always ignore oneuse constraints.
5290 V = peekThroughBitcasts(V);
5291 EVT VT = V.getValueType();
5292
5293 // Match not(xor X, -1) -> X.
5294 if (V.getOpcode() == ISD::XOR &&
5295 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5296 isAllOnesConstant(V.getOperand(1))))
5297 return V.getOperand(0);
5298
5299 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5300 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5301 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5302 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5303 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5304 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5305 V.getOperand(1));
5306 }
5307 }
5308
5309 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5310 if (V.getOpcode() == X86ISD::PCMPGT &&
5311 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5312 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5313 V.getOperand(0).hasOneUse()) {
5314 APInt UndefElts;
5315 SmallVector<APInt> EltBits;
5316 if (getTargetConstantBitsFromNode(V.getOperand(0),
5317 V.getScalarValueSizeInBits(), UndefElts,
5318 EltBits) &&
5319 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5320 // Don't fold min_signed_value -> (min_signed_value - 1)
5321 bool MinSigned = false;
5322 for (APInt &Elt : EltBits) {
5323 MinSigned |= Elt.isMinSignedValue();
5324 Elt -= 1;
5325 }
5326 if (!MinSigned) {
5327 SDLoc DL(V);
5328 MVT VT = V.getSimpleValueType();
5329 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5330 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5331 }
5332 }
5333 }
5334
5335 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5336 SmallVector<SDValue, 2> CatOps;
5337 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5338 for (SDValue &CatOp : CatOps) {
5339 SDValue NotCat = IsNOT(CatOp, DAG);
5340 if (!NotCat)
5341 return SDValue();
5342 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5343 }
5344 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5345 }
5346
5347 // Match not(or(not(X),not(Y))) -> and(X, Y).
5348 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5349 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5350 // TODO: Handle cases with single NOT operand -> ANDNP
5351 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5352 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5353 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5354 DAG.getBitcast(VT, Op1));
5355 }
5356
5357 return SDValue();
5358}
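// The PCMPGT rewrite above relies on a signed comparison identity: for C not
// equal to the minimum signed value, NOT(C > X) is (X >= C), which is the same
// as (X > C - 1), so the inverted compare can be re-expressed with the
// operands swapped and the constant decremented (hence the MinSigned bail-out).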
5359
5360/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5361/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5362/// Note: This ignores saturation, so inputs must be checked first.
5363static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5364 bool Unary, unsigned NumStages = 1) {
5365 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5366 unsigned NumElts = VT.getVectorNumElements();
5367 unsigned NumLanes = VT.getSizeInBits() / 128;
5368 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5369 unsigned Offset = Unary ? 0 : NumElts;
5370 unsigned Repetitions = 1u << (NumStages - 1);
5371 unsigned Increment = 1u << NumStages;
5372 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5373
5374 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5375 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5376 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5377 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5378 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5379 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5380 }
5381 }
5382}
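// For example, a single-stage binary pack to v16i8 yields
//   <0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>
// i.e. the even (truncated) elements of the LHS followed by those of the RHS,
// repeated per 128-bit lane for wider types.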
5383
5384// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5385static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5386 APInt &DemandedLHS, APInt &DemandedRHS) {
5387 int NumLanes = VT.getSizeInBits() / 128;
5388 int NumElts = DemandedElts.getBitWidth();
5389 int NumInnerElts = NumElts / 2;
5390 int NumEltsPerLane = NumElts / NumLanes;
5391 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5392
5393 DemandedLHS = APInt::getZero(NumInnerElts);
5394 DemandedRHS = APInt::getZero(NumInnerElts);
5395
5396 // Map DemandedElts to the packed operands.
5397 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5398 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5399 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5400 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5401 if (DemandedElts[OuterIdx])
5402 DemandedLHS.setBit(InnerIdx);
5403 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5404 DemandedRHS.setBit(InnerIdx);
5405 }
5406 }
5407}
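// For example, with a v32i8 pack result (two 128-bit lanes), demanding result
// element 19 (lane 1, position 3) demands element 11 of the LHS operand, while
// demanding result element 27 (lane 1, position 11) demands element 11 of the
// RHS operand.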
5408
5409// Split the demanded elts of a HADD/HSUB node between its operands.
5410static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5411 APInt &DemandedLHS, APInt &DemandedRHS) {
5413 DemandedLHS, DemandedRHS);
5414 DemandedLHS |= DemandedLHS << 1;
5415 DemandedRHS |= DemandedRHS << 1;
5416}
5417
5418/// Calculates the shuffle mask corresponding to the target-specific opcode.
5419/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5420/// operands in \p Ops, and returns true.
5421/// Sets \p IsUnary to true if only one source is used. Note that this will set
5422/// IsUnary for shuffles which use a single input multiple times, and in those
5423/// cases it will adjust the mask to only have indices within that single input.
5424/// It is an error to call this with non-empty Mask/Ops vectors.
5425static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5426 SmallVectorImpl<SDValue> &Ops,
5427 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5428 if (!isTargetShuffle(N.getOpcode()))
5429 return false;
5430
5431 MVT VT = N.getSimpleValueType();
5432 unsigned NumElems = VT.getVectorNumElements();
5433 unsigned MaskEltSize = VT.getScalarSizeInBits();
5434 SmallVector<uint64_t, 32> RawMask;
5435 APInt RawUndefs;
5436 uint64_t ImmN;
5437
5438 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5439 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5440
5441 IsUnary = false;
5442 bool IsFakeUnary = false;
5443 switch (N.getOpcode()) {
5444 case X86ISD::BLENDI:
5445 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5446 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5447 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5448 DecodeBLENDMask(NumElems, ImmN, Mask);
5449 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5450 break;
5451 case X86ISD::SHUFP:
5452 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5453 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5454 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5455 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5456 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5457 break;
5458 case X86ISD::INSERTPS:
5459 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5460 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5461 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5462 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5463 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5464 break;
5465 case X86ISD::EXTRQI:
5466 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5467 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5468 isa<ConstantSDNode>(N.getOperand(2))) {
5469 int BitLen = N.getConstantOperandVal(1);
5470 int BitIdx = N.getConstantOperandVal(2);
5471 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5472 IsUnary = true;
5473 }
5474 break;
5475 case X86ISD::INSERTQI:
5476 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5477 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5478 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5479 isa<ConstantSDNode>(N.getOperand(3))) {
5480 int BitLen = N.getConstantOperandVal(2);
5481 int BitIdx = N.getConstantOperandVal(3);
5482 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5483 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5484 }
5485 break;
5486 case X86ISD::UNPCKH:
5487 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5488 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5489 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5490 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5491 break;
5492 case X86ISD::UNPCKL:
5493 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5494 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5495 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5496 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5497 break;
5498 case X86ISD::MOVHLPS:
5499 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5500 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5501 DecodeMOVHLPSMask(NumElems, Mask);
5502 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5503 break;
5504 case X86ISD::MOVLHPS:
5505 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5506 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5507 DecodeMOVLHPSMask(NumElems, Mask);
5508 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5509 break;
5510 case X86ISD::VALIGN:
5511 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5512 "Only 32-bit and 64-bit elements are supported!");
5513 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5514 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5515 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5516 DecodeVALIGNMask(NumElems, ImmN, Mask);
5517 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5518 Ops.push_back(N.getOperand(1));
5519 Ops.push_back(N.getOperand(0));
5520 break;
5521 case X86ISD::PALIGNR:
5522 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5523 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5524 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5525 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5526 DecodePALIGNRMask(NumElems, ImmN, Mask);
5527 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5528 Ops.push_back(N.getOperand(1));
5529 Ops.push_back(N.getOperand(0));
5530 break;
5531 case X86ISD::VSHLDQ:
5532 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5533 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5534 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5535 DecodePSLLDQMask(NumElems, ImmN, Mask);
5536 IsUnary = true;
5537 break;
5538 case X86ISD::VSRLDQ:
5539 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5540 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5541 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5542 DecodePSRLDQMask(NumElems, ImmN, Mask);
5543 IsUnary = true;
5544 break;
5545 case X86ISD::PSHUFD:
5546 case X86ISD::VPERMILPI:
5547 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5548 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5549 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5550 IsUnary = true;
5551 break;
5552 case X86ISD::PSHUFHW:
5553 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5554 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5555 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5556 IsUnary = true;
5557 break;
5558 case X86ISD::PSHUFLW:
5559 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5560 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5561 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5562 IsUnary = true;
5563 break;
5564 case X86ISD::VZEXT_MOVL:
5565 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5566 DecodeZeroMoveLowMask(NumElems, Mask);
5567 IsUnary = true;
5568 break;
5569 case X86ISD::VBROADCAST:
5570 // We only decode broadcasts of same-sized vectors; peeking through to
5571 // extracted subvectors is likely to cause hasOneUse issues with
5572 // SimplifyDemandedBits etc.
5573 if (N.getOperand(0).getValueType() == VT) {
5574 DecodeVectorBroadcast(NumElems, Mask);
5575 IsUnary = true;
5576 break;
5577 }
5578 return false;
5579 case X86ISD::VPERMILPV: {
5580 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5581 IsUnary = true;
5582 SDValue MaskNode = N.getOperand(1);
5583 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5584 RawUndefs)) {
5585 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5586 break;
5587 }
5588 return false;
5589 }
5590 case X86ISD::PSHUFB: {
5591 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5592 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5593 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5594 IsUnary = true;
5595 SDValue MaskNode = N.getOperand(1);
5596 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5597 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5598 break;
5599 }
5600 return false;
5601 }
5602 case X86ISD::VPERMI:
5603 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5604 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5605 DecodeVPERMMask(NumElems, ImmN, Mask);
5606 IsUnary = true;
5607 break;
5608 case X86ISD::MOVSS:
5609 case X86ISD::MOVSD:
5610 case X86ISD::MOVSH:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5614 break;
5615 case X86ISD::VPERM2X128:
5616 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5617 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5618 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5619 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5620 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5621 break;
5622 case X86ISD::SHUF128:
5623 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5624 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5625 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5626 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5627 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5628 break;
5629 case X86ISD::MOVSLDUP:
5630 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5631 DecodeMOVSLDUPMask(NumElems, Mask);
5632 IsUnary = true;
5633 break;
5634 case X86ISD::MOVSHDUP:
5635 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5636 DecodeMOVSHDUPMask(NumElems, Mask);
5637 IsUnary = true;
5638 break;
5639 case X86ISD::MOVDDUP:
5640 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5641 DecodeMOVDDUPMask(NumElems, Mask);
5642 IsUnary = true;
5643 break;
5644 case X86ISD::VPERMIL2: {
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5647 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5648 SDValue MaskNode = N.getOperand(2);
5649 SDValue CtrlNode = N.getOperand(3);
5650 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5651 unsigned CtrlImm = CtrlOp->getZExtValue();
5652 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5653 RawUndefs)) {
5654 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5655 Mask);
5656 break;
5657 }
5658 }
5659 return false;
5660 }
5661 case X86ISD::VPPERM: {
5662 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5663 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5664 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5665 SDValue MaskNode = N.getOperand(2);
5666 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5667 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5668 break;
5669 }
5670 return false;
5671 }
5672 case X86ISD::VPERMV: {
5673 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5674 IsUnary = true;
5675 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5676 Ops.push_back(N.getOperand(1));
5677 SDValue MaskNode = N.getOperand(0);
5678 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5679 RawUndefs)) {
5680 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5681 break;
5682 }
5683 return false;
5684 }
5685 case X86ISD::VPERMV3: {
5686 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5687 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5688 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5689 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5690 Ops.push_back(N.getOperand(0));
5691 Ops.push_back(N.getOperand(2));
5692 SDValue MaskNode = N.getOperand(1);
5693 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5694 RawUndefs)) {
5695 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5696 break;
5697 }
5698 return false;
5699 }
5700 default:
5701 llvm_unreachable("unknown target shuffle node");
5702 }
5703
5704 // Empty mask indicates the decode failed.
5705 if (Mask.empty())
5706 return false;
5707
5708 // Check if we're getting a shuffle mask with zero'd elements.
5709 if (!AllowSentinelZero && isAnyZero(Mask))
5710 return false;
5711
5712 // If we have a fake unary shuffle, the shuffle mask is spread across two
5713 // inputs that are actually the same node. Re-map the mask to always point
5714 // into the first input.
5715 if (IsFakeUnary)
5716 for (int &M : Mask)
5717 if (M >= (int)Mask.size())
5718 M -= Mask.size();
5719
5720 // If we didn't already add operands in the opcode-specific code, default to
5721 // adding 1 or 2 operands starting at 0.
5722 if (Ops.empty()) {
5723 Ops.push_back(N.getOperand(0));
5724 if (!IsUnary || IsFakeUnary)
5725 Ops.push_back(N.getOperand(1));
5726 }
5727
5728 return true;
5729}
5730
5731// Wrapper for getTargetShuffleMask that does not require the IsUnary output.
5732static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5733 SmallVectorImpl<SDValue> &Ops,
5734 SmallVectorImpl<int> &Mask) {
5735 bool IsUnary;
5736 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5737}
5738
5739/// Compute whether each element of a shuffle is zeroable.
5740///
5741/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5742/// Either it is an undef element in the shuffle mask, the element of the input
5743/// referenced is undef, or the element of the input referenced is known to be
5744/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5745/// as many lanes with this technique as possible to simplify the remaining
5746/// shuffle.
5747 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5748 SDValue V1, SDValue V2,
5749 APInt &KnownUndef, APInt &KnownZero) {
5750 int Size = Mask.size();
5751 KnownUndef = KnownZero = APInt::getZero(Size);
5752
5753 V1 = peekThroughBitcasts(V1);
5754 V2 = peekThroughBitcasts(V2);
5755
5756 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5757 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5758
5759 int VectorSizeInBits = V1.getValueSizeInBits();
5760 int ScalarSizeInBits = VectorSizeInBits / Size;
5761 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5762
5763 for (int i = 0; i < Size; ++i) {
5764 int M = Mask[i];
5765 // Handle the easy cases.
5766 if (M < 0) {
5767 KnownUndef.setBit(i);
5768 continue;
5769 }
5770 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5771 KnownZero.setBit(i);
5772 continue;
5773 }
5774
5775 // Determine shuffle input and normalize the mask.
5776 SDValue V = M < Size ? V1 : V2;
5777 M %= Size;
5778
5779 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5780 if (V.getOpcode() != ISD::BUILD_VECTOR)
5781 continue;
5782
5783 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
5784 // portion of the (larger) source element must be UNDEF/ZERO.
5785 if ((Size % V.getNumOperands()) == 0) {
5786 int Scale = Size / V->getNumOperands();
5787 SDValue Op = V.getOperand(M / Scale);
5788 if (Op.isUndef())
5789 KnownUndef.setBit(i);
5790 if (X86::isZeroNode(Op))
5791 KnownZero.setBit(i);
5792 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5793 APInt Val = Cst->getAPIntValue();
5794 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5795 if (Val == 0)
5796 KnownZero.setBit(i);
5797 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5798 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5799 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5800 if (Val == 0)
5801 KnownZero.setBit(i);
5802 }
5803 continue;
5804 }
5805
5806 // If the BUILD_VECTOR has more elements than the mask, then all of the
5807 // (smaller) source elements must be UNDEF or ZERO.
5808 if ((V.getNumOperands() % Size) == 0) {
5809 int Scale = V->getNumOperands() / Size;
5810 bool AllUndef = true;
5811 bool AllZero = true;
5812 for (int j = 0; j < Scale; ++j) {
5813 SDValue Op = V.getOperand((M * Scale) + j);
5814 AllUndef &= Op.isUndef();
5815 AllZero &= X86::isZeroNode(Op);
5816 }
5817 if (AllUndef)
5818 KnownUndef.setBit(i);
5819 if (AllZero)
5820 KnownZero.setBit(i);
5821 continue;
5822 }
5823 }
5824}
5825
5826/// Decode a target shuffle mask and inputs and see if any values are
5827/// known to be undef or zero from their inputs.
5828/// Returns true if the target shuffle mask was decoded.
5829/// FIXME: Merge this with computeZeroableShuffleElements?
5830 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5831 SmallVectorImpl<SDValue> &Ops,
5832 APInt &KnownUndef, APInt &KnownZero) {
5833 bool IsUnary;
5834 if (!isTargetShuffle(N.getOpcode()))
5835 return false;
5836
5837 MVT VT = N.getSimpleValueType();
5838 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5839 return false;
5840
5841 int Size = Mask.size();
5842 SDValue V1 = Ops[0];
5843 SDValue V2 = IsUnary ? V1 : Ops[1];
5844 KnownUndef = KnownZero = APInt::getZero(Size);
5845
5846 V1 = peekThroughBitcasts(V1);
5847 V2 = peekThroughBitcasts(V2);
5848
5849 assert((VT.getSizeInBits() % Size) == 0 &&
5850 "Illegal split of shuffle value type");
5851 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5852
5853 // Extract known constant input data.
5854 APInt UndefSrcElts[2];
5855 SmallVector<APInt, 32> SrcEltBits[2];
5856 bool IsSrcConstant[2] = {
5857 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5858 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5859 /*AllowPartialUndefs*/ false),
5860 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5861 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5862 /*AllowPartialUndefs*/ false)};
5863
5864 for (int i = 0; i < Size; ++i) {
5865 int M = Mask[i];
5866
5867 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5868 if (M < 0) {
5869 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5870 if (SM_SentinelUndef == M)
5871 KnownUndef.setBit(i);
5872 if (SM_SentinelZero == M)
5873 KnownZero.setBit(i);
5874 continue;
5875 }
5876
5877 // Determine shuffle input and normalize the mask.
5878 unsigned SrcIdx = M / Size;
5879 SDValue V = M < Size ? V1 : V2;
5880 M %= Size;
5881
5882 // We are referencing an UNDEF input.
5883 if (V.isUndef()) {
5884 KnownUndef.setBit(i);
5885 continue;
5886 }
5887
5888 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5889 // TODO: We currently only set UNDEF for integer types - floats use the same
5890 // registers as vectors and many of the scalar folded loads rely on the
5891 // SCALAR_TO_VECTOR pattern.
5892 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5893 (Size % V.getValueType().getVectorNumElements()) == 0) {
5894 int Scale = Size / V.getValueType().getVectorNumElements();
5895 int Idx = M / Scale;
5896 if (Idx != 0 && !VT.isFloatingPoint())
5897 KnownUndef.setBit(i);
5898 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5899 KnownZero.setBit(i);
5900 continue;
5901 }
5902
5903 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5904 // base vectors.
5905 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5906 SDValue Vec = V.getOperand(0);
5907 int NumVecElts = Vec.getValueType().getVectorNumElements();
5908 if (Vec.isUndef() && Size == NumVecElts) {
5909 int Idx = V.getConstantOperandVal(2);
5910 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5911 if (M < Idx || (Idx + NumSubElts) <= M)
5912 KnownUndef.setBit(i);
5913 }
5914 continue;
5915 }
5916
5917 // Attempt to extract from the source's constant bits.
5918 if (IsSrcConstant[SrcIdx]) {
5919 if (UndefSrcElts[SrcIdx][M])
5920 KnownUndef.setBit(i);
5921 else if (SrcEltBits[SrcIdx][M] == 0)
5922 KnownZero.setBit(i);
5923 }
5924 }
5925
5926 assert(VT.getVectorNumElements() == (unsigned)Size &&
5927 "Different mask size from vector size!");
5928 return true;
5929}
5930
5931// Replace target shuffle mask elements with known undef/zero sentinels.
5932 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5933 const APInt &KnownUndef,
5934 const APInt &KnownZero,
5935 bool ResolveKnownZeros = true) {
5936 unsigned NumElts = Mask.size();
5937 assert(KnownUndef.getBitWidth() == NumElts &&
5938 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5939
5940 for (unsigned i = 0; i != NumElts; ++i) {
5941 if (KnownUndef[i])
5942 Mask[i] = SM_SentinelUndef;
5943 else if (ResolveKnownZeros && KnownZero[i])
5944 Mask[i] = SM_SentinelZero;
5945 }
5946}
5947
5948// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5949 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5950 APInt &KnownUndef,
5951 APInt &KnownZero) {
5952 unsigned NumElts = Mask.size();
5953 KnownUndef = KnownZero = APInt::getZero(NumElts);
5954
5955 for (unsigned i = 0; i != NumElts; ++i) {
5956 int M = Mask[i];
5957 if (SM_SentinelUndef == M)
5958 KnownUndef.setBit(i);
5959 if (SM_SentinelZero == M)
5960 KnownZero.setBit(i);
5961 }
5962}
5963
5964// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
5965 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5966 SDValue Cond, bool IsBLENDV = false) {
5967 EVT CondVT = Cond.getValueType();
5968 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5969 unsigned NumElts = CondVT.getVectorNumElements();
5970
5971 APInt UndefElts;
5972 SmallVector<APInt, 32> EltBits;
5973 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5974 /*AllowWholeUndefs*/ true,
5975 /*AllowPartialUndefs*/ false))
5976 return false;
5977
5978 Mask.resize(NumElts, SM_SentinelUndef);
5979
5980 for (int i = 0; i != (int)NumElts; ++i) {
5981 Mask[i] = i;
5982 // Arbitrarily choose from the 2nd operand if the select condition element
5983 // is undef.
5984 // TODO: Can we do better by matching patterns such as even/odd?
5985 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5986 (IsBLENDV && EltBits[i].isNonNegative()))
5987 Mask[i] += NumElts;
5988 }
5989
5990 return true;
5991}
5992
5993// Forward declaration (for getFauxShuffleMask recursive check).
5994static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5995 SmallVectorImpl<SDValue> &Inputs,
5996 SmallVectorImpl<int> &Mask,
5997 const SelectionDAG &DAG, unsigned Depth,
5998 bool ResolveKnownElts);
5999
6000// Attempt to decode ops that could be represented as a shuffle mask.
6001// The decoded shuffle mask may contain a different number of elements to the
6002// destination value type.
6003// TODO: Merge into getTargetShuffleInputs()
6004static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6005 SmallVectorImpl<int> &Mask,
6006 SmallVectorImpl<SDValue> &Ops,
6007 const SelectionDAG &DAG, unsigned Depth,
6008 bool ResolveKnownElts) {
6009 Mask.clear();
6010 Ops.clear();
6011
6012 MVT VT = N.getSimpleValueType();
6013 unsigned NumElts = VT.getVectorNumElements();
6014 unsigned NumSizeInBits = VT.getSizeInBits();
6015 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6016 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6017 return false;
6018 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6019 unsigned NumSizeInBytes = NumSizeInBits / 8;
6020 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6021
6022 unsigned Opcode = N.getOpcode();
6023 switch (Opcode) {
6024 case ISD::VECTOR_SHUFFLE: {
6025 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6026 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6027 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6028 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6029 Ops.push_back(N.getOperand(0));
6030 Ops.push_back(N.getOperand(1));
6031 return true;
6032 }
6033 return false;
6034 }
6035 case ISD::AND:
6036 case X86ISD::ANDNP: {
6037 // Attempt to decode as a per-byte mask.
6038 APInt UndefElts;
6039 SmallVector<APInt, 32> EltBits;
6040 SDValue N0 = N.getOperand(0);
6041 SDValue N1 = N.getOperand(1);
6042 bool IsAndN = (X86ISD::ANDNP == Opcode);
6043 uint64_t ZeroMask = IsAndN ? 255 : 0;
6044 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6045 /*AllowWholeUndefs*/ false,
6046 /*AllowPartialUndefs*/ false))
6047 return false;
6048 // We can't assume an undef src element gives an undef dst - the other src
6049 // might be zero.
6050 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6051 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6052 const APInt &ByteBits = EltBits[i];
6053 if (ByteBits != 0 && ByteBits != 255)
6054 return false;
6055 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6056 }
6057 Ops.push_back(IsAndN ? N1 : N0);
6058 return true;
6059 }
6060 case ISD::OR: {
6061 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6062 // is a valid shuffle index.
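// Illustrative example: if Mask0 = {Z, 1} and Mask1 = {0, Z} (Z = known zero),
// lane 0 must come from N1 and lane 1 from N0, giving the combined mask {2, 1}.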
6063 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6064 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6065 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6066 return false;
6067
6068 SmallVector<int, 64> SrcMask0, SrcMask1;
6069 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6070 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6071 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6072 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6073 Depth + 1, true) ||
6074 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6075 Depth + 1, true))
6076 return false;
6077
6078 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6079 SmallVector<int, 64> Mask0, Mask1;
6080 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6081 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6082 for (int i = 0; i != (int)MaskSize; ++i) {
6083 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6084 // loops converting between OR and BLEND shuffles due to
6085 // canWidenShuffleElements merging away undef elements, meaning we
6086 // fail to recognise the OR as the undef element isn't known zero.
6087 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6088 Mask.push_back(SM_SentinelZero);
6089 else if (Mask1[i] == SM_SentinelZero)
6090 Mask.push_back(i);
6091 else if (Mask0[i] == SM_SentinelZero)
6092 Mask.push_back(i + MaskSize);
6093 else
6094 return false;
6095 }
6096 Ops.push_back(N0);
6097 Ops.push_back(N1);
6098 return true;
6099 }
6100 case ISD::INSERT_SUBVECTOR: {
6101 SDValue Src = N.getOperand(0);
6102 SDValue Sub = N.getOperand(1);
6103 EVT SubVT = Sub.getValueType();
6104 unsigned NumSubElts = SubVT.getVectorNumElements();
6105 if (!N->isOnlyUserOf(Sub.getNode()))
6106 return false;
6107 SDValue SubBC = peekThroughBitcasts(Sub);
6108 uint64_t InsertIdx = N.getConstantOperandVal(2);
6109 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6110 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6111 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6112 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
6113 SDValue SubBCSrc = SubBC.getOperand(0);
6114 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
6115 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
6116 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
6117 "Subvector valuetype mismatch");
6118 InsertIdx *= (MaxElts / NumElts);
6119 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
6120 NumSubElts *= (MaxElts / NumElts);
6121 bool SrcIsUndef = Src.isUndef();
6122 for (int i = 0; i != (int)MaxElts; ++i)
6123 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6124 for (int i = 0; i != (int)NumSubElts; ++i)
6125 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6126 if (!SrcIsUndef)
6127 Ops.push_back(Src);
6128 Ops.push_back(SubBCSrc);
6129 return true;
6130 }
6131 // Handle CONCAT(SUB0, SUB1).
6132 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
6133 // cross lane shuffles.
6134 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6135 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
6136 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6137 Src.getOperand(0).isUndef() &&
6138 Src.getOperand(1).getValueType() == SubVT &&
6139 Src.getConstantOperandVal(2) == 0) {
6140 for (int i = 0; i != (int)NumSubElts; ++i)
6141 Mask.push_back(i);
6142 for (int i = 0; i != (int)NumSubElts; ++i)
6143 Mask.push_back(i + NumElts);
6144 Ops.push_back(Src.getOperand(1));
6145 Ops.push_back(Sub);
6146 return true;
6147 }
6148 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6149 SmallVector<int, 64> SubMask;
6150 SmallVector<SDValue, 2> SubInputs;
6151 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6152 EVT SubSrcVT = SubSrc.getValueType();
6153 if (!SubSrcVT.isVector())
6154 return false;
6155
6156 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6157 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6158 Depth + 1, ResolveKnownElts))
6159 return false;
6160
6161 // Subvector shuffle inputs must not be larger than the subvector.
6162 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6163 return SubVT.getFixedSizeInBits() <
6164 SubInput.getValueSizeInBits().getFixedValue();
6165 }))
6166 return false;
6167
6168 if (SubMask.size() != NumSubElts) {
6169 assert(((SubMask.size() % NumSubElts) == 0 ||
6170 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
6171 if ((NumSubElts % SubMask.size()) == 0) {
6172 int Scale = NumSubElts / SubMask.size();
6173 SmallVector<int,64> ScaledSubMask;
6174 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6175 SubMask = ScaledSubMask;
6176 } else {
6177 int Scale = SubMask.size() / NumSubElts;
6178 NumSubElts = SubMask.size();
6179 NumElts *= Scale;
6180 InsertIdx *= Scale;
6181 }
6182 }
6183 Ops.push_back(Src);
6184 Ops.append(SubInputs.begin(), SubInputs.end());
6185 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6186 Mask.append(NumElts, SM_SentinelZero);
6187 else
6188 for (int i = 0; i != (int)NumElts; ++i)
6189 Mask.push_back(i);
6190 for (int i = 0; i != (int)NumSubElts; ++i) {
6191 int M = SubMask[i];
6192 if (0 <= M) {
6193 int InputIdx = M / NumSubElts;
6194 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6195 }
6196 Mask[i + InsertIdx] = M;
6197 }
6198 return true;
6199 }
6200 case X86ISD::PINSRB:
6201 case X86ISD::PINSRW:
6202 case ISD::SCALAR_TO_VECTOR:
6203 case ISD::INSERT_VECTOR_ELT: {
6204 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6205 // vector, for matching src/dst vector types.
6206 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6207
6208 unsigned DstIdx = 0;
6209 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6210 // Check we have an in-range constant insertion index.
6211 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6212 N.getConstantOperandAPInt(2).uge(NumElts))
6213 return false;
6214 DstIdx = N.getConstantOperandVal(2);
6215
6216 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6217 if (X86::isZeroNode(Scl)) {
6218 Ops.push_back(N.getOperand(0));
6219 for (unsigned i = 0; i != NumElts; ++i)
6220 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6221 return true;
6222 }
6223 }
6224
6225 // Peek through trunc/aext/zext/bitcast.
6226 // TODO: aext shouldn't require SM_SentinelZero padding.
6227 // TODO: handle shift of scalars.
6228 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6229 while (Scl.getOpcode() == ISD::TRUNCATE ||
6230 Scl.getOpcode() == ISD::ANY_EXTEND ||
6231 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6232 (Scl.getOpcode() == ISD::BITCAST &&
6235 Scl = Scl.getOperand(0);
6236 MinBitsPerElt =
6237 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6238 }
6239 if ((MinBitsPerElt % 8) != 0)
6240 return false;
6241
6242 // Attempt to find the source vector the scalar was extracted from.
6243 SDValue SrcExtract;
6244 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6245 Scl.getOpcode() == X86ISD::PEXTRW ||
6246 Scl.getOpcode() == X86ISD::PEXTRB) &&
6247 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6248 SrcExtract = Scl;
6249 }
6250 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6251 return false;
6252
6253 SDValue SrcVec = SrcExtract.getOperand(0);
6254 EVT SrcVT = SrcVec.getValueType();
6255 if (!SrcVT.getScalarType().isByteSized())
6256 return false;
6257 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6258 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6259 unsigned DstByte = DstIdx * NumBytesPerElt;
6260 MinBitsPerElt =
6261 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6262
6263 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6264 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6265 Ops.push_back(SrcVec);
6266 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6267 } else {
6268 Ops.push_back(SrcVec);
6269 Ops.push_back(N.getOperand(0));
6270 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6271 Mask.push_back(NumSizeInBytes + i);
6272 }
6273
6274 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6275 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6276 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6277 Mask[DstByte + i] = SrcByte + i;
6278 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6279 Mask[DstByte + i] = SM_SentinelZero;
6280 return true;
6281 }
6282 case X86ISD::PACKSS:
6283 case X86ISD::PACKUS: {
6284 SDValue N0 = N.getOperand(0);
6285 SDValue N1 = N.getOperand(1);
6286 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6287 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6288 "Unexpected input value type");
6289
6290 APInt EltsLHS, EltsRHS;
6291 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6292
6293 // If we know input saturation won't happen (or we don't care for particular
6294 // lanes), we can treat this as a truncation shuffle.
6295 bool Offset0 = false, Offset1 = false;
6296 if (Opcode == X86ISD::PACKSS) {
6297 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6298 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6299 (!(N1.isUndef() || EltsRHS.isZero()) &&
6300 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6301 return false;
6302 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6303 // PACKSS then it was likely being used for sign-extension for a
6304 // truncation, so just peek through and adjust the mask accordingly.
6305 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6306 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6307 Offset0 = true;
6308 N0 = N0.getOperand(0);
6309 }
6310 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6311 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6312 Offset1 = true;
6313 N1 = N1.getOperand(0);
6314 }
6315 } else {
6316 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6317 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6318 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6319 (!(N1.isUndef() || EltsRHS.isZero()) &&
6320 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6321 return false;
6322 }
6323
6324 bool IsUnary = (N0 == N1);
6325
6326 Ops.push_back(N0);
6327 if (!IsUnary)
6328 Ops.push_back(N1);
6329
6330 createPackShuffleMask(VT, Mask, IsUnary);
6331
6332 if (Offset0 || Offset1) {
6333 for (int &M : Mask)
6334 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6335 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6336 ++M;
6337 }
6338 return true;
6339 }
6340 case ISD::VSELECT:
6341 case X86ISD::BLENDV: {
6342 SDValue Cond = N.getOperand(0);
6343 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6344 Ops.push_back(N.getOperand(1));
6345 Ops.push_back(N.getOperand(2));
6346 return true;
6347 }
6348 return false;
6349 }
6350 case X86ISD::VTRUNC: {
6351 SDValue Src = N.getOperand(0);
6352 EVT SrcVT = Src.getValueType();
6353 // Truncated source must be a simple vector.
6354 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6355 (SrcVT.getScalarSizeInBits() % 8) != 0)
6356 return false;
6357 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6358 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6359 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6360 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6361 for (unsigned i = 0; i != NumSrcElts; ++i)
6362 Mask.push_back(i * Scale);
6363 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6364 Ops.push_back(Src);
6365 return true;
6366 }
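// Illustrative X86ISD::VTRUNC example: a v4i32 result truncated from a v2i64
// source decodes to the mask {0, 2, Z, Z}, i.e. the low 32-bit half of each
// 64-bit source element followed by zero padding.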
6367 case ISD::SHL:
6368 case ISD::SRL: {
6369 // We can only decode 'whole byte' bit shifts as shuffles.
6370 std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
6371 if (!Amt || (*Amt % 8) != 0)
6372 return false;
6373
6374 uint64_t ByteShift = *Amt / 8;
6375 Ops.push_back(N.getOperand(0));
6376
6377 // Clear mask to all zeros and insert the shifted byte indices.
6378 Mask.append(NumSizeInBytes, SM_SentinelZero);
6379
6380 if (ISD::SHL == Opcode) {
6381 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6382 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6383 Mask[i + j] = i + j - ByteShift;
6384 } else {
6385 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6386 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6387 Mask[i + j - ByteShift] = i + j;
6388 }
6389 return true;
6390 }
6391 case X86ISD::VSHLI:
6392 case X86ISD::VSRLI: {
6393 uint64_t ShiftVal = N.getConstantOperandVal(1);
6394 // Out of range bit shifts are guaranteed to be zero.
6395 if (NumBitsPerElt <= ShiftVal) {
6396 Mask.append(NumElts, SM_SentinelZero);
6397 return true;
6398 }
6399
6400 // We can only decode 'whole byte' bit shifts as shuffles.
6401 if ((ShiftVal % 8) != 0)
6402 break;
6403
6404 uint64_t ByteShift = ShiftVal / 8;
6405 Ops.push_back(N.getOperand(0));
6406
6407 // Clear mask to all zeros and insert the shifted byte indices.
6408 Mask.append(NumSizeInBytes, SM_SentinelZero);
6409
6410 if (X86ISD::VSHLI == Opcode) {
6411 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6412 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6413 Mask[i + j] = i + j - ByteShift;
6414 } else {
6415 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6416 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6417 Mask[i + j - ByteShift] = i + j;
6418 }
6419 return true;
6420 }
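// Illustrative X86ISD::VSHLI example: a v2i64 shift left by 8 bits decodes to
// the per-byte mask {Z, 0, 1, 2, 3, 4, 5, 6, Z, 8, 9, 10, 11, 12, 13, 14},
// i.e. each 64-bit lane moves up by one byte with a zero byte shifted in.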
6421 case X86ISD::VROTLI:
6422 case X86ISD::VROTRI: {
6423 // We can only decode 'whole byte' bit rotates as shuffles.
6424 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6425 if ((RotateVal % 8) != 0)
6426 return false;
6427 Ops.push_back(N.getOperand(0));
6428 int Offset = RotateVal / 8;
6429 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6430 for (int i = 0; i != (int)NumElts; ++i) {
6431 int BaseIdx = i * NumBytesPerElt;
6432 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6433 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6434 }
6435 }
6436 return true;
6437 }
6438 case X86ISD::VBROADCAST: {
6439 SDValue Src = N.getOperand(0);
6440 if (!Src.getSimpleValueType().isVector()) {
6441 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6442 !isNullConstant(Src.getOperand(1)) ||
6443 Src.getOperand(0).getValueType().getScalarType() !=
6444 VT.getScalarType())
6445 return false;
6446 Src = Src.getOperand(0);
6447 }
6448 Ops.push_back(Src);
6449 Mask.append(NumElts, 0);
6450 return true;
6451 }
6452 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6453 SDValue Src = N.getOperand(0);
6454 EVT SrcVT = Src.getValueType();
6455 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6456
6457 // Extended source must be a simple vector.
6458 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6459 (NumBitsPerSrcElt % 8) != 0)
6460 return false;
6461
6462 // We can only handle all-signbits extensions.
6463 APInt DemandedSrcElts =
6464 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6465 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6466 return false;
6467
6468 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6469 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6470 for (unsigned I = 0; I != NumElts; ++I)
6471 Mask.append(Scale, I);
6472 Ops.push_back(Src);
6473 return true;
6474 }
6475 case ISD::ZERO_EXTEND:
6476 case ISD::ANY_EXTEND:
6477 case ISD::ZERO_EXTEND_VECTOR_INREG:
6478 case ISD::ANY_EXTEND_VECTOR_INREG: {
6479 SDValue Src = N.getOperand(0);
6480 EVT SrcVT = Src.getValueType();
6481
6482 // Extended source must be a simple vector.
6483 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6484 (SrcVT.getScalarSizeInBits() % 8) != 0)
6485 return false;
6486
6487 bool IsAnyExtend =
6488 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6489 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6490 IsAnyExtend, Mask);
6491 Ops.push_back(Src);
6492 return true;
6493 }
6494 }
6495
6496 return false;
6497}
6498
6499/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6500 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6501 SmallVectorImpl<int> &Mask) {
6502 int MaskWidth = Mask.size();
6503 SmallVector<SDValue, 16> UsedInputs;
6504 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6505 int lo = UsedInputs.size() * MaskWidth;
6506 int hi = lo + MaskWidth;
6507
6508 // Strip UNDEF input usage.
6509 if (Inputs[i].isUndef())
6510 for (int &M : Mask)
6511 if ((lo <= M) && (M < hi))
6512 M = SM_SentinelUndef;
6513
6514 // Check for unused inputs.
6515 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6516 for (int &M : Mask)
6517 if (lo <= M)
6518 M -= MaskWidth;
6519 continue;
6520 }
6521
6522 // Check for repeated inputs.
6523 bool IsRepeat = false;
6524 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6525 if (UsedInputs[j] != Inputs[i])
6526 continue;
6527 for (int &M : Mask)
6528 if (lo <= M)
6529 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6530 IsRepeat = true;
6531 break;
6532 }
6533 if (IsRepeat)
6534 continue;
6535
6536 UsedInputs.push_back(Inputs[i]);
6537 }
6538 Inputs = UsedInputs;
6539}
6540
6541/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6542/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6543/// Returns true if the target shuffle mask was decoded.
6544static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6545 SmallVectorImpl<SDValue> &Inputs,
6546 SmallVectorImpl<int> &Mask,
6547 APInt &KnownUndef, APInt &KnownZero,
6548 const SelectionDAG &DAG, unsigned Depth,
6549 bool ResolveKnownElts) {
6550 if (Depth >= SelectionDAG::MaxRecursionDepth)
6551 return false; // Limit search depth.
6552
6553 EVT VT = Op.getValueType();
6554 if (!VT.isSimple() || !VT.isVector())
6555 return false;
6556
6557 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6558 if (ResolveKnownElts)
6559 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6560 return true;
6561 }
6562 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6563 ResolveKnownElts)) {
6564 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6565 return true;
6566 }
6567 return false;
6568}
6569
6570static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6571 SmallVectorImpl<SDValue> &Inputs,
6572 SmallVectorImpl<int> &Mask,
6573 const SelectionDAG &DAG, unsigned Depth,
6574 bool ResolveKnownElts) {
6575 APInt KnownUndef, KnownZero;
6576 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6577 KnownZero, DAG, Depth, ResolveKnownElts);
6578}
6579
6580 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6581 SmallVectorImpl<int> &Mask,
6582 const SelectionDAG &DAG, unsigned Depth = 0,
6583 bool ResolveKnownElts = true) {
6584 EVT VT = Op.getValueType();
6585 if (!VT.isSimple() || !VT.isVector())
6586 return false;
6587
6588 unsigned NumElts = Op.getValueType().getVectorNumElements();
6589 APInt DemandedElts = APInt::getAllOnes(NumElts);
6590 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6591 ResolveKnownElts);
6592}
6593
6594// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6595static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6596 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6597 SelectionDAG &DAG) {
6598 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6599 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6600 "Unknown broadcast load type");
6601
6602 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6603 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6604 return SDValue();
6605
6606 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6607 TypeSize::getFixed(Offset), DL);
6608 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6609 SDValue Ops[] = {Mem->getChain(), Ptr};
6610 SDValue BcstLd = DAG.getMemIntrinsicNode(
6611 Opcode, DL, Tys, Ops, MemVT,
6612 DAG.getMachineFunction().getMachineMemOperand(
6613 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6614 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6615 return BcstLd;
6616}
6617
6618/// Returns the scalar element that will make up the i'th
6619/// element of the result of the vector shuffle.
6620static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6621 SelectionDAG &DAG, unsigned Depth) {
6622 if (Depth >= SelectionDAG::MaxRecursionDepth)
6623 return SDValue(); // Limit search depth.
6624
6625 EVT VT = Op.getValueType();
6626 unsigned Opcode = Op.getOpcode();
6627 unsigned NumElems = VT.getVectorNumElements();
6628
6629 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6630 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6631 int Elt = SV->getMaskElt(Index);
6632
6633 if (Elt < 0)
6634 return DAG.getUNDEF(VT.getVectorElementType());
6635
6636 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6637 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6638 }
6639
6640 // Recurse into target specific vector shuffles to find scalars.
6641 if (isTargetShuffle(Opcode)) {
6642 MVT ShufVT = VT.getSimpleVT();
6643 MVT ShufSVT = ShufVT.getVectorElementType();
6644 int NumElems = (int)ShufVT.getVectorNumElements();
6645 SmallVector<int, 16> ShuffleMask;
6646 SmallVector<SDValue, 16> ShuffleOps;
6647 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6648 return SDValue();
6649
6650 int Elt = ShuffleMask[Index];
6651 if (Elt == SM_SentinelZero)
6652 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6653 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6654 if (Elt == SM_SentinelUndef)
6655 return DAG.getUNDEF(ShufSVT);
6656
6657 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6658 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6659 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6660 }
6661
6662 // Recurse into insert_subvector base/sub vector to find scalars.
6663 if (Opcode == ISD::INSERT_SUBVECTOR) {
6664 SDValue Vec = Op.getOperand(0);
6665 SDValue Sub = Op.getOperand(1);
6666 uint64_t SubIdx = Op.getConstantOperandVal(2);
6667 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6668
6669 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6670 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6671 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6672 }
6673
6674 // Recurse into concat_vectors sub vector to find scalars.
6675 if (Opcode == ISD::CONCAT_VECTORS) {
6676 EVT SubVT = Op.getOperand(0).getValueType();
6677 unsigned NumSubElts = SubVT.getVectorNumElements();
6678 uint64_t SubIdx = Index / NumSubElts;
6679 uint64_t SubElt = Index % NumSubElts;
6680 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6681 }
6682
6683 // Recurse into extract_subvector src vector to find scalars.
6684 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6685 SDValue Src = Op.getOperand(0);
6686 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6687 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6688 }
6689
6690 // We only peek through bitcasts of the same vector width.
6691 if (Opcode == ISD::BITCAST) {
6692 SDValue Src = Op.getOperand(0);
6693 EVT SrcVT = Src.getValueType();
6694 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6695 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6696 return SDValue();
6697 }
6698
6699 // Actual nodes that may contain scalar elements
6700
6701 // For insert_vector_elt - either return the index matching scalar or recurse
6702 // into the base vector.
6703 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6704 isa<ConstantSDNode>(Op.getOperand(2))) {
6705 if (Op.getConstantOperandAPInt(2) == Index)
6706 return Op.getOperand(1);
6707 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6708 }
6709
6710 if (Opcode == ISD::SCALAR_TO_VECTOR)
6711 return (Index == 0) ? Op.getOperand(0)
6712 : DAG.getUNDEF(VT.getVectorElementType());
6713
6714 if (Opcode == ISD::BUILD_VECTOR)
6715 return Op.getOperand(Index);
6716
6717 return SDValue();
6718}
6719
6720// Use PINSRB/PINSRW/PINSRD to create a build vector.
6721 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6722 const APInt &NonZeroMask,
6723 unsigned NumNonZero, unsigned NumZero,
6724 SelectionDAG &DAG,
6725 const X86Subtarget &Subtarget) {
6726 MVT VT = Op.getSimpleValueType();
6727 unsigned NumElts = VT.getVectorNumElements();
6728 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6729 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6730 "Illegal vector insertion");
6731
6732 SDValue V;
6733 bool First = true;
6734
6735 for (unsigned i = 0; i < NumElts; ++i) {
6736 bool IsNonZero = NonZeroMask[i];
6737 if (!IsNonZero)
6738 continue;
6739
6740 // If the build vector contains zeros or our first insertion is not at the
6741 // first index, then insert into a zero vector to break any register
6742 // dependency; otherwise use SCALAR_TO_VECTOR.
6743 if (First) {
6744 First = false;
6745 if (NumZero || 0 != i)
6746 V = getZeroVector(VT, Subtarget, DAG, DL);
6747 else {
6748 assert(0 == i && "Expected insertion into zero-index");
6749 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6750 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6751 V = DAG.getBitcast(VT, V);
6752 continue;
6753 }
6754 }
6755 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6756 DAG.getVectorIdxConstant(i, DL));
6757 }
6758
6759 return V;
6760}
6761
6762/// Custom lower build_vector of v16i8.
6763 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6764 const APInt &NonZeroMask,
6765 unsigned NumNonZero, unsigned NumZero,
6766 SelectionDAG &DAG,
6767 const X86Subtarget &Subtarget) {
6768 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6769 return SDValue();
6770
6771 // SSE4.1 - use PINSRB to insert each byte directly.
6772 if (Subtarget.hasSSE41())
6773 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6774 DAG, Subtarget);
6775
6776 SDValue V;
6777
6778 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6779 // If both of the lowest two 16-bit halves are non-zero, then convert to MOVD.
6780 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6781 !NonZeroMask.extractBits(2, 2).isZero()) {
6782 for (unsigned I = 0; I != 4; ++I) {
6783 if (!NonZeroMask[I])
6784 continue;
6785 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6786 if (I != 0)
6787 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6788 DAG.getConstant(I * 8, DL, MVT::i8));
6789 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6790 }
6791 assert(V && "Failed to fold v16i8 vector to zero");
6792 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6793 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6794 V = DAG.getBitcast(MVT::v8i16, V);
6795 }
6796 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6797 bool ThisIsNonZero = NonZeroMask[i];
6798 bool NextIsNonZero = NonZeroMask[i + 1];
6799 if (!ThisIsNonZero && !NextIsNonZero)
6800 continue;
6801
6802 SDValue Elt;
6803 if (ThisIsNonZero) {
6804 if (NumZero || NextIsNonZero)
6805 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6806 else
6807 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6808 }
6809
6810 if (NextIsNonZero) {
6811 SDValue NextElt = Op.getOperand(i + 1);
6812 if (i == 0 && NumZero)
6813 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6814 else
6815 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6816 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6817 DAG.getConstant(8, DL, MVT::i8));
6818 if (ThisIsNonZero)
6819 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6820 else
6821 Elt = NextElt;
6822 }
6823
6824 // If our first insertion is not the first index or zeros are needed, then
6825 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6826 // elements undefined).
6827 if (!V) {
6828 if (i != 0 || NumZero)
6829 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6830 else {
6831 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6832 V = DAG.getBitcast(MVT::v8i16, V);
6833 continue;
6834 }
6835 }
6836 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6837 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6838 DAG.getVectorIdxConstant(i / 2, DL));
6839 }
6840
6841 return DAG.getBitcast(MVT::v16i8, V);
6842}
6843
6844/// Custom lower build_vector of v8i16.
6845 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6846 const APInt &NonZeroMask,
6847 unsigned NumNonZero, unsigned NumZero,
6848 SelectionDAG &DAG,
6849 const X86Subtarget &Subtarget) {
6850 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6851 return SDValue();
6852
6853 // Use PINSRW to insert each 16-bit element directly.
6854 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6855 Subtarget);
6856}
6857
6858/// Custom lower build_vector of v4i32 or v4f32.
6859 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6860 SelectionDAG &DAG,
6861 const X86Subtarget &Subtarget) {
6862 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6863 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6864 // Because we're creating a less complicated build vector here, we may enable
6865 // further folding of the MOVDDUP via shuffle transforms.
6866 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6867 Op.getOperand(0) == Op.getOperand(2) &&
6868 Op.getOperand(1) == Op.getOperand(3) &&
6869 Op.getOperand(0) != Op.getOperand(1)) {
6870 MVT VT = Op.getSimpleValueType();
6871 MVT EltVT = VT.getVectorElementType();
6872 // Create a new build vector with the first 2 elements followed by undef
6873 // padding, bitcast to v2f64, duplicate, and bitcast back.
6874 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6875 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6876 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6877 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6878 return DAG.getBitcast(VT, Dup);
6879 }
6880
6881 // Find all zeroable elements.
6882 std::bitset<4> Zeroable, Undefs;
6883 for (int i = 0; i < 4; ++i) {
6884 SDValue Elt = Op.getOperand(i);
6885 Undefs[i] = Elt.isUndef();
6886 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6887 }
6888 assert(Zeroable.size() - Zeroable.count() > 1 &&
6889 "We expect at least two non-zero elements!");
6890
6891 // We only know how to deal with build_vector nodes where elements are either
6892 // zeroable or extract_vector_elt with constant index.
6893 SDValue FirstNonZero;
6894 unsigned FirstNonZeroIdx;
6895 for (unsigned i = 0; i < 4; ++i) {
6896 if (Zeroable[i])
6897 continue;
6898 SDValue Elt = Op.getOperand(i);
6899 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6900 !isa<ConstantSDNode>(Elt.getOperand(1)))
6901 return SDValue();
6902 // Make sure that this node is extracting from a 128-bit vector.
6903 MVT VT = Elt.getOperand(0).getSimpleValueType();
6904 if (!VT.is128BitVector())
6905 return SDValue();
6906 if (!FirstNonZero.getNode()) {
6907 FirstNonZero = Elt;
6908 FirstNonZeroIdx = i;
6909 }
6910 }
6911
6912 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6913 SDValue V1 = FirstNonZero.getOperand(0);
6914 MVT VT = V1.getSimpleValueType();
6915
6916 // See if this build_vector can be lowered as a blend with zero.
6917 SDValue Elt;
6918 unsigned EltMaskIdx, EltIdx;
6919 int Mask[4];
6920 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6921 if (Zeroable[EltIdx]) {
6922 // The zero vector will be on the right hand side.
6923 Mask[EltIdx] = EltIdx+4;
6924 continue;
6925 }
6926
6927 Elt = Op->getOperand(EltIdx);
6928 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6929 EltMaskIdx = Elt.getConstantOperandVal(1);
6930 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6931 break;
6932 Mask[EltIdx] = EltIdx;
6933 }
6934
6935 if (EltIdx == 4) {
6936 // Let the shuffle legalizer deal with blend operations.
6937 SDValue VZeroOrUndef = (Zeroable == Undefs)
6938 ? DAG.getUNDEF(VT)
6939 : getZeroVector(VT, Subtarget, DAG, DL);
6940 if (V1.getSimpleValueType() != VT)
6941 V1 = DAG.getBitcast(VT, V1);
6942 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6943 }
6944
6945 // See if we can lower this build_vector to a INSERTPS.
6946 if (!Subtarget.hasSSE41())
6947 return SDValue();
6948
6949 SDValue V2 = Elt.getOperand(0);
6950 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6951 V1 = SDValue();
6952
6953 bool CanFold = true;
6954 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6955 if (Zeroable[i])
6956 continue;
6957
6958 SDValue Current = Op->getOperand(i);
6959 SDValue SrcVector = Current->getOperand(0);
6960 if (!V1.getNode())
6961 V1 = SrcVector;
6962 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6963 }
6964
6965 if (!CanFold)
6966 return SDValue();
6967
6968 assert(V1.getNode() && "Expected at least two non-zero elements!");
6969 if (V1.getSimpleValueType() != MVT::v4f32)
6970 V1 = DAG.getBitcast(MVT::v4f32, V1);
6971 if (V2.getSimpleValueType() != MVT::v4f32)
6972 V2 = DAG.getBitcast(MVT::v4f32, V2);
6973
6974 // Ok, we can emit an INSERTPS instruction.
6975 unsigned ZMask = Zeroable.to_ulong();
6976
6977 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6978 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6979 SDValue Result =
6980 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6981 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
6982 return DAG.getBitcast(VT, Result);
6983}
6984
6985/// Return a vector logical shift node.
6986static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6987 SelectionDAG &DAG, const TargetLowering &TLI,
6988 const SDLoc &dl) {
6989 assert(VT.is128BitVector() && "Unknown type for VShift");
6990 MVT ShVT = MVT::v16i8;
6991 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6992 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6993 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6994 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6995 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6996}
6997
6998 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6999 SelectionDAG &DAG) {
7000
7001 // Check if the scalar load can be widened into a vector load. And if
7002 // the address is "base + cst" see if the cst can be "absorbed" into
7003 // the shuffle mask.
7004 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7005 SDValue Ptr = LD->getBasePtr();
7006 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7007 return SDValue();
7008 EVT PVT = LD->getValueType(0);
7009 if (PVT != MVT::i32 && PVT != MVT::f32)
7010 return SDValue();
7011
7012 int FI = -1;
7013 int64_t Offset = 0;
7014 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7015 FI = FINode->getIndex();
7016 Offset = 0;
7017 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7018 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7019 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7020 Offset = Ptr.getConstantOperandVal(1);
7021 Ptr = Ptr.getOperand(0);
7022 } else {
7023 return SDValue();
7024 }
7025
7026 // FIXME: 256-bit vector instructions don't require a strict alignment,
7027 // improve this code to support it better.
7028 Align RequiredAlign(VT.getSizeInBits() / 8);
7029 SDValue Chain = LD->getChain();
7030 // Make sure the stack object alignment is at least 16 or 32.
7031 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7032 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7033 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7034 if (MFI.isFixedObjectIndex(FI)) {
7035 // Can't change the alignment. FIXME: It's possible to compute
7036 // the exact stack offset and reference FI + adjust offset instead.
7037 // If someone *really* cares about this. That's the way to implement it.
7038 return SDValue();
7039 } else {
7040 MFI.setObjectAlignment(FI, RequiredAlign);
7041 }
7042 }
7043
7044 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7045 // Ptr + (Offset & ~15).
7046 if (Offset < 0)
7047 return SDValue();
7048 if ((Offset % RequiredAlign.value()) & 3)
7049 return SDValue();
7050 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7051 if (StartOffset) {
7052 SDLoc DL(Ptr);
7053 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7054 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7055 }
7056
7057 int EltNo = (Offset - StartOffset) >> 2;
7058 unsigned NumElems = VT.getVectorNumElements();
7059
7060 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7061 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7062 LD->getPointerInfo().getWithOffset(StartOffset));
7063
7064 SmallVector<int, 8> Mask(NumElems, EltNo);
7065
7066 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7067 }
7068
7069 return SDValue();
7070}
7071
7072// Recurse to find a LoadSDNode source and the accumulated ByteOffest.
7073static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7074 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7075 auto *BaseLd = cast<LoadSDNode>(Elt);
7076 if (!BaseLd->isSimple())
7077 return false;
7078 Ld = BaseLd;
7079 ByteOffset = 0;
7080 return true;
7081 }
7082
7083 switch (Elt.getOpcode()) {
7084 case ISD::BITCAST:
7085 case ISD::TRUNCATE:
7086 case ISD::SCALAR_TO_VECTOR:
7087 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7088 case ISD::SRL:
7089 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7090 uint64_t Amt = AmtC->getZExtValue();
7091 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7092 ByteOffset += Amt / 8;
7093 return true;
7094 }
7095 }
7096 break;
7097 case ISD::EXTRACT_VECTOR_ELT:
7098 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7099 SDValue Src = Elt.getOperand(0);
7100 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7101 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7102 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7103 findEltLoadSrc(Src, Ld, ByteOffset)) {
7104 uint64_t Idx = IdxC->getZExtValue();
7105 ByteOffset += Idx * (SrcSizeInBits / 8);
7106 return true;
7107 }
7108 }
7109 break;
7110 }
7111
7112 return false;
7113}
7114
7115/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7116/// elements can be replaced by a single large load which has the same value as
7117/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7118///
7119/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7120 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7121 const SDLoc &DL, SelectionDAG &DAG,
7122 const X86Subtarget &Subtarget,
7123 bool IsAfterLegalize) {
7124 if ((VT.getScalarSizeInBits() % 8) != 0)
7125 return SDValue();
7126
7127 unsigned NumElems = Elts.size();
7128
7129 int LastLoadedElt = -1;
7130 APInt LoadMask = APInt::getZero(NumElems);
7131 APInt ZeroMask = APInt::getZero(NumElems);
7132 APInt UndefMask = APInt::getZero(NumElems);
7133
7134 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7135 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7136
7137 // For each element in the initializer, see if we've found a load, zero or an
7138 // undef.
7139 for (unsigned i = 0; i < NumElems; ++i) {
7140 SDValue Elt = peekThroughBitcasts(Elts[i]);
7141 if (!Elt.getNode())
7142 return SDValue();
7143 if (Elt.isUndef()) {
7144 UndefMask.setBit(i);
7145 continue;
7146 }
7147 if (X86::isZeroNode(Elt)) {
7148 ZeroMask.setBit(i);
7149 continue;
7150 }
7151
7152 // Each loaded element must be the correct fractional portion of the
7153 // requested vector load.
7154 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7155 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7156 return SDValue();
7157
7158 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7159 return SDValue();
7160 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7161 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7162 return SDValue();
7163
7164 LoadMask.setBit(i);
7165 LastLoadedElt = i;
7166 }
7167 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7168 NumElems &&
7169 "Incomplete element masks");
7170
7171 // Handle Special Cases - all undef or undef/zero.
7172 if (UndefMask.popcount() == NumElems)
7173 return DAG.getUNDEF(VT);
7174 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7175 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7176 : DAG.getConstantFP(0.0, DL, VT);
7177
7178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7179 int FirstLoadedElt = LoadMask.countr_zero();
7180 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7181 EVT EltBaseVT = EltBase.getValueType();
7182 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7183 "Register/Memory size mismatch");
7184 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7185 assert(LDBase && "Did not find base load for merging consecutive loads");
7186 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7187 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7188 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7189 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7190 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7191
7192 // TODO: Support offsetting the base load.
7193 if (ByteOffsets[FirstLoadedElt] != 0)
7194 return SDValue();
7195
7196 // Check to see if the element's load is consecutive to the base load
7197 // or offset from a previous (already checked) load.
7198 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7199 LoadSDNode *Ld = Loads[EltIdx];
7200 int64_t ByteOffset = ByteOffsets[EltIdx];
7201 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7202 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7203 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7204 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7205 }
7206 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7207 EltIdx - FirstLoadedElt);
7208 };
7209
7210 // Consecutive loads can contain UNDEFs but not ZERO elements.
7211 // Consecutive loads with UNDEF and ZERO elements require an additional
7212 // shuffle stage to clear the ZERO elements.
7213 bool IsConsecutiveLoad = true;
7214 bool IsConsecutiveLoadWithZeros = true;
7215 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7216 if (LoadMask[i]) {
7217 if (!CheckConsecutiveLoad(LDBase, i)) {
7218 IsConsecutiveLoad = false;
7219 IsConsecutiveLoadWithZeros = false;
7220 break;
7221 }
7222 } else if (ZeroMask[i]) {
7223 IsConsecutiveLoad = false;
7224 }
7225 }
7226
7227 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7228 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7229 assert(LDBase->isSimple() &&
7230 "Cannot merge volatile or atomic loads.");
7231 SDValue NewLd =
7232 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7233 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7234 MMOFlags);
7235 for (auto *LD : Loads)
7236 if (LD)
7237 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7238 return NewLd;
7239 };
7240
7241 // Check if the base load is entirely dereferenceable.
7242 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7243 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7244
7245 // LOAD - all consecutive load/undefs (must start/end with a load or be
7246 // entirely dereferenceable). If we have found an entire vector of loads and
7247 // undefs, then return a large load of the entire vector width starting at the
7248 // base pointer. If the vector contains zeros, then attempt to shuffle those
7249 // elements.
7250 if (FirstLoadedElt == 0 &&
7251 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7252 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7253 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7254 return SDValue();
7255
7256 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7257 // will lower to regular temporal loads and use the cache.
7258 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7259 VT.is256BitVector() && !Subtarget.hasInt256())
7260 return SDValue();
7261
7262 if (NumElems == 1)
7263 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7264
7265 if (!ZeroMask)
7266 return CreateLoad(VT, LDBase);
7267
7268 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7269 // vector and a zero vector to clear out the zero elements.
7270 if (!IsAfterLegalize && VT.isVector()) {
7271 unsigned NumMaskElts = VT.getVectorNumElements();
7272 if ((NumMaskElts % NumElems) == 0) {
7273 unsigned Scale = NumMaskElts / NumElems;
7274 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7275 for (unsigned i = 0; i < NumElems; ++i) {
7276 if (UndefMask[i])
7277 continue;
7278 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7279 for (unsigned j = 0; j != Scale; ++j)
7280 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7281 }
7282 SDValue V = CreateLoad(VT, LDBase);
7283 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7284 : DAG.getConstantFP(0.0, DL, VT);
7285 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7286 }
7287 }
7288 }
7289
7290 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7291 if (VT.is256BitVector() || VT.is512BitVector()) {
7292 unsigned HalfNumElems = NumElems / 2;
7293 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7294 EVT HalfVT =
7295 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7296 SDValue HalfLD =
7297 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7298 DAG, Subtarget, IsAfterLegalize);
7299 if (HalfLD)
7300 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7301 HalfLD, DAG.getVectorIdxConstant(0, DL));
7302 }
7303 }
7304
7305 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7306 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7307 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7308 LoadSizeInBits == 64) &&
7309 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7310 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7311 : MVT::getIntegerVT(LoadSizeInBits);
7312 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7313 // Allow v4f32 on SSE1 only targets.
7314 // FIXME: Add more isel patterns so we can just use VT directly.
7315 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7316 VecVT = MVT::v4f32;
7317 if (TLI.isTypeLegal(VecVT)) {
7318 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7319 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7320 SDValue ResNode = DAG.getMemIntrinsicNode(
7321 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7322 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7323 for (auto *LD : Loads)
7324 if (LD)
7325 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7326 return DAG.getBitcast(VT, ResNode);
7327 }
7328 }
7329
7330 // BROADCAST - match the smallest possible repetition pattern, load that
7331 // scalar/subvector element and then broadcast to the entire vector.
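  // e.g. <a,b,a,b,a,b,a,b> only needs the <a,b> pair to be loaded; the loop
  // below searches for the smallest such repeating subvector and broadcasts it.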
7332 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7333 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7334 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7335 unsigned RepeatSize = SubElems * BaseSizeInBits;
7336 unsigned ScalarSize = std::min(RepeatSize, 64u);
7337 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7338 continue;
7339
7340 // Don't attempt a 1:N subvector broadcast - it should be caught by
7341 // combineConcatVectorOps, else it will cause infinite loops.
7342 if (RepeatSize > ScalarSize && SubElems == 1)
7343 continue;
7344
7345 bool Match = true;
7346 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7347 for (unsigned i = 0; i != NumElems && Match; ++i) {
7348 if (!LoadMask[i])
7349 continue;
7350 SDValue Elt = peekThroughBitcasts(Elts[i]);
7351 if (RepeatedLoads[i % SubElems].isUndef())
7352 RepeatedLoads[i % SubElems] = Elt;
7353 else
7354 Match &= (RepeatedLoads[i % SubElems] == Elt);
7355 }
7356
7357 // We must have loads at both ends of the repetition.
7358 Match &= !RepeatedLoads.front().isUndef();
7359 Match &= !RepeatedLoads.back().isUndef();
7360 if (!Match)
7361 continue;
7362
7363 EVT RepeatVT =
7364 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7365 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7366 : EVT::getFloatingPointVT(ScalarSize);
7367 if (RepeatSize > ScalarSize)
7368 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7369 RepeatSize / ScalarSize);
7370 EVT BroadcastVT =
7371 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7372 VT.getSizeInBits() / ScalarSize);
7373 if (TLI.isTypeLegal(BroadcastVT)) {
7374 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7375 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7376 SDValue Broadcast = RepeatLoad;
7377 if (RepeatSize > ScalarSize) {
7378 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7379 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7380 } else {
7381 if (!Subtarget.hasAVX2() &&
7382 !X86::mayFoldLoadIntoBroadcastFromMem(
7383 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7384 Subtarget,
7385 /*AssumeSingleUse=*/true))
7386 return SDValue();
7387 Broadcast =
7388 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7389 }
7390 return DAG.getBitcast(VT, Broadcast);
7391 }
7392 }
7393 }
7394 }
7395
7396 return SDValue();
7397}
7398
7399// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7400// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7401// are consecutive, non-overlapping, and in the right order.
7402static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7403 SelectionDAG &DAG,
7404 const X86Subtarget &Subtarget,
7405 bool IsAfterLegalize) {
7406 SmallVector<SDValue, 64> Elts;
7407 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7408 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7409 Elts.push_back(Elt);
7410 continue;
7411 }
7412 return SDValue();
7413 }
7414 assert(Elts.size() == VT.getVectorNumElements());
7415 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7416 IsAfterLegalize);
7417}
7418
7419static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7420 const APInt &Undefs, LLVMContext &C) {
7421 unsigned ScalarSize = VT.getScalarSizeInBits();
7422 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7423
7424 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7425 if (VT.isFloatingPoint()) {
7426 if (ScalarSize == 16)
7427 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7428 if (ScalarSize == 32)
7429 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7430 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7431 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7432 }
7433 return Constant::getIntegerValue(Ty, Val);
7434 };
7435
7436 SmallVector<Constant *, 32> ConstantVec;
7437 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7438 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7439 : getConstantScalar(Bits[I]));
7440
7441 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7442}
7443
7444static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7445 unsigned SplatBitSize, LLVMContext &C) {
7446 unsigned ScalarSize = VT.getScalarSizeInBits();
7447
7448 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7449 if (VT.isFloatingPoint()) {
7450 if (ScalarSize == 16)
7451 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7452 if (ScalarSize == 32)
7453 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7454 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7455 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7456 }
7457 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7458 };
7459
7460 if (ScalarSize == SplatBitSize)
7461 return getConstantScalar(SplatValue);
7462
7463 unsigned NumElm = SplatBitSize / ScalarSize;
7464 SmallVector<Constant *, 32> ConstantVec;
7465 for (unsigned I = 0; I != NumElm; ++I) {
7466 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7467 ConstantVec.push_back(getConstantScalar(Val));
7468 }
7469 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7470}
7471
7472static bool isFoldableUseOfShuffle(SDNode *N) {
7473 for (auto *U : N->users()) {
7474 unsigned Opc = U->getOpcode();
7475 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7476 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7477 return false;
7478 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7479 return false;
7480 if (isTargetShuffle(Opc))
7481 return true;
7482 if (Opc == ISD::BITCAST) // Ignore bitcasts
7483 return isFoldableUseOfShuffle(U);
7484 if (N->hasOneUse()) {
7485 // TODO: there may be some general way to know if an SDNode can
7486 // be folded. We now only know whether an MI is foldable.
7487 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7488 return false;
7489 return true;
7490 }
7491 }
7492 return false;
7493}
7494
7495/// Attempt to use the vbroadcast instruction to generate a splat value
7496/// from a splat BUILD_VECTOR which uses:
7497/// a. A single scalar load, or a constant.
7498/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7499///
7500/// The VBROADCAST node is returned when a pattern is found,
7501/// or SDValue() otherwise.
7502static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7503 const SDLoc &dl,
7504 const X86Subtarget &Subtarget,
7505 SelectionDAG &DAG) {
7506 // VBROADCAST requires AVX.
7507 // TODO: Splats could be generated for non-AVX CPUs using SSE
7508 // instructions, but there's less potential gain for only 128-bit vectors.
7509 if (!Subtarget.hasAVX())
7510 return SDValue();
7511
7512 MVT VT = BVOp->getSimpleValueType(0);
7513 unsigned NumElts = VT.getVectorNumElements();
7514 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7515 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7516 "Unsupported vector type for broadcast.");
7517
7518 // See if the build vector is a repeating sequence of scalars (inc. splat).
7519 SDValue Ld;
7520 BitVector UndefElements;
7521 SmallVector<SDValue, 16> Sequence;
7522 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7523 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7524 if (Sequence.size() == 1)
7525 Ld = Sequence[0];
7526 }
7527
7528 // Attempt to use VBROADCASTM
7529 // From this pattern:
7530 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7531 // b. t1 = (build_vector t0 t0)
7532 //
7533 // Create (VBROADCASTM v2i1 X)
7534 if (!Sequence.empty() && Subtarget.hasCDI()) {
7535 // If not a splat, are the upper sequence values zeroable?
7536 unsigned SeqLen = Sequence.size();
7537 bool UpperZeroOrUndef =
7538 SeqLen == 1 ||
7539 llvm::all_of(ArrayRef(Sequence).drop_front(),
7540 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7541 SDValue Op0 = Sequence[0];
7542 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7543 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7544 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7545 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7546 ? Op0.getOperand(0)
7547 : Op0.getOperand(0).getOperand(0);
7548 MVT MaskVT = BOperand.getSimpleValueType();
7549 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7550 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7551 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7552 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7553 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7554 unsigned Scale = 512 / VT.getSizeInBits();
7555 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7556 }
7557 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7558 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7559 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7560 return DAG.getBitcast(VT, Bcst);
7561 }
7562 }
7563 }
7564
7565 unsigned NumUndefElts = UndefElements.count();
7566 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7567 APInt SplatValue, Undef;
7568 unsigned SplatBitSize;
7569 bool HasUndef;
7570 // Check if this is a repeated constant pattern suitable for broadcasting.
7571 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7572 SplatBitSize > VT.getScalarSizeInBits() &&
7573 SplatBitSize < VT.getSizeInBits()) {
7574 // Avoid replacing with broadcast when it's a use of a shuffle
7575 // instruction to preserve the present custom lowering of shuffles.
7576 if (isFoldableUseOfShuffle(BVOp))
7577 return SDValue();
7578 // replace BUILD_VECTOR with broadcast of the repeated constants.
7579 LLVMContext *Ctx = DAG.getContext();
7580 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7581 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7582 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7583 // Load the constant scalar/subvector and broadcast it.
7584 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7585 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7586 SDValue CP = DAG.getConstantPool(C, PVT);
7587 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7588
7589 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7590 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7591 SDValue Ops[] = {DAG.getEntryNode(), CP};
7592 MachinePointerInfo MPI =
7593 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7594 SDValue Brdcst =
7595 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7596 MPI, Alignment, MachineMemOperand::MOLoad);
7597 return DAG.getBitcast(VT, Brdcst);
7598 }
7599 if (SplatBitSize > 64) {
7600 // Load the vector of constants and broadcast it.
7601 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7602 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7603 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7604 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7605 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7606 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7607 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7608 MachinePointerInfo MPI =
7609 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7610 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7611 Ops, VVT, MPI, Alignment,
7612 MachineMemOperand::MOLoad);
7613 }
7614 }
7615
7616 // If we are moving a scalar into a vector (Ld must be set and all elements
7617 // but 1 are undef) and that operation is not obviously supported by
7618 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7619 // That's better than general shuffling and may eliminate a load to GPR and
7620 // move from scalar to vector register.
7621 if (!Ld || NumElts - NumUndefElts != 1)
7622 return SDValue();
7623 unsigned ScalarSize = Ld.getValueSizeInBits();
7624 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7625 return SDValue();
7626 }
7627
7628 bool ConstSplatVal =
7629 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7630 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7631
7632 // TODO: Handle broadcasts of non-constant sequences.
7633
7634 // Make sure that all of the users of a non-constant load are from the
7635 // BUILD_VECTOR node.
7636 // FIXME: Is the use count needed for non-constant, non-load case?
7637 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7638 return SDValue();
7639
7640 unsigned ScalarSize = Ld.getValueSizeInBits();
7641 bool IsGE256 = (VT.getSizeInBits() >= 256);
7642
7643 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7644 // instruction to save 8 or more bytes of constant pool data.
7645 // TODO: If multiple splats are generated to load the same constant,
7646 // it may be detrimental to overall size. There needs to be a way to detect
7647 // that condition to know if this is truly a size win.
7648 bool OptForSize = DAG.shouldOptForSize();
7649
7650 // Handle broadcasting a single constant scalar from the constant pool
7651 // into a vector.
7652 // On Sandybridge (no AVX2), it is still better to load a constant vector
7653 // from the constant pool and not to broadcast it from a scalar.
7654 // But override that restriction when optimizing for size.
7655 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7656 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7657 EVT CVT = Ld.getValueType();
7658 assert(!CVT.isVector() && "Must not broadcast a vector type");
7659
7660 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7661 // For size optimization, also splat v2f64 and v2i64, and for size opt
7662 // with AVX2, also splat i8 and i16.
7663 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7664 if (ScalarSize == 32 ||
7665 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7666 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7667 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7668 const Constant *C = nullptr;
7669 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7670 C = CI->getConstantIntValue();
7671 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7672 C = CF->getConstantFPValue();
7673
7674 assert(C && "Invalid constant type");
7675
7676 SDValue CP =
7677 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7678 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7679
7680 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7681 SDValue Ops[] = {DAG.getEntryNode(), CP};
7682 MachinePointerInfo MPI =
7683 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7684 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7685 MPI, Alignment, MachineMemOperand::MOLoad);
7686 }
7687 }
7688
7689 // Handle AVX2 in-register broadcasts.
7690 if (!IsLoad && Subtarget.hasInt256() &&
7691 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7692 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7693
7694 // The scalar source must be a normal load.
7695 if (!IsLoad)
7696 return SDValue();
7697
7698 // Make sure the non-chain result is only used by this build vector.
7699 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7700 return SDValue();
7701
7702 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7703 (Subtarget.hasVLX() && ScalarSize == 64)) {
7704 auto *LN = cast<LoadSDNode>(Ld);
7705 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7706 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7707 SDValue BCast =
7708 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7709 LN->getMemoryVT(), LN->getMemOperand());
7710 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7711 return BCast;
7712 }
7713
7714 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7715 // match double, since there is no vbroadcastsd xmm.
7716 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7717 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7718 auto *LN = cast<LoadSDNode>(Ld);
7719 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7720 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7721 SDValue BCast =
7722 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7723 LN->getMemoryVT(), LN->getMemOperand());
7724 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7725 return BCast;
7726 }
7727
7728 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7729 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7730
7731 // Unsupported broadcast.
7732 return SDValue();
7733}
7734
7735/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7736/// underlying vector and index.
7737///
7738/// Modifies \p ExtractedFromVec to the real vector and returns the real
7739/// index.
7740static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7741 SDValue ExtIdx) {
7742 int Idx = ExtIdx->getAsZExtVal();
7743 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7744 return Idx;
7745
7746 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7747 // lowered this:
7748 // (extract_vector_elt (v8f32 %1), Constant<6>)
7749 // to:
7750 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7751 // (extract_subvector (v8f32 %0), Constant<4>),
7752 // undef)
7753 // Constant<0>)
7754 // In this case the vector is the extract_subvector expression and the index
7755 // is 2, as specified by the shuffle.
7756 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7757 SDValue ShuffleVec = SVOp->getOperand(0);
7758 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7759 assert(ShuffleVecVT.getVectorElementType() ==
7760 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7761
7762 int ShuffleIdx = SVOp->getMaskElt(Idx);
7763 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7764 ExtractedFromVec = ShuffleVec;
7765 return ShuffleIdx;
7766 }
7767 return Idx;
7768}
7769
7770static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7771 SelectionDAG &DAG) {
7772 MVT VT = Op.getSimpleValueType();
7773
7774 // Skip if insert_vec_elt is not supported.
7775 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7776 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7777 return SDValue();
7778
7779 unsigned NumElems = Op.getNumOperands();
7780 SDValue VecIn1;
7781 SDValue VecIn2;
7782 SmallVector<unsigned, 4> InsertIndices;
7783 SmallVector<int, 8> Mask(NumElems, -1);
7784
7785 for (unsigned i = 0; i != NumElems; ++i) {
7786 unsigned Opc = Op.getOperand(i).getOpcode();
7787
7788 if (Opc == ISD::UNDEF)
7789 continue;
7790
7791 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7792 // Quit if more than 1 elements need inserting.
7793 if (InsertIndices.size() > 1)
7794 return SDValue();
7795
7796 InsertIndices.push_back(i);
7797 continue;
7798 }
7799
7800 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7801 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7802
7803 // Quit if non-constant index.
7804 if (!isa<ConstantSDNode>(ExtIdx))
7805 return SDValue();
7806 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7807
7808 // Quit if extracted from vector of different type.
7809 if (ExtractedFromVec.getValueType() != VT)
7810 return SDValue();
7811
7812 if (!VecIn1.getNode())
7813 VecIn1 = ExtractedFromVec;
7814 else if (VecIn1 != ExtractedFromVec) {
7815 if (!VecIn2.getNode())
7816 VecIn2 = ExtractedFromVec;
7817 else if (VecIn2 != ExtractedFromVec)
7818 // Quit if more than 2 vectors to shuffle
7819 return SDValue();
7820 }
7821
7822 if (ExtractedFromVec == VecIn1)
7823 Mask[i] = Idx;
7824 else if (ExtractedFromVec == VecIn2)
7825 Mask[i] = Idx + NumElems;
7826 }
7827
7828 if (!VecIn1.getNode())
7829 return SDValue();
7830
7831 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7832 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7833
7834 for (unsigned Idx : InsertIndices)
7835 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7836 DAG.getVectorIdxConstant(Idx, DL));
7837
7838 return NV;
7839}
7840
7841// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7842static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7843 const X86Subtarget &Subtarget) {
7844 MVT VT = Op.getSimpleValueType();
7845 MVT IVT =
7846 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7847 SmallVector<SDValue, 32> NewOps;
7848 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7849 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7850 Op.getOperand(I)));
7851 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7852 return DAG.getBitcast(VT, Res);
7853}
7854
7855// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7856static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7857 SelectionDAG &DAG,
7858 const X86Subtarget &Subtarget) {
7859
7860 MVT VT = Op.getSimpleValueType();
7861 assert((VT.getVectorElementType() == MVT::i1) &&
7862 "Unexpected type in LowerBUILD_VECTORvXi1!");
7863 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7864 ISD::isBuildVectorAllOnes(Op.getNode()))
7865 return Op;
7866
7867 uint64_t Immediate = 0;
7868 SmallVector<unsigned, 16> NonConstIdx;
7869 bool IsSplat = true;
7870 bool HasConstElts = false;
7871 int SplatIdx = -1;
7872 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7873 SDValue In = Op.getOperand(idx);
7874 if (In.isUndef())
7875 continue;
7876 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7877 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7878 HasConstElts = true;
7879 } else {
7880 NonConstIdx.push_back(idx);
7881 }
7882 if (SplatIdx < 0)
7883 SplatIdx = idx;
7884 else if (In != Op.getOperand(SplatIdx))
7885 IsSplat = false;
7886 }
7887
7888 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
7889 if (IsSplat) {
7890 // The build_vector allows the scalar element to be larger than the vector
7891 // element type. We need to mask it to use as a condition unless we know
7892 // the upper bits are zero.
7893 // FIXME: Use computeKnownBits instead of checking specific opcode?
7894 SDValue Cond = Op.getOperand(SplatIdx);
7895 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7896 if (Cond.getOpcode() != ISD::SETCC)
7897 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7898 DAG.getConstant(1, dl, MVT::i8));
7899
7900 // Perform the select in the scalar domain so we can use cmov.
7901 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7902 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7903 DAG.getAllOnesConstant(dl, MVT::i32),
7904 DAG.getConstant(0, dl, MVT::i32));
7905 Select = DAG.getBitcast(MVT::v32i1, Select);
7906 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7907 } else {
7908 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7909 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7910 DAG.getAllOnesConstant(dl, ImmVT),
7911 DAG.getConstant(0, dl, ImmVT));
7912 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7913 Select = DAG.getBitcast(VecVT, Select);
7914 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7915 DAG.getVectorIdxConstant(0, dl));
7916 }
7917 }
7918
7919 // insert elements one by one
7920 SDValue DstVec;
7921 if (HasConstElts) {
7922 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7923 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7924 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7925 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7926 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7927 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7928 } else {
7929 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7930 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7931 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7932 DstVec = DAG.getBitcast(VecVT, Imm);
7933 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7934 DAG.getVectorIdxConstant(0, dl));
7935 }
7936 } else
7937 DstVec = DAG.getUNDEF(VT);
7938
7939 for (unsigned InsertIdx : NonConstIdx) {
7940 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7941 Op.getOperand(InsertIdx),
7942 DAG.getVectorIdxConstant(InsertIdx, dl));
7943 }
7944 return DstVec;
7945}
7946
7947LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7948 switch (Opcode) {
7949 case X86ISD::PACKSS:
7950 case X86ISD::PACKUS:
7951 case X86ISD::FHADD:
7952 case X86ISD::FHSUB:
7953 case X86ISD::HADD:
7954 case X86ISD::HSUB:
7955 return true;
7956 }
7957 return false;
7958}
7959
7960/// This is a helper function of LowerToHorizontalOp().
7961/// This function checks that the build_vector \p N in input implements a
7962/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7963/// may not match the layout of an x86 256-bit horizontal instruction.
7964/// In other words, if this returns true, then some extraction/insertion will
7965/// be required to produce a valid horizontal instruction.
7966///
7967/// Parameter \p Opcode defines the kind of horizontal operation to match.
7968/// For example, if \p Opcode is equal to ISD::ADD, then this function
7969/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7970/// is equal to ISD::SUB, then this function checks if this is a horizontal
7971/// arithmetic sub.
7972///
7973/// This function only analyzes elements of \p N whose indices are
7974/// in range [BaseIdx, LastIdx).
7975///
7976/// TODO: This function was originally used to match both real and fake partial
7977/// horizontal operations, but the index-matching logic is incorrect for that.
7978/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7979/// code because it is only used for partial h-op matching now?
7980static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7981 const SDLoc &DL, SelectionDAG &DAG,
7982 unsigned BaseIdx, unsigned LastIdx,
7983 SDValue &V0, SDValue &V1) {
7984 EVT VT = N->getValueType(0);
7985 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7986 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7987 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7988 "Invalid Vector in input!");
7989
7990 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7991 bool CanFold = true;
7992 unsigned ExpectedVExtractIdx = BaseIdx;
7993 unsigned NumElts = LastIdx - BaseIdx;
7994 V0 = DAG.getUNDEF(VT);
7995 V1 = DAG.getUNDEF(VT);
7996
7997 // Check if N implements a horizontal binop.
7998 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7999 SDValue Op = N->getOperand(i + BaseIdx);
8000
8001 // Skip UNDEFs.
8002 if (Op->isUndef()) {
8003 // Update the expected vector extract index.
8004 if (i * 2 == NumElts)
8005 ExpectedVExtractIdx = BaseIdx;
8006 ExpectedVExtractIdx += 2;
8007 continue;
8008 }
8009
8010 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8011
8012 if (!CanFold)
8013 break;
8014
8015 SDValue Op0 = Op.getOperand(0);
8016 SDValue Op1 = Op.getOperand(1);
8017
8018 // Try to match the following pattern:
8019 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8020 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8021 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8022 Op0.getOperand(0) == Op1.getOperand(0) &&
8023 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8024 isa<ConstantSDNode>(Op1.getOperand(1)));
8025 if (!CanFold)
8026 break;
8027
8028 unsigned I0 = Op0.getConstantOperandVal(1);
8029 unsigned I1 = Op1.getConstantOperandVal(1);
8030
8031 if (i * 2 < NumElts) {
8032 if (V0.isUndef()) {
8033 V0 = Op0.getOperand(0);
8034 if (V0.getValueType() != VT)
8035 return false;
8036 }
8037 } else {
8038 if (V1.isUndef()) {
8039 V1 = Op0.getOperand(0);
8040 if (V1.getValueType() != VT)
8041 return false;
8042 }
8043 if (i * 2 == NumElts)
8044 ExpectedVExtractIdx = BaseIdx;
8045 }
8046
8047 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8048 if (I0 == ExpectedVExtractIdx)
8049 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8050 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8051 // Try to match the following dag sequence:
8052 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8053 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8054 } else
8055 CanFold = false;
8056
8057 ExpectedVExtractIdx += 2;
8058 }
8059
8060 return CanFold;
8061}
8062
8063/// Emit a sequence of two 128-bit horizontal add/sub followed by
8064/// a concat_vector.
8065///
8066/// This is a helper function of LowerToHorizontalOp().
8067/// This function expects two 256-bit vectors called V0 and V1.
8068/// At first, each vector is split into two separate 128-bit vectors.
8069/// Then, the resulting 128-bit vectors are used to implement two
8070/// horizontal binary operations.
8071///
8072/// The kind of horizontal binary operation is defined by \p X86Opcode.
8073///
8074/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8075/// the two new horizontal binop.
8076/// When Mode is set, the first horizontal binop dag node would take as input
8077/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8078/// horizontal binop dag node would take as input the lower 128-bit of V1
8079/// and the upper 128-bit of V1.
8080/// Example:
8081/// HADD V0_LO, V0_HI
8082/// HADD V1_LO, V1_HI
8083///
8084/// Otherwise, the first horizontal binop dag node takes as input the lower
8085/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8086/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8087/// Example:
8088/// HADD V0_LO, V1_LO
8089/// HADD V0_HI, V1_HI
8090///
8091/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8092/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8093/// the upper 128-bits of the result.
8094static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8095 const SDLoc &DL, SelectionDAG &DAG,
8096 unsigned X86Opcode, bool Mode,
8097 bool isUndefLO, bool isUndefHI) {
8098 MVT VT = V0.getSimpleValueType();
8099 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8100 "Invalid nodes in input!");
8101
8102 unsigned NumElts = VT.getVectorNumElements();
8103 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8104 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8105 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8106 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8107 MVT NewVT = V0_LO.getSimpleValueType();
8108
8109 SDValue LO = DAG.getUNDEF(NewVT);
8110 SDValue HI = DAG.getUNDEF(NewVT);
8111
8112 if (Mode) {
8113 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8114 if (!isUndefLO && !V0->isUndef())
8115 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8116 if (!isUndefHI && !V1->isUndef())
8117 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8118 } else {
8119 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8120 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8121 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8122
8123 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8124 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8125 }
8126
8127 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8128}
8129
8130/// Returns true iff \p BV builds a vector with the result equivalent to
8131/// the result of ADDSUB/SUBADD operation.
8132/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8133/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8134/// \p Opnd0 and \p Opnd1.
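/// For example, the build_vector
///   <(fsub a0, b0), (fadd a1, b1), (fsub a2, b2), (fadd a3, b3)>
/// is recognized as ADDSUB(a, b) with IsSubAdd set to false.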
8135static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8136 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8137 SDValue &Opnd0, SDValue &Opnd1,
8138 unsigned &NumExtracts,
8139 bool &IsSubAdd) {
8140
8141 MVT VT = BV->getSimpleValueType(0);
8142 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8143 return false;
8144
8145 unsigned NumElts = VT.getVectorNumElements();
8146 SDValue InVec0 = DAG.getUNDEF(VT);
8147 SDValue InVec1 = DAG.getUNDEF(VT);
8148
8149 NumExtracts = 0;
8150
8151 // Odd-numbered elements in the input build vector are obtained from
8152 // adding/subtracting two integer/float elements.
8153 // Even-numbered elements in the input build vector are obtained from
8154 // subtracting/adding two integer/float elements.
8155 unsigned Opc[2] = {0, 0};
8156 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8157 SDValue Op = BV->getOperand(i);
8158
8159 // Skip 'undef' values.
8160 unsigned Opcode = Op.getOpcode();
8161 if (Opcode == ISD::UNDEF)
8162 continue;
8163
8164 // Early exit if we found an unexpected opcode.
8165 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8166 return false;
8167
8168 SDValue Op0 = Op.getOperand(0);
8169 SDValue Op1 = Op.getOperand(1);
8170
8171 // Try to match the following pattern:
8172 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8173 // Early exit if we cannot match that sequence.
8174 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8175 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8176 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8177 Op0.getOperand(1) != Op1.getOperand(1))
8178 return false;
8179
8180 unsigned I0 = Op0.getConstantOperandVal(1);
8181 if (I0 != i)
8182 return false;
8183
8184 // We found a valid add/sub node; make sure it's the same opcode as previous
8185 // elements for this parity.
8186 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8187 return false;
8188 Opc[i % 2] = Opcode;
8189
8190 // Update InVec0 and InVec1.
8191 if (InVec0.isUndef()) {
8192 InVec0 = Op0.getOperand(0);
8193 if (InVec0.getSimpleValueType() != VT)
8194 return false;
8195 }
8196 if (InVec1.isUndef()) {
8197 InVec1 = Op1.getOperand(0);
8198 if (InVec1.getSimpleValueType() != VT)
8199 return false;
8200 }
8201
8202 // Make sure that the input operands to each add/sub node always
8203 // come from the same pair of vectors.
8204 if (InVec0 != Op0.getOperand(0)) {
8205 if (Opcode == ISD::FSUB)
8206 return false;
8207
8208 // FADD is commutable. Try to commute the operands
8209 // and then test again.
8210 std::swap(Op0, Op1);
8211 if (InVec0 != Op0.getOperand(0))
8212 return false;
8213 }
8214
8215 if (InVec1 != Op1.getOperand(0))
8216 return false;
8217
8218 // Increment the number of extractions done.
8219 ++NumExtracts;
8220 }
8221
8222 // Ensure we have found an opcode for both parities and that they are
8223 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8224 // inputs are undef.
8225 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8226 InVec0.isUndef() || InVec1.isUndef())
8227 return false;
8228
8229 IsSubAdd = Opc[0] == ISD::FADD;
8230
8231 Opnd0 = InVec0;
8232 Opnd1 = InVec1;
8233 return true;
8234}
8235
8236/// Returns true if it is possible to fold MUL and an idiom that has already been
8237/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8238/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8239/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8240///
8241/// Prior to calling this function it should be known that there is some
8242/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8243/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8244/// before replacement of such SDNode with ADDSUB operation. Thus the number
8245/// of \p Opnd0 uses is expected to be equal to 2.
8246/// For example, this function may be called for the following IR:
8247/// %AB = fmul fast <2 x double> %A, %B
8248/// %Sub = fsub fast <2 x double> %AB, %C
8249/// %Add = fadd fast <2 x double> %AB, %C
8250/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8251/// <2 x i32> <i32 0, i32 3>
8252/// There is a def for %Addsub here, which potentially can be replaced by
8253/// X86ISD::ADDSUB operation:
8254/// %Addsub = X86ISD::ADDSUB %AB, %C
8255/// and such ADDSUB can further be replaced with FMADDSUB:
8256/// %Addsub = FMADDSUB %A, %B, %C.
8257///
8258/// The main reason why this method is called before the replacement of the
8259/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8260/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8261/// FMADDSUB is.
8262static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8263 SelectionDAG &DAG,
8264 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8265 unsigned ExpectedUses) {
8266 if (Opnd0.getOpcode() != ISD::FMUL ||
8267 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8268 return false;
8269
8270 // FIXME: These checks must match the similar ones in
8271 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8272 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8273 // or MUL + ADDSUB to FMADDSUB.
8274 const TargetOptions &Options = DAG.getTarget().Options;
8275 bool AllowFusion =
8276 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8277 if (!AllowFusion)
8278 return false;
8279
8280 Opnd2 = Opnd1;
8281 Opnd1 = Opnd0.getOperand(1);
8282 Opnd0 = Opnd0.getOperand(0);
8283
8284 return true;
8285}
8286
8287/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8288/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8289/// X86ISD::FMSUBADD node accordingly.
8290static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8291 const SDLoc &DL,
8292 const X86Subtarget &Subtarget,
8293 SelectionDAG &DAG) {
8294 SDValue Opnd0, Opnd1;
8295 unsigned NumExtracts;
8296 bool IsSubAdd;
8297 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8298 IsSubAdd))
8299 return SDValue();
8300
8301 MVT VT = BV->getSimpleValueType(0);
8302
8303 // Try to generate X86ISD::FMADDSUB node here.
8304 SDValue Opnd2;
8305 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8306 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8307 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8308 }
8309
8310 // We only support ADDSUB.
8311 if (IsSubAdd)
8312 return SDValue();
8313
8314 // There are no known X86 targets with 512-bit ADDSUB instructions!
8315 // Convert to blend(fsub,fadd).
8316 if (VT.is512BitVector()) {
8317 SmallVector<int> Mask;
8318 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8319 Mask.push_back(I);
8320 Mask.push_back(I + E + 1);
8321 }
8322 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8323 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8324 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8325 }
8326
8327 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8328}
8329
8330static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8331 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8332 // Initialize outputs to known values.
8333 MVT VT = BV->getSimpleValueType(0);
8334 HOpcode = ISD::DELETED_NODE;
8335 V0 = DAG.getUNDEF(VT);
8336 V1 = DAG.getUNDEF(VT);
8337
8338 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8339 // half of the result is calculated independently from the 128-bit halves of
8340 // the inputs, so that makes the index-checking logic below more complicated.
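  // e.g. when matching a v8i32 HADD, result element 2 must be V1[0] + V1[1]
  // while result element 4 must be V0[4] + V0[5].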
8341 unsigned NumElts = VT.getVectorNumElements();
8342 unsigned GenericOpcode = ISD::DELETED_NODE;
8343 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8344 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8345 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8346 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8347 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8348 // Ignore undef elements.
8349 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8350 if (Op.isUndef())
8351 continue;
8352
8353 // If there's an opcode mismatch, we're done.
8354 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8355 return false;
8356
8357 // Initialize horizontal opcode.
8358 if (HOpcode == ISD::DELETED_NODE) {
8359 GenericOpcode = Op.getOpcode();
8360 switch (GenericOpcode) {
8361 // clang-format off
8362 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8363 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8364 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8365 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8366 default: return false;
8367 // clang-format on
8368 }
8369 }
8370
8371 SDValue Op0 = Op.getOperand(0);
8372 SDValue Op1 = Op.getOperand(1);
8373 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8374 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8375 Op0.getOperand(0) != Op1.getOperand(0) ||
8376 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8377 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8378 return false;
8379
8380 // The source vector is chosen based on which 64-bit half of the
8381 // destination vector is being calculated.
8382 if (j < NumEltsIn64Bits) {
8383 if (V0.isUndef())
8384 V0 = Op0.getOperand(0);
8385 } else {
8386 if (V1.isUndef())
8387 V1 = Op0.getOperand(0);
8388 }
8389
8390 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8391 if (SourceVec != Op0.getOperand(0))
8392 return false;
8393
8394 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8395 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8396 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8397 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8398 (j % NumEltsIn64Bits) * 2;
8399 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8400 continue;
8401
8402 // If this is not a commutative op, this does not match.
8403 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8404 return false;
8405
8406 // Addition is commutative, so try swapping the extract indexes.
8407 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8408 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8409 continue;
8410
8411 // Extract indexes do not match horizontal requirement.
8412 return false;
8413 }
8414 }
8415 // We matched. Opcode and operands are returned by reference as arguments.
8416 return true;
8417}
8418
8419static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8420 const SDLoc &DL, SelectionDAG &DAG,
8421 unsigned HOpcode, SDValue V0, SDValue V1) {
8422 // If either input vector is not the same size as the build vector,
8423 // extract/insert the low bits to the correct size.
8424 // This is free (examples: zmm --> xmm, xmm --> ymm).
8425 MVT VT = BV->getSimpleValueType(0);
8426 unsigned Width = VT.getSizeInBits();
8427 if (V0.getValueSizeInBits() > Width)
8428 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8429 else if (V0.getValueSizeInBits() < Width)
8430 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8431
8432 if (V1.getValueSizeInBits() > Width)
8433 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8434 else if (V1.getValueSizeInBits() < Width)
8435 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8436
8437 unsigned NumElts = VT.getVectorNumElements();
8438 APInt DemandedElts = APInt::getAllOnes(NumElts);
8439 for (unsigned i = 0; i != NumElts; ++i)
8440 if (BV->getOperand(i).isUndef())
8441 DemandedElts.clearBit(i);
8442
8443 // If we don't need the upper xmm, then perform as a xmm hop.
8444 unsigned HalfNumElts = NumElts / 2;
8445 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8446 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8447 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8448 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8449 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8450 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8451 }
8452
8453 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8454}
8455
8456/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8457static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8458 const X86Subtarget &Subtarget,
8459 SelectionDAG &DAG) {
8460 // We need at least 2 non-undef elements to make this worthwhile by default.
8461 unsigned NumNonUndefs =
8462 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8463 if (NumNonUndefs < 2)
8464 return SDValue();
8465
8466 // There are 4 sets of horizontal math operations distinguished by type:
8467 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8468 // subtarget feature. Try to match those "native" patterns first.
8469 MVT VT = BV->getSimpleValueType(0);
8470 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8471 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8472 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8473 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8474 unsigned HOpcode;
8475 SDValue V0, V1;
8476 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8477 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8478 }
8479
8480 // Try harder to match 256-bit ops by using extract/concat.
8481 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8482 return SDValue();
8483
8484 // Count the number of UNDEF operands in the build_vector in input.
8485 unsigned NumElts = VT.getVectorNumElements();
8486 unsigned Half = NumElts / 2;
8487 unsigned NumUndefsLO = 0;
8488 unsigned NumUndefsHI = 0;
8489 for (unsigned i = 0, e = Half; i != e; ++i)
8490 if (BV->getOperand(i)->isUndef())
8491 NumUndefsLO++;
8492
8493 for (unsigned i = Half, e = NumElts; i != e; ++i)
8494 if (BV->getOperand(i)->isUndef())
8495 NumUndefsHI++;
8496
8497 SDValue InVec0, InVec1;
8498 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8499 SDValue InVec2, InVec3;
8500 unsigned X86Opcode;
8501 bool CanFold = true;
8502
8503 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8504 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8505 InVec3) &&
8506 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8507 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8508 X86Opcode = X86ISD::HADD;
8509 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8510 InVec1) &&
8511 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8512 InVec3) &&
8513 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8514 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8515 X86Opcode = X86ISD::HSUB;
8516 else
8517 CanFold = false;
8518
8519 if (CanFold) {
8520 // Do not try to expand this build_vector into a pair of horizontal
8521 // add/sub if we can emit a pair of scalar add/sub.
8522 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8523 return SDValue();
8524
8525 // Convert this build_vector into a pair of horizontal binops followed by
8526 // a concat vector. We must adjust the outputs from the partial horizontal
8527 // matching calls above to account for undefined vector halves.
8528 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8529 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8530 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8531 bool isUndefLO = NumUndefsLO == Half;
8532 bool isUndefHI = NumUndefsHI == Half;
8533 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8534 isUndefHI);
8535 }
8536 }
8537
8538 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8539 VT == MVT::v16i16) {
8540 unsigned X86Opcode;
8541 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8542 InVec1))
8543 X86Opcode = X86ISD::HADD;
8544 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8545 InVec1))
8546 X86Opcode = X86ISD::HSUB;
8547 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8548 InVec1))
8549 X86Opcode = X86ISD::FHADD;
8550 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8551 InVec1))
8552 X86Opcode = X86ISD::FHSUB;
8553 else
8554 return SDValue();
8555
8556 // Don't try to expand this build_vector into a pair of horizontal add/sub
8557 // if we can simply emit a pair of scalar add/sub.
8558 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8559 return SDValue();
8560
8561 // Convert this build_vector into two horizontal add/sub followed by
8562 // a concat vector.
8563 bool isUndefLO = NumUndefsLO == Half;
8564 bool isUndefHI = NumUndefsHI == Half;
8565 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8566 isUndefLO, isUndefHI);
8567 }
8568
8569 return SDValue();
8570}
8571
8572static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8573 SelectionDAG &DAG);
8574
8575/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8576/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8577/// just apply the bit to the vectors.
8578/// NOTE: It's not in our interest to start making a general-purpose vectorizer
8579/// from this, but enough scalar bit operations are created from the later
8580/// legalization + scalarization stages to need basic support.
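/// For example, (build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
/// becomes (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8)).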
8581static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8582 const X86Subtarget &Subtarget,
8583 SelectionDAG &DAG) {
8584 MVT VT = Op->getSimpleValueType(0);
8585 unsigned NumElems = VT.getVectorNumElements();
8586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8587
8588 // Check that all elements have the same opcode.
8589 // TODO: Should we allow UNDEFS and if so how many?
8590 unsigned Opcode = Op->getOperand(0).getOpcode();
8591 for (unsigned i = 1; i < NumElems; ++i)
8592 if (Opcode != Op->getOperand(i).getOpcode())
8593 return SDValue();
8594
8595 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8596 bool IsShift = false;
8597 switch (Opcode) {
8598 default:
8599 return SDValue();
8600 case ISD::SHL:
8601 case ISD::SRL:
8602 case ISD::SRA:
8603 IsShift = true;
8604 break;
8605 case ISD::AND:
8606 case ISD::XOR:
8607 case ISD::OR:
8608 // Don't do this if the buildvector is a splat - we'd replace one
8609 // constant with an entire vector.
8610 if (Op->getSplatValue())
8611 return SDValue();
8612 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8613 return SDValue();
8614 break;
8615 }
8616
8617 SmallVector<SDValue, 4> LHSElts, RHSElts;
8618 for (SDValue Elt : Op->ops()) {
8619 SDValue LHS = Elt.getOperand(0);
8620 SDValue RHS = Elt.getOperand(1);
8621
8622 // We expect the canonicalized RHS operand to be the constant.
8623 if (!isa<ConstantSDNode>(RHS))
8624 return SDValue();
8625
8626 // Extend shift amounts.
8627 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8628 if (!IsShift)
8629 return SDValue();
8630 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8631 }
8632
8633 LHSElts.push_back(LHS);
8634 RHSElts.push_back(RHS);
8635 }
8636
8637 // Limit to shifts by uniform immediates.
8638 // TODO: Only accept vXi8/vXi64 special cases?
8639 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8640 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8641 return SDValue();
8642
8643 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8644 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8645 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8646
8647 if (!IsShift)
8648 return Res;
8649
8650 // Immediately lower the shift to ensure the constant build vector doesn't
8651 // get converted to a constant pool before the shift is lowered.
8652 return LowerShift(Res, Subtarget, DAG);
8653}
8654
8655/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8656/// functionality to do this, so it's all zeros, all ones, or some derivation
8657/// that is cheap to calculate.
8658static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8659 SelectionDAG &DAG,
8660 const X86Subtarget &Subtarget) {
8661 MVT VT = Op.getSimpleValueType();
8662
8663 // Vectors containing all zeros can be matched by pxor and xorps.
8664 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8665 return Op;
8666
8667 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8668 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8669 // vpcmpeqd on 256-bit vectors.
8670 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8671 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8672 return Op;
8673
8674 return getOnesVector(VT, DAG, DL);
8675 }
8676
8677 return SDValue();
8678}
8679
8680/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8681/// from a vector of source values and a vector of extraction indices.
8682/// The vectors might be manipulated to match the type of the permute op.
8683static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8684 const SDLoc &DL, SelectionDAG &DAG,
8685 const X86Subtarget &Subtarget) {
8686 MVT ShuffleVT = VT;
8687 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8688 unsigned NumElts = VT.getVectorNumElements();
8689 unsigned SizeInBits = VT.getSizeInBits();
8690
8691 // Adjust IndicesVec to match VT size.
8692 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8693 "Illegal variable permute mask size");
8694 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8695 // Narrow/widen the indices vector to the correct size.
8696 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8697 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8698 NumElts * VT.getScalarSizeInBits());
8699 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8700 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8701 SDLoc(IndicesVec), SizeInBits);
8702 // Zero-extend the index elements within the vector.
8703 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8704 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8705 IndicesVT, IndicesVec);
8706 }
8707 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8708
8709 // Handle a SrcVec that doesn't match the VT type.
8710 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8711 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8712 // Handle larger SrcVec by treating it as a larger permute.
8713 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8714 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8715 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8716 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8717 Subtarget, DAG, SDLoc(IndicesVec));
8718 SDValue NewSrcVec =
8719 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8720 if (NewSrcVec)
8721 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8722 return SDValue();
8723 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8724 // Widen smaller SrcVec to match VT.
8725 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8726 } else
8727 return SDValue();
8728 }
8729
8730 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8731 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8732 EVT SrcVT = Idx.getValueType();
8733 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8734 uint64_t IndexScale = 0;
8735 uint64_t IndexOffset = 0;
8736
8737 // If we're scaling a smaller permute op, then we need to repeat the
8738 // indices, scaling and offsetting them as well.
8739 // e.g. v4i32 -> v16i8 (Scale = 4)
8740 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8741 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8742 for (uint64_t i = 0; i != Scale; ++i) {
8743 IndexScale |= Scale << (i * NumDstBits);
8744 IndexOffset |= i << (i * NumDstBits);
8745 }
8746
8747 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8748 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8749 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8750 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8751 return Idx;
8752 };
8753
8754 unsigned Opcode = 0;
8755 switch (VT.SimpleTy) {
8756 default:
8757 break;
8758 case MVT::v16i8:
8759 if (Subtarget.hasSSSE3())
8760 Opcode = X86ISD::PSHUFB;
8761 break;
8762 case MVT::v8i16:
8763 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8764 Opcode = X86ISD::VPERMV;
8765 else if (Subtarget.hasSSSE3()) {
8766 Opcode = X86ISD::PSHUFB;
8767 ShuffleVT = MVT::v16i8;
8768 }
8769 break;
8770 case MVT::v4f32:
8771 case MVT::v4i32:
8772 if (Subtarget.hasAVX()) {
8773 Opcode = X86ISD::VPERMILPV;
8774 ShuffleVT = MVT::v4f32;
8775 } else if (Subtarget.hasSSSE3()) {
8776 Opcode = X86ISD::PSHUFB;
8777 ShuffleVT = MVT::v16i8;
8778 }
8779 break;
8780 case MVT::v2f64:
8781 case MVT::v2i64:
8782 if (Subtarget.hasAVX()) {
8783 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
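      // Adding IndicesVec to itself doubles each index, moving it into bit#1.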
8784 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8785 Opcode = X86ISD::VPERMILPV;
8786 ShuffleVT = MVT::v2f64;
8787 } else if (Subtarget.hasSSE41()) {
8788 // SSE41 can compare v2i64 - select between indices 0 and 1.
8789 return DAG.getSelectCC(
8790 DL, IndicesVec,
8791 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8792 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8793 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8794 ISD::SETEQ);
8795 }
8796 break;
8797 case MVT::v32i8:
8798 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8799 Opcode = X86ISD::VPERMV;
8800 else if (Subtarget.hasXOP()) {
8801 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8802 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8803 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8804 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8805 return DAG.getNode(
8806 ISD::CONCAT_VECTORS, DL, VT,
8807 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8808 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8809 } else if (Subtarget.hasAVX()) {
8810 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8811 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8812 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8813 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8814 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8815 ArrayRef<SDValue> Ops) {
8816 // Permute Lo and Hi and then select based on index range.
8817 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8818 // care about bit[7] as it's just an index vector.
8819 SDValue Idx = Ops[2];
8820 EVT VT = Idx.getValueType();
8821 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8822 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8823 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8824 ISD::SETGT);
8825 };
8826 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8827 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8828 PSHUFBBuilder);
8829 }
8830 break;
8831 case MVT::v16i16:
8832 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8833 Opcode = X86ISD::VPERMV;
8834 else if (Subtarget.hasAVX()) {
8835 // Scale to v32i8 and perform as v32i8.
8836 IndicesVec = ScaleIndices(IndicesVec, 2);
8837 return DAG.getBitcast(
8838 VT, createVariablePermute(
8839 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8840 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8841 }
8842 break;
8843 case MVT::v8f32:
8844 case MVT::v8i32:
8845 if (Subtarget.hasAVX2())
8846 Opcode = X86ISD::VPERMV;
8847 else if (Subtarget.hasAVX()) {
8848 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8849 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8850 {0, 1, 2, 3, 0, 1, 2, 3});
8851 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8852 {4, 5, 6, 7, 4, 5, 6, 7});
8853 if (Subtarget.hasXOP())
8854 return DAG.getBitcast(
8855 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8856 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8857 // Permute Lo and Hi and then select based on index range.
8858 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8859 SDValue Res = DAG.getSelectCC(
8860 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8861 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8862 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8863 ISD::SETGT);
8864 return DAG.getBitcast(VT, Res);
8865 }
8866 break;
8867 case MVT::v4i64:
8868 case MVT::v4f64:
8869 if (Subtarget.hasAVX512()) {
8870 if (!Subtarget.hasVLX()) {
8871 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8872 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8873 SDLoc(SrcVec));
8874 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8875 DAG, SDLoc(IndicesVec));
8876 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8877 DAG, Subtarget);
8878 return extract256BitVector(Res, 0, DAG, DL);
8879 }
8880 Opcode = X86ISD::VPERMV;
8881 } else if (Subtarget.hasAVX()) {
8882 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8883 SDValue LoLo =
8884 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8885 SDValue HiHi =
8886 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8887 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8888 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8889 if (Subtarget.hasXOP())
8890 return DAG.getBitcast(
8891 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8892 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8893 // Permute Lo and Hi and then select based on index range.
8894 // This works as VPERMILPD only uses index bit[1] to permute elements.
8895 SDValue Res = DAG.getSelectCC(
8896 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8897 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8898 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8899 ISD::SETGT);
8900 return DAG.getBitcast(VT, Res);
8901 }
8902 break;
8903 case MVT::v64i8:
8904 if (Subtarget.hasVBMI())
8905 Opcode = X86ISD::VPERMV;
8906 break;
8907 case MVT::v32i16:
8908 if (Subtarget.hasBWI())
8909 Opcode = X86ISD::VPERMV;
8910 break;
8911 case MVT::v16f32:
8912 case MVT::v16i32:
8913 case MVT::v8f64:
8914 case MVT::v8i64:
8915 if (Subtarget.hasAVX512())
8916 Opcode = X86ISD::VPERMV;
8917 break;
8918 }
8919 if (!Opcode)
8920 return SDValue();
8921
8922 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8923 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8924 "Illegal variable permute shuffle type");
8925
8926 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8927 if (Scale > 1)
8928 IndicesVec = ScaleIndices(IndicesVec, Scale);
8929
8930 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8931 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8932
8933 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8934 SDValue Res = Opcode == X86ISD::VPERMV
8935 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8936 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8937 return DAG.getBitcast(VT, Res);
8938}
8939
8940// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8941// reasoned to be a permutation of a vector by indices in a non-constant vector.
8942// (build_vector (extract_elt V, (extract_elt I, 0)),
8943// (extract_elt V, (extract_elt I, 1)),
8944// ...
8945// ->
8946// (vpermv I, V)
8947//
8948// TODO: Handle undefs
8949// TODO: Utilize pshufb and zero mask blending to support more efficient
8950// construction of vectors with constant-0 elements.
8951 static SDValue
8952 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8953 SelectionDAG &DAG,
8954 const X86Subtarget &Subtarget) {
8955 SDValue SrcVec, IndicesVec;
8956 // Check for a match of the permute source vector and permute index elements.
8957 // This is done by checking that the i-th build_vector operand is of the form:
8958 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8959 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8960 SDValue Op = V.getOperand(Idx);
8961 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8962 return SDValue();
8963
8964 // If this is the first extract encountered in V, set the source vector,
8965 // otherwise verify the extract is from the previously defined source
8966 // vector.
8967 if (!SrcVec)
8968 SrcVec = Op.getOperand(0);
8969 else if (SrcVec != Op.getOperand(0))
8970 return SDValue();
8971 SDValue ExtractedIndex = Op->getOperand(1);
8972 // Peek through extends.
8973 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8974 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8975 ExtractedIndex = ExtractedIndex.getOperand(0);
8976 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8977 return SDValue();
8978
8979 // If this is the first extract from the index vector candidate, set the
8980 // indices vector, otherwise verify the extract is from the previously
8981 // defined indices vector.
8982 if (!IndicesVec)
8983 IndicesVec = ExtractedIndex.getOperand(0);
8984 else if (IndicesVec != ExtractedIndex.getOperand(0))
8985 return SDValue();
8986
8987 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8988 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8989 return SDValue();
8990 }
8991
8992 MVT VT = V.getSimpleValueType();
8993 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8994}
8995
8996SDValue
8997X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8998 SDLoc dl(Op);
8999
9000 MVT VT = Op.getSimpleValueType();
9001 MVT EltVT = VT.getVectorElementType();
9002 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9003 unsigned NumElems = Op.getNumOperands();
9004
9005 // Generate vectors for predicate vectors.
9006 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9007 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9008
9009 if (VT.getVectorElementType() == MVT::bf16 &&
9010 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9011 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9012
9013 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9014 return VectorCst;
9015
9016 unsigned EVTBits = EltVT.getSizeInBits();
9017 APInt UndefMask = APInt::getZero(NumElems);
9018 APInt FrozenUndefMask = APInt::getZero(NumElems);
9019 APInt ZeroMask = APInt::getZero(NumElems);
9020 APInt NonZeroMask = APInt::getZero(NumElems);
9021 bool IsAllConstants = true;
9022 bool OneUseFrozenUndefs = true;
9023 SmallSet<SDValue, 8> Values;
9024 unsigned NumConstants = NumElems;
9025 for (unsigned i = 0; i < NumElems; ++i) {
9026 SDValue Elt = Op.getOperand(i);
9027 if (Elt.isUndef()) {
9028 UndefMask.setBit(i);
9029 continue;
9030 }
9031 if (ISD::isFreezeUndef(Elt.getNode())) {
9032 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9033 FrozenUndefMask.setBit(i);
9034 continue;
9035 }
9036 Values.insert(Elt);
9037 if (!isIntOrFPConstant(Elt)) {
9038 IsAllConstants = false;
9039 NumConstants--;
9040 }
9041 if (X86::isZeroNode(Elt)) {
9042 ZeroMask.setBit(i);
9043 } else {
9044 NonZeroMask.setBit(i);
9045 }
9046 }
9047
9048 // All undef vector. Return an UNDEF.
9049 if (UndefMask.isAllOnes())
9050 return DAG.getUNDEF(VT);
9051
9052 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9053 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9054 return DAG.getFreeze(DAG.getUNDEF(VT));
9055
9056 // All undef/freeze(undef)/zero vector. Return a zero vector.
9057 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9058 return getZeroVector(VT, Subtarget, DAG, dl);
9059
9060 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9061 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9062 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9063 // and blend the FREEZE-UNDEF operands back in.
9064 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9065 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9066 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9067 SmallVector<int, 16> BlendMask(NumElems, -1);
9068 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9069 for (unsigned i = 0; i < NumElems; ++i) {
9070 if (UndefMask[i]) {
9071 BlendMask[i] = -1;
9072 continue;
9073 }
9074 BlendMask[i] = i;
9075 if (!FrozenUndefMask[i])
9076 Elts[i] = Op.getOperand(i);
9077 else
9078 BlendMask[i] += NumElems;
9079 }
9080 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9081 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9082 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9083 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9084 }
9085
9086 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9087
9088 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9089 // be better off lowering to a smaller build vector and padding with
9090 // undef/zero.
9091 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9092 !isFoldableUseOfShuffle(BV)) {
9093 unsigned UpperElems = NumElems / 2;
9094 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9095 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9096 if (NumUpperUndefsOrZeros >= UpperElems) {
9097 if (VT.is512BitVector() &&
9098 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9099 UpperElems = NumElems - (NumElems / 4);
9100 // If freeze(undef) is in any upper elements, force to zero.
9101 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9102 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9103 SDValue NewBV =
9104 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9105 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9106 }
9107 }
9108
9109 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9110 return AddSub;
9111 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9112 return HorizontalOp;
9113 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9114 return Broadcast;
9115 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9116 return BitOp;
9117
9118 unsigned NumZero = ZeroMask.popcount();
9119 unsigned NumNonZero = NonZeroMask.popcount();
9120
9121 // If we are inserting one variable into a vector of non-zero constants, try
9122 // to avoid loading each constant element as a scalar. Load the constants as a
9123 // vector and then insert the variable scalar element. If insertion is not
9124 // supported, fall back to a shuffle to get the scalar blended with the
9125 // constants. Insertion into a zero vector is handled as a special-case
9126 // somewhere below here.
9127 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9128 FrozenUndefMask.isZero() &&
9129 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9130 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9131 // Create an all-constant vector. The variable element in the old
9132 // build vector is replaced by undef in the constant vector. Save the
9133 // variable scalar element and its index for use in the insertelement.
9134 LLVMContext &Context = *DAG.getContext();
9135 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9136 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9137 SDValue VarElt;
9138 SDValue InsIndex;
9139 for (unsigned i = 0; i != NumElems; ++i) {
9140 SDValue Elt = Op.getOperand(i);
9141 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9142 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9143 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9144 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9145 else if (!Elt.isUndef()) {
9146 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9147 "Expected one variable element in this vector");
9148 VarElt = Elt;
9149 InsIndex = DAG.getVectorIdxConstant(i, dl);
9150 }
9151 }
9152 Constant *CV = ConstantVector::get(ConstVecOps);
9153 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9154
9155 // The constants we just created may not be legal (eg, floating point). We
9156 // must lower the vector right here because we can not guarantee that we'll
9157 // legalize it before loading it. This is also why we could not just create
9158 // a new build vector here. If the build vector contains illegal constants,
9159 // it could get split back up into a series of insert elements.
9160 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9161 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9162 MachinePointerInfo MPI =
9163 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9164 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9165 unsigned InsertC = InsIndex->getAsZExtVal();
9166 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9167 if (InsertC < NumEltsInLow128Bits)
9168 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9169
9170 // There's no good way to insert into the high elements of a >128-bit
9171 // vector, so use shuffles to avoid an extract/insert sequence.
9172 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9173 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9174 SmallVector<int, 8> ShuffleMask;
9175 unsigned NumElts = VT.getVectorNumElements();
9176 for (unsigned i = 0; i != NumElts; ++i)
9177 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9178 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9179 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9180 }
9181
9182 // Special case for single non-zero, non-undef, element.
9183 if (NumNonZero == 1) {
9184 unsigned Idx = NonZeroMask.countr_zero();
9185 SDValue Item = Op.getOperand(Idx);
9186
9187 // If we have a constant or non-constant insertion into the low element of
9188 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9189 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9190 // depending on what the source datatype is.
9191 if (Idx == 0) {
9192 if (NumZero == 0)
9193 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9194
9195 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9196 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9197 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9198 assert((VT.is128BitVector() || VT.is256BitVector() ||
9199 VT.is512BitVector()) &&
9200 "Expected an SSE value type!");
9201 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9202 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9203 // zero vector.
9204 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9205 }
9206
9207 // We can't directly insert an i8 or i16 into a vector, so zero extend
9208 // it to i32 first.
9209 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9210 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9211 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9212 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9213 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9214 return DAG.getBitcast(VT, Item);
9215 }
9216 }
9217
9218 // Is it a vector logical left shift?
9219 if (NumElems == 2 && Idx == 1 &&
9220 X86::isZeroNode(Op.getOperand(0)) &&
9221 !X86::isZeroNode(Op.getOperand(1))) {
9222 unsigned NumBits = VT.getSizeInBits();
9223 return getVShift(true, VT,
9224 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9225 VT, Op.getOperand(1)),
9226 NumBits/2, DAG, *this, dl);
9227 }
9228
9229 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9230 return SDValue();
9231
9232 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9233 // is a non-constant being inserted into an element other than the low one,
9234 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9235 // movd/movss) to move this into the low element, then shuffle it into
9236 // place.
9237 if (EVTBits == 32) {
9238 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9239 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9240 }
9241 }
9242
9243 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9244 if (Values.size() == 1) {
9245 if (EVTBits == 32) {
9246 // Instead of a shuffle like this:
9247 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9248 // Check if it's possible to issue this instead.
9249 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9250 unsigned Idx = NonZeroMask.countr_zero();
9251 SDValue Item = Op.getOperand(Idx);
9252 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9253 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9254 }
9255 return SDValue();
9256 }
9257
9258 // A vector full of immediates; various special cases are already
9259 // handled, so this is best done with a single constant-pool load.
9260 if (IsAllConstants)
9261 return SDValue();
9262
9263 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9264 return V;
9265
9266 // See if we can use a vector load to get all of the elements.
9267 {
9268 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9269 if (SDValue LD =
9270 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9271 return LD;
9272 }
9273
9274 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9275 // build_vector and broadcast it.
9276 // TODO: We could probably generalize this more.
9277 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9278 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9279 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9280 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9281 // Make sure all the even/odd operands match.
9282 for (unsigned i = 2; i != NumElems; ++i)
9283 if (Ops[i % 2] != Op.getOperand(i))
9284 return false;
9285 return true;
9286 };
9287 if (CanSplat(Op, NumElems, Ops)) {
9288 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9289 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9290 // Create a new build vector and cast to v2i64/v2f64.
9291 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9292 DAG.getBuildVector(NarrowVT, dl, Ops));
9293 // Broadcast from v2i64/v2f64 and cast to final VT.
9294 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9295 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9296 NewBV));
9297 }
9298 }
9299
9300 // For AVX-length vectors, build the individual 128-bit pieces and use
9301 // shuffles to put them in place.
9302 if (VT.getSizeInBits() > 128) {
9303 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9304
9305 // Build both the lower and upper subvector.
9306 SDValue Lower =
9307 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9308 SDValue Upper = DAG.getBuildVector(
9309 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9310
9311 // Recreate the wider vector with the lower and upper part.
9312 return concatSubVectors(Lower, Upper, DAG, dl);
9313 }
9314
9315 // Let legalizer expand 2-wide build_vectors.
9316 if (EVTBits == 64) {
9317 if (NumNonZero == 1) {
9318 // One half is zero or undef.
9319 unsigned Idx = NonZeroMask.countr_zero();
9320 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9321 Op.getOperand(Idx));
9322 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9323 }
9324 return SDValue();
9325 }
9326
9327 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9328 if (EVTBits == 8 && NumElems == 16)
9329 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9330 NumZero, DAG, Subtarget))
9331 return V;
9332
9333 if (EltVT == MVT::i16 && NumElems == 8)
9334 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9335 NumZero, DAG, Subtarget))
9336 return V;
9337
9338 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9339 if (EVTBits == 32 && NumElems == 4)
9340 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9341 return V;
9342
9343 // If element VT is == 32 bits, turn it into a number of shuffles.
9344 if (NumElems == 4 && NumZero > 0) {
9345 SmallVector<SDValue, 8> Ops(NumElems);
9346 for (unsigned i = 0; i < 4; ++i) {
9347 bool isZero = !NonZeroMask[i];
9348 if (isZero)
9349 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9350 else
9351 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9352 }
9353
9354 for (unsigned i = 0; i < 2; ++i) {
9355 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9356 default: llvm_unreachable("Unexpected NonZero count");
9357 case 0:
9358 Ops[i] = Ops[i*2]; // Must be a zero vector.
9359 break;
9360 case 1:
9361 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9362 break;
9363 case 2:
9364 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9365 break;
9366 case 3:
9367 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9368 break;
9369 }
9370 }
9371
9372 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9373 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9374 int MaskVec[] = {
9375 Reverse1 ? 1 : 0,
9376 Reverse1 ? 0 : 1,
9377 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9378 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9379 };
9380 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9381 }
9382
9383 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9384
9385 // Check for a build vector from mostly shuffle plus few inserting.
9386 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9387 return Sh;
9388
9389 // For SSE 4.1, use insertps to put the high elements into the low element.
9390 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9391 SDValue Result;
9392 if (!Op.getOperand(0).isUndef())
9393 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9394 else
9395 Result = DAG.getUNDEF(VT);
9396
9397 for (unsigned i = 1; i < NumElems; ++i) {
9398 if (Op.getOperand(i).isUndef()) continue;
9399 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9400 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9401 }
9402 return Result;
9403 }
9404
9405 // Otherwise, expand into a number of unpckl*, start by extending each of
9406 // our (non-undef) elements to the full vector width with the element in the
9407 // bottom slot of the vector (which generates no code for SSE).
9408 SmallVector<SDValue, 8> Ops(NumElems);
9409 for (unsigned i = 0; i < NumElems; ++i) {
9410 if (!Op.getOperand(i).isUndef())
9411 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9412 else
9413 Ops[i] = DAG.getUNDEF(VT);
9414 }
9415
9416 // Next, we iteratively mix elements, e.g. for v4f32:
9417 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9418 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9419 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9420 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9421 // Generate scaled UNPCKL shuffle mask.
9422 SmallVector<int, 16> Mask;
9423 for(unsigned i = 0; i != Scale; ++i)
9424 Mask.push_back(i);
9425 for (unsigned i = 0; i != Scale; ++i)
9426 Mask.push_back(NumElems+i);
9427 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9428
9429 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9430 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9431 }
9432 return Ops[0];
9433}
9434
9435// 256-bit AVX can use the vinsertf128 instruction
9436// to create 256-bit vectors from two other 128-bit ones.
9437 // TODO: Detect subvector broadcast here instead of DAG combine?
9438 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9439 const X86Subtarget &Subtarget) {
9440 SDLoc dl(Op);
9441 MVT ResVT = Op.getSimpleValueType();
9442
9443 assert((ResVT.is256BitVector() ||
9444 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9445
9446 unsigned NumOperands = Op.getNumOperands();
9447 unsigned NumFreezeUndef = 0;
9448 unsigned NumZero = 0;
9449 unsigned NumNonZero = 0;
9450 unsigned NonZeros = 0;
9451 for (unsigned i = 0; i != NumOperands; ++i) {
9452 SDValue SubVec = Op.getOperand(i);
9453 if (SubVec.isUndef())
9454 continue;
9455 if (ISD::isFreezeUndef(SubVec.getNode())) {
9456 // If the freeze(undef) has multiple uses then we must fold to zero.
9457 if (SubVec.hasOneUse())
9458 ++NumFreezeUndef;
9459 else
9460 ++NumZero;
9461 }
9462 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9463 ++NumZero;
9464 else {
9465 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9466 NonZeros |= 1 << i;
9467 ++NumNonZero;
9468 }
9469 }
9470
9471 // If we have more than 2 non-zeros, build each half separately.
9472 if (NumNonZero > 2) {
9473 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9474 ArrayRef<SDUse> Ops = Op->ops();
9475 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9476 Ops.slice(0, NumOperands/2));
9477 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9478 Ops.slice(NumOperands/2));
9479 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9480 }
9481
9482 // Otherwise, build it up through insert_subvectors.
9483 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9484 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9485 : DAG.getUNDEF(ResVT));
9486
9487 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9488 unsigned NumSubElems = SubVT.getVectorNumElements();
9489 for (unsigned i = 0; i != NumOperands; ++i) {
9490 if ((NonZeros & (1 << i)) == 0)
9491 continue;
9492
9493 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9494 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9495 }
9496
9497 return Vec;
9498}
9499
9500// Returns true if the given node is a type promotion (by concatenating i1
9501// zeros) of the result of a node that already zeros all upper bits of
9502// k-register.
9503 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
9504 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9505 const X86Subtarget &Subtarget,
9506 SelectionDAG & DAG) {
9507 SDLoc dl(Op);
9508 MVT ResVT = Op.getSimpleValueType();
9509 unsigned NumOperands = Op.getNumOperands();
9510
9511 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9512 "Unexpected number of operands in CONCAT_VECTORS");
9513
9514 uint64_t Zeros = 0;
9515 uint64_t NonZeros = 0;
9516 for (unsigned i = 0; i != NumOperands; ++i) {
9517 SDValue SubVec = Op.getOperand(i);
9518 if (SubVec.isUndef())
9519 continue;
9520 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9521 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9522 Zeros |= (uint64_t)1 << i;
9523 else
9524 NonZeros |= (uint64_t)1 << i;
9525 }
9526
9527 unsigned NumElems = ResVT.getVectorNumElements();
9528
9529 // If we are inserting a non-zero vector and there are zeros in the LSBs and
9530 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9531 // insert_subvector will give us two kshifts.
9532 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9533 Log2_64(NonZeros) != NumOperands - 1) {
9534 unsigned Idx = Log2_64(NonZeros);
9535 SDValue SubVec = Op.getOperand(Idx);
9536 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9537 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9538 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9539 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9540 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9541 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9542 DAG.getVectorIdxConstant(0, dl));
9543 }
9544
9545 // If there are zero or one non-zeros we can handle this very simply.
9546 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9547 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9548 if (!NonZeros)
9549 return Vec;
9550 unsigned Idx = Log2_64(NonZeros);
9551 SDValue SubVec = Op.getOperand(Idx);
9552 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9553 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9554 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9555 }
9556
9557 if (NumOperands > 2) {
9558 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9559 ArrayRef<SDUse> Ops = Op->ops();
9560 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9561 Ops.slice(0, NumOperands / 2));
9562 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9563 Ops.slice(NumOperands / 2));
9564 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9565 }
9566
9567 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9568
9569 if (ResVT.getVectorNumElements() >= 16)
9570 return Op; // The operation is legal with KUNPCK
9571
9572 SDValue Vec =
9573 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9574 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9575 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9576 DAG.getVectorIdxConstant(NumElems / 2, dl));
9577}
9578
9579 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9580 const X86Subtarget &Subtarget,
9581 SelectionDAG &DAG) {
9582 MVT VT = Op.getSimpleValueType();
9583 if (VT.getVectorElementType() == MVT::i1)
9584 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9585
9586 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9587 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9588 Op.getNumOperands() == 4)));
9589
9590 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9591 // from two other 128-bit ones.
9592
9593 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9594 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9595}
9596
9597//===----------------------------------------------------------------------===//
9598// Vector shuffle lowering
9599//
9600// This is an experimental code path for lowering vector shuffles on x86. It is
9601// designed to handle arbitrary vector shuffles and blends, gracefully
9602// degrading performance as necessary. It works hard to recognize idiomatic
9603// shuffles and lower them to optimal instruction patterns without leaving
9604// a framework that allows reasonably efficient handling of all vector shuffle
9605// patterns.
9606//===----------------------------------------------------------------------===//
9607
9608/// Tiny helper function to identify a no-op mask.
9609///
9610/// This is a somewhat boring predicate function. It checks whether the mask
9611/// array input, which is assumed to be a single-input shuffle mask of the kind
9612/// used by the X86 shuffle instructions (not a fully general
9613/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9614 /// in-place shuffle are 'no-op's.
9615 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9616 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9617 assert(Mask[i] >= -1 && "Out of bound mask element!");
9618 if (Mask[i] >= 0 && Mask[i] != i)
9619 return false;
9620 }
9621 return true;
9622}
9623
9624/// Test whether there are elements crossing LaneSizeInBits lanes in this
9625/// shuffle mask.
9626///
9627/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9628/// and we routinely test for these.
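/// For example, with the 128-bit lanes of a v8i32 mask, {0, 1, 4, 5, 2, 3, 6, 7}
/// crosses lanes, while {1, 0, 3, 2, 5, 4, 7, 6} stays within each lane.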
9629static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9630 unsigned ScalarSizeInBits,
9631 ArrayRef<int> Mask) {
9632 assert(LaneSizeInBits && ScalarSizeInBits &&
9633 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9634 "Illegal shuffle lane size");
9635 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9636 int Size = Mask.size();
9637 for (int i = 0; i < Size; ++i)
9638 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9639 return true;
9640 return false;
9641}
9642
9643/// Test whether there are elements crossing 128-bit lanes in this
9644 /// shuffle mask.
9645 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9646 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9647}
9648
9649/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9650/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9651/// better support 'repeated mask + lane permute' style shuffles.
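/// For example, the v8i32 mask {4, 5, 6, 7, 0, 1, 2, 3} crosses 128-bit lanes but
/// is not multi-lane (each lane reads from a single source lane), whereas
/// {0, 4, 1, 5, 2, 6, 3, 7} mixes both source lanes within each destination lane.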
9652static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9653 unsigned ScalarSizeInBits,
9654 ArrayRef<int> Mask) {
9655 assert(LaneSizeInBits && ScalarSizeInBits &&
9656 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9657 "Illegal shuffle lane size");
9658 int NumElts = Mask.size();
9659 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9660 int NumLanes = NumElts / NumEltsPerLane;
9661 if (NumLanes > 1) {
9662 for (int i = 0; i != NumLanes; ++i) {
9663 int SrcLane = -1;
9664 for (int j = 0; j != NumEltsPerLane; ++j) {
9665 int M = Mask[(i * NumEltsPerLane) + j];
9666 if (M < 0)
9667 continue;
9668 int Lane = (M % NumElts) / NumEltsPerLane;
9669 if (SrcLane >= 0 && SrcLane != Lane)
9670 return true;
9671 SrcLane = Lane;
9672 }
9673 }
9674 }
9675 return false;
9676}
9677
9678/// Test whether a shuffle mask is equivalent within each sub-lane.
9679///
9680/// This checks a shuffle mask to see if it is performing the same
9681/// lane-relative shuffle in each sub-lane. This trivially implies
9682/// that it is also not lane-crossing. It may however involve a blend from the
9683/// same lane of a second vector.
9684///
9685/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9686/// non-trivial to compute in the face of undef lanes. The representation is
9687/// suitable for use with existing 128-bit shuffles as entries from the second
9688/// vector have been remapped to [LaneSize, 2*LaneSize).
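/// For example, the v8i32 mask {0, 9, 2, 11, 4, 13, 6, 15} repeats within each
/// 128-bit lane and produces RepeatedMask = {0, 5, 2, 7}.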
9689static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9690 ArrayRef<int> Mask,
9691 SmallVectorImpl<int> &RepeatedMask) {
9692 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9693 RepeatedMask.assign(LaneSize, -1);
9694 int Size = Mask.size();
9695 for (int i = 0; i < Size; ++i) {
9696 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9697 if (Mask[i] < 0)
9698 continue;
9699 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9700 // This entry crosses lanes, so there is no way to model this shuffle.
9701 return false;
9702
9703 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9704 // Adjust second vector indices to start at LaneSize instead of Size.
9705 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9706 : Mask[i] % LaneSize + LaneSize;
9707 if (RepeatedMask[i % LaneSize] < 0)
9708 // This is the first non-undef entry in this slot of a 128-bit lane.
9709 RepeatedMask[i % LaneSize] = LocalM;
9710 else if (RepeatedMask[i % LaneSize] != LocalM)
9711 // Found a mismatch with the repeated mask.
9712 return false;
9713 }
9714 return true;
9715}
9716
9717/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9718 static bool
9719 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9720 SmallVectorImpl<int> &RepeatedMask) {
9721 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9722}
9723
9724 static bool
9725 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9726 SmallVector<int, 32> RepeatedMask;
9727 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9728}
9729
9730/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9731 static bool
9732 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9733 SmallVectorImpl<int> &RepeatedMask) {
9734 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9735}
9736
9737/// Test whether a target shuffle mask is equivalent within each sub-lane.
9738/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9739static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9740 unsigned EltSizeInBits,
9741 ArrayRef<int> Mask,
9742 SmallVectorImpl<int> &RepeatedMask) {
9743 int LaneSize = LaneSizeInBits / EltSizeInBits;
9744 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9745 int Size = Mask.size();
9746 for (int i = 0; i < Size; ++i) {
9747 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9748 if (Mask[i] == SM_SentinelUndef)
9749 continue;
9750 if (Mask[i] == SM_SentinelZero) {
9751 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9752 return false;
9753 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9754 continue;
9755 }
9756 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9757 // This entry crosses lanes, so there is no way to model this shuffle.
9758 return false;
9759
9760 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9761 // later vector indices to start at multiples of LaneSize instead of Size.
9762 int LaneM = Mask[i] / Size;
9763 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9764 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9765 // This is the first non-undef entry in this slot of a 128-bit lane.
9766 RepeatedMask[i % LaneSize] = LocalM;
9767 else if (RepeatedMask[i % LaneSize] != LocalM)
9768 // Found a mismatch with the repeated mask.
9769 return false;
9770 }
9771 return true;
9772}
9773
9774/// Test whether a target shuffle mask is equivalent within each sub-lane.
9775/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9776static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9777 ArrayRef<int> Mask,
9778 SmallVectorImpl<int> &RepeatedMask) {
9779 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9780 Mask, RepeatedMask);
9781}
9782
9783/// Checks whether the vector elements referenced by two shuffle masks are
9784/// equivalent.
9785static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9786 int Idx, int ExpectedIdx) {
9787 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9788 ExpectedIdx < MaskSize && "Out of range element index");
9789 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9790 return false;
9791
9792 switch (Op.getOpcode()) {
9793 case ISD::BUILD_VECTOR:
9794 // If the values are build vectors, we can look through them to find
9795 // equivalent inputs that make the shuffles equivalent.
9796 // TODO: Handle MaskSize != Op.getNumOperands()?
9797 if (MaskSize == (int)Op.getNumOperands() &&
9798 MaskSize == (int)ExpectedOp.getNumOperands())
9799 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9800 break;
9801 case X86ISD::VBROADCAST:
9802 case X86ISD::VBROADCAST_LOAD:
9803 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9804 return (Op == ExpectedOp &&
9805 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9806 case X86ISD::HADD:
9807 case X86ISD::HSUB:
9808 case X86ISD::FHADD:
9809 case X86ISD::FHSUB:
9810 case X86ISD::PACKSS:
9811 case X86ISD::PACKUS:
9812 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9813 // TODO: Handle MaskSize != NumElts?
9814 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9815 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9816 MVT VT = Op.getSimpleValueType();
9817 int NumElts = VT.getVectorNumElements();
9818 if (MaskSize == NumElts) {
9819 int NumLanes = VT.getSizeInBits() / 128;
9820 int NumEltsPerLane = NumElts / NumLanes;
9821 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9822 bool SameLane =
9823 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9824 bool SameElt =
9825 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9826 return SameLane && SameElt;
9827 }
9828 }
9829 break;
9830 }
9831
9832 return false;
9833}
9834
9835/// Checks whether a shuffle mask is equivalent to an explicit list of
9836/// arguments.
9837///
9838/// This is a fast way to test a shuffle mask against a fixed pattern:
9839///
9840 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
9841 ///
9842 /// It returns true if the mask is exactly as wide as ExpectedMask, and each
9843 /// element of the mask is either -1 (signifying undef) or matches the
9844 /// corresponding value in ExpectedMask.
9845static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9846 SDValue V1 = SDValue(),
9847 SDValue V2 = SDValue()) {
9848 int Size = Mask.size();
9849 if (Size != (int)ExpectedMask.size())
9850 return false;
9851
9852 for (int i = 0; i < Size; ++i) {
9853 assert(Mask[i] >= -1 && "Out of bound mask element!");
9854 int MaskIdx = Mask[i];
9855 int ExpectedIdx = ExpectedMask[i];
9856 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9857 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9858 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9859 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9860 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9861 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9862 return false;
9863 }
9864 }
9865 return true;
9866}
9867
9868/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9869///
9870/// The masks must be exactly the same width.
9871///
9872/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9873/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9874///
9875/// SM_SentinelZero is accepted as a valid negative index but must match in
9876 /// both, or via a known bits test.
9877 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9878 ArrayRef<int> ExpectedMask,
9879 const SelectionDAG &DAG,
9880 SDValue V1 = SDValue(),
9881 SDValue V2 = SDValue()) {
9882 int Size = Mask.size();
9883 if (Size != (int)ExpectedMask.size())
9884 return false;
9885 assert(llvm::all_of(ExpectedMask,
9886 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9887 "Illegal target shuffle mask");
9888
9889 // Check for out-of-range target shuffle mask indices.
9890 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9891 return false;
9892
9893 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9894 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9895 !V1.getValueType().isVector()))
9896 V1 = SDValue();
9897 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9898 !V2.getValueType().isVector()))
9899 V2 = SDValue();
9900
9901 APInt ZeroV1 = APInt::getZero(Size);
9902 APInt ZeroV2 = APInt::getZero(Size);
9903
9904 for (int i = 0; i < Size; ++i) {
9905 int MaskIdx = Mask[i];
9906 int ExpectedIdx = ExpectedMask[i];
9907 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9908 continue;
9909 if (MaskIdx == SM_SentinelZero) {
9910 // If we need this expected index to be a zero element, then update the
9911 // relevant zero mask and perform the known bits at the end to minimize
9912 // repeated computes.
9913 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9914 if (ExpectedV &&
9915 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9916 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9917 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9918 ZeroMask.setBit(BitIdx);
9919 continue;
9920 }
9921 }
9922 if (MaskIdx >= 0) {
9923 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9924 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9925 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9926 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9927 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9928 continue;
9929 }
9930 return false;
9931 }
9932 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9933 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9934}
9935
9936// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9937 // instructions.
9938 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9939 const SelectionDAG &DAG) {
9940 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9941 return false;
9942
9943 SmallVector<int, 8> Unpcklwd;
9944 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9945 /* Unary = */ false);
9946 SmallVector<int, 8> Unpckhwd;
9947 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9948 /* Unary = */ false);
9949 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9950 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9951 return IsUnpackwdMask;
9952}
9953
9954 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9955 const SelectionDAG &DAG) {
9956 // Create 128-bit vector type based on mask size.
9957 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9958 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9959
9960 // We can't assume a canonical shuffle mask, so try the commuted version too.
9961 SmallVector<int, 4> CommutedMask(Mask);
9962 ShuffleVectorSDNode::commuteMask(CommutedMask);
9963
9964 // Match any of unary/binary or low/high.
9965 for (unsigned i = 0; i != 4; ++i) {
9966 SmallVector<int, 16> UnpackMask;
9967 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9968 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9969 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9970 return true;
9971 }
9972 return false;
9973}
9974
9975/// Return true if a shuffle mask chooses elements identically in its top and
9976/// bottom halves. For example, any splat mask has the same top and bottom
9977/// halves. If an element is undefined in only one half of the mask, the halves
9978 /// are not considered identical.
9979 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9980 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9981 unsigned HalfSize = Mask.size() / 2;
9982 for (unsigned i = 0; i != HalfSize; ++i) {
9983 if (Mask[i] != Mask[i + HalfSize])
9984 return false;
9985 }
9986 return true;
9987}
9988
9989/// Get a 4-lane 8-bit shuffle immediate for a mask.
9990///
9991/// This helper function produces an 8-bit shuffle immediate corresponding to
9992/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9993/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9994/// example.
9995///
9996/// NB: We rely heavily on "undef" masks preserving the input lane.
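/// For example, Mask = {1, 0, 3, 2} encodes as (2 << 6) | (3 << 4) | (0 << 2) | 1 = 0xB1.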
9997static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9998 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9999 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10000 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10001 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10002 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10003
10004 // If the mask only uses one non-undef element, then fully 'splat' it to
10005 // improve later broadcast matching.
10006 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10007 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10008
10009 int FirstElt = Mask[FirstIndex];
10010 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10011 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10012
10013 unsigned Imm = 0;
10014 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10015 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10016 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10017 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10018 return Imm;
10019}
10020
10021 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10022 SelectionDAG &DAG) {
10023 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10024}
10025
10026// Canonicalize SHUFPD mask to improve chances of further folding.
10027// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
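// For example, Mask = {1, -1, 1, -1} splats element 1 into Imm = 0b1111, while
// Mask = {-1, 1, 0, -1} keeps the undef slots in place and yields Imm = 0b1010.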
10028static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10029 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10030 "Unexpected SHUFPD mask size");
10031 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10032 "Unexpected SHUFPD mask elements");
10033
10034 // If the mask only uses one non-undef element, then fully 'splat' it to
10035 // improve later broadcast matching.
10036 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10037 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10038 "All undef shuffle mask");
10039
10040 int FirstElt = Mask[FirstIndex];
10041 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10042 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10043 unsigned Imm = 0;
10044 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10045 Imm |= FirstElt << I;
10046 return Imm;
10047 }
10048
10049 // Attempt to keep any undef elements in place to improve chances of the
10050 // shuffle becoming a (commutative) blend.
10051 unsigned Imm = 0;
10052 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10053 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10054
10055 return Imm;
10056}
10057
10058 static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10059 SelectionDAG &DAG) {
10060 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10061}
10062
10063 // The shuffle result takes the form:
10064 // 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending order.
10065 // Each Zeroable element corresponds to a particular Mask element, as
10066 // described in the computeZeroableShuffleElements function.
10067 //
10068 // The function looks for a sub-mask whose nonzero elements are in
10069 // increasing order. If such a sub-mask exists, the function returns true.
10070static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10071 ArrayRef<int> Mask, const EVT &VectorType,
10072 bool &IsZeroSideLeft) {
10073 int NextElement = -1;
10074 // Check if the Mask's nonzero elements are in increasing order.
10075 for (int i = 0, e = Mask.size(); i < e; i++) {
10076 // Checks if the mask's zeros elements are built from only zeros.
10077 assert(Mask[i] >= -1 && "Out of bound mask element!");
10078 if (Mask[i] < 0)
10079 return false;
10080 if (Zeroable[i])
10081 continue;
10082 // Find the lowest non zero element
10083 if (NextElement < 0) {
10084 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10085 IsZeroSideLeft = NextElement != 0;
10086 }
10087 // Exit if the mask's non zero elements are not in increasing order.
10088 if (NextElement != Mask[i])
10089 return false;
10090 NextElement++;
10091 }
10092 return true;
10093}
10094
10095 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10096 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10097 ArrayRef<int> Mask, SDValue V1,
10098 SDValue V2, const APInt &Zeroable,
10099 const X86Subtarget &Subtarget,
10100 SelectionDAG &DAG) {
10101 int Size = Mask.size();
10102 int LaneSize = 128 / VT.getScalarSizeInBits();
10103 const int NumBytes = VT.getSizeInBits() / 8;
10104 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10105
10106 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10107 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10108 (Subtarget.hasBWI() && VT.is512BitVector()));
10109
10110 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10111 // Sign bit set in i8 mask means zero element.
10112 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10113
10114 SDValue V;
10115 for (int i = 0; i < NumBytes; ++i) {
10116 int M = Mask[i / NumEltBytes];
10117 if (M < 0) {
10118 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10119 continue;
10120 }
10121 if (Zeroable[i / NumEltBytes]) {
10122 PSHUFBMask[i] = ZeroMask;
10123 continue;
10124 }
10125
10126 // We can only use a single input of V1 or V2.
10127 SDValue SrcV = (M >= Size ? V2 : V1);
10128 if (V && V != SrcV)
10129 return SDValue();
10130 V = SrcV;
10131 M %= Size;
10132
10133 // PSHUFB can't cross lanes, ensure this doesn't happen.
10134 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10135 return SDValue();
10136
10137 M = M % LaneSize;
10138 M = M * NumEltBytes + (i % NumEltBytes);
10139 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10140 }
10141 assert(V && "Failed to find a source input");
10142
10143 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10144 return DAG.getBitcast(
10145 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10146 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10147}
10148
10149static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10150 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10151 const SDLoc &dl);
10152
10153 // X86 has a dedicated shuffle that can be lowered to VEXPAND
10154 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10155 SDValue V2, ArrayRef<int> Mask,
10156 const APInt &Zeroable,
10157 const X86Subtarget &Subtarget,
10158 SelectionDAG &DAG) {
10159 bool IsLeftZeroSide = true;
10160 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10161 IsLeftZeroSide))
10162 return SDValue();
10163 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10164 MVT IntegerType =
10165 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10166 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10167 unsigned NumElts = VT.getVectorNumElements();
10168 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10169 "Unexpected number of vector elements");
10170 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10171 Subtarget, DAG, DL);
10172 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10173 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10174 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10175}
10176
10177static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10178 unsigned &UnpackOpcode, bool IsUnary,
10179 ArrayRef<int> TargetMask, const SDLoc &DL,
10180 SelectionDAG &DAG,
10181 const X86Subtarget &Subtarget) {
10182 int NumElts = VT.getVectorNumElements();
10183
10184 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10185 for (int i = 0; i != NumElts; i += 2) {
10186 int M1 = TargetMask[i + 0];
10187 int M2 = TargetMask[i + 1];
10188 Undef1 &= (SM_SentinelUndef == M1);
10189 Undef2 &= (SM_SentinelUndef == M2);
10190 Zero1 &= isUndefOrZero(M1);
10191 Zero2 &= isUndefOrZero(M2);
10192 }
10193 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10194 "Zeroable shuffle detected");
10195
10196 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10197 SmallVector<int, 64> Unpckl, Unpckh;
10198 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10199 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10200 (IsUnary ? V1 : V2))) {
10201 UnpackOpcode = X86ISD::UNPCKL;
10202 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10203 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10204 return true;
10205 }
10206
10207 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10208 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10209 (IsUnary ? V1 : V2))) {
10210 UnpackOpcode = X86ISD::UNPCKH;
10211 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10212 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10213 return true;
10214 }
10215
10216 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10217 if (IsUnary && (Zero1 || Zero2)) {
10218 // Don't bother if we can blend instead.
10219 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10220 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10221 return false;
10222
10223 bool MatchLo = true, MatchHi = true;
10224 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10225 int M = TargetMask[i];
10226
10227 // Ignore if the input is known to be zero or the index is undef.
10228 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10229 (M == SM_SentinelUndef))
10230 continue;
10231
10232 MatchLo &= (M == Unpckl[i]);
10233 MatchHi &= (M == Unpckh[i]);
10234 }
10235
10236 if (MatchLo || MatchHi) {
10237 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10238 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10239 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10240 return true;
10241 }
10242 }
10243
10244 // If a binary shuffle, commute and try again.
10245 if (!IsUnary) {
10246 ShuffleVectorSDNode::commuteMask(Unpckl);
10247 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10248 UnpackOpcode = X86ISD::UNPCKL;
10249 std::swap(V1, V2);
10250 return true;
10251 }
10252
10253 ShuffleVectorSDNode::commuteMask(Unpckh);
10254 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10255 UnpackOpcode = X86ISD::UNPCKH;
10256 std::swap(V1, V2);
10257 return true;
10258 }
10259 }
10260
10261 return false;
10262}
10263
10264// X86 has dedicated unpack instructions that can handle specific blend
10265 // operations: UNPCKH and UNPCKL.
10266 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10266 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10267 SDValue V2, ArrayRef<int> Mask,
10268 SelectionDAG &DAG) {
10269 SmallVector<int, 8> Unpckl;
10270 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10271 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10272 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10273
10274 SmallVector<int, 8> Unpckh;
10275 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10276 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10277 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10278
10279 // Commute and try again.
10280 ShuffleVectorSDNode::commuteMask(Unpckl);
10281 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10282 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10283
10284 ShuffleVectorSDNode::commuteMask(Unpckh);
10285 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10286 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10287
10288 return SDValue();
10289}
10290
10291/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10292/// followed by unpack 256-bit.
10293 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10294 SDValue V2, ArrayRef<int> Mask,
10295 SelectionDAG &DAG) {
10296 SmallVector<int, 32> Unpckl, Unpckh;
10297 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10298 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10299
10300 unsigned UnpackOpcode;
10301 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10302 UnpackOpcode = X86ISD::UNPCKL;
10303 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10304 UnpackOpcode = X86ISD::UNPCKH;
10305 else
10306 return SDValue();
10307
10308 // This is a "natural" unpack operation (rather than the 128-bit sectored
10309 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10310 // input in order to use the x86 instruction.
10311 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10312 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10313 V1 = DAG.getBitcast(VT, V1);
10314 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10315}
10316
10317// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10318// source into the lower elements and zeroing the upper elements.
10319static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10320 ArrayRef<int> Mask, const APInt &Zeroable,
10321 const X86Subtarget &Subtarget) {
10322 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10323 return false;
10324
10325 unsigned NumElts = Mask.size();
10326 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10327 unsigned MaxScale = 64 / EltSizeInBits;
10328
10329 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10330 unsigned SrcEltBits = EltSizeInBits * Scale;
10331 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10332 continue;
10333 unsigned NumSrcElts = NumElts / Scale;
10334 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10335 continue;
10336 unsigned UpperElts = NumElts - NumSrcElts;
10337 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10338 continue;
10339 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10340 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10341 DstVT = MVT::getIntegerVT(EltSizeInBits);
10342 if ((NumSrcElts * EltSizeInBits) >= 128) {
10343 // ISD::TRUNCATE
10344 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10345 } else {
10346 // X86ISD::VTRUNC
10347 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10348 }
10349 return true;
10350 }
10351
10352 return false;
10353}
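// Worked example of the matching above (illustrative only): for VT = v16i8
// and Mask = <0,2,4,6,8,10,12,14,z,z,z,z,z,z,z,z> (z = known zero), Scale=2
// gives SrcVT = v8i16. Only 64 bits of results are produced, so DstVT is the
// 128-bit v16i8 form and the X86ISD::VTRUNC path is used; the 16->8-bit step
// additionally requires AVX512BW.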
10354
10355// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10356// element padding to the final DstVT.
10357static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10358 const X86Subtarget &Subtarget,
10359 SelectionDAG &DAG, bool ZeroUppers) {
10360 MVT SrcVT = Src.getSimpleValueType();
10361 MVT DstSVT = DstVT.getScalarType();
10362 unsigned NumDstElts = DstVT.getVectorNumElements();
10363 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10364 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10365
10366 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10367 return SDValue();
10368
10369 // Perform a direct ISD::TRUNCATE if possible.
10370 if (NumSrcElts == NumDstElts)
10371 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10372
10373 if (NumSrcElts > NumDstElts) {
10374 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10375 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10376 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10377 }
10378
10379 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10380 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10381 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10382 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10383 DstVT.getSizeInBits());
10384 }
10385
10386 // Non-VLX targets must truncate from a 512-bit type, so we need to
10387 // widen, truncate and then possibly extract the original subvector.
10388 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10389 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10390 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10391 }
10392
10393 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10394 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10395 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10396 if (DstVT != TruncVT)
10397 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10398 DstVT.getSizeInBits());
10399 return Trunc;
10400}
10401
10402// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10403//
10404// An example is the following:
10405//
10406// t0: ch = EntryToken
10407// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10408// t25: v4i32 = truncate t2
10409// t41: v8i16 = bitcast t25
10410// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10411// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10412// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10413// t18: v2i64 = bitcast t51
10414//
10415 // One can just use a single vpmovdw instruction; without avx512vl we need to
10416// use the zmm variant and extract the lower subvector, padding with zeroes.
10417// TODO: Merge with lowerShuffleAsVTRUNC.
10418 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10419 SDValue V2, ArrayRef<int> Mask,
10420 const APInt &Zeroable,
10421 const X86Subtarget &Subtarget,
10422 SelectionDAG &DAG) {
10423 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10424 if (!Subtarget.hasAVX512())
10425 return SDValue();
10426
10427 unsigned NumElts = VT.getVectorNumElements();
10428 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10429 unsigned MaxScale = 64 / EltSizeInBits;
10430 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10431 unsigned SrcEltBits = EltSizeInBits * Scale;
10432 unsigned NumSrcElts = NumElts / Scale;
10433 unsigned UpperElts = NumElts - NumSrcElts;
10434 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10435 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10436 continue;
10437
10438 // Attempt to find a matching source truncation, but as a fallback VLX
10439 // cases can use the VPMOV directly.
10440 SDValue Src = peekThroughBitcasts(V1);
10441 if (Src.getOpcode() == ISD::TRUNCATE &&
10442 Src.getScalarValueSizeInBits() == SrcEltBits) {
10443 Src = Src.getOperand(0);
10444 } else if (Subtarget.hasVLX()) {
10445 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10446 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10447 Src = DAG.getBitcast(SrcVT, Src);
10448 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10449 if (Scale == 2 &&
10450 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10451 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10452 return SDValue();
10453 } else
10454 return SDValue();
10455
10456 // VPMOVWB is only available with avx512bw.
10457 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10458 return SDValue();
10459
10460 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10461 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10462 }
10463
10464 return SDValue();
10465}
10466
10467// Attempt to match binary shuffle patterns as a truncate.
10468 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10469 SDValue V2, ArrayRef<int> Mask,
10470 const APInt &Zeroable,
10471 const X86Subtarget &Subtarget,
10472 SelectionDAG &DAG) {
10473 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10474 "Unexpected VTRUNC type");
10475 if (!Subtarget.hasAVX512())
10476 return SDValue();
10477
10478 unsigned NumElts = VT.getVectorNumElements();
10479 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10480 unsigned MaxScale = 64 / EltSizeInBits;
10481 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10482 // TODO: Support non-BWI VPMOVWB truncations?
10483 unsigned SrcEltBits = EltSizeInBits * Scale;
10484 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10485 continue;
10486
10487 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10488 // Bail if the V2 elements are undef.
10489 unsigned NumHalfSrcElts = NumElts / Scale;
10490 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10491 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10492 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10493 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10494 continue;
10495
10496 // The elements beyond the truncation must be undef/zero.
10497 unsigned UpperElts = NumElts - NumSrcElts;
10498 if (UpperElts > 0 &&
10499 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10500 continue;
10501 bool UndefUppers =
10502 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10503
10504 // For offset truncations, ensure that the concat is cheap.
10505 if (Offset) {
10506 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10507 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10508 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10509 return Lo.getOperand(0) == Hi.getOperand(0);
10510 if (ISD::isNormalLoad(Lo.getNode()) &&
10511 ISD::isNormalLoad(Hi.getNode())) {
10512 auto *LDLo = cast<LoadSDNode>(Lo);
10513 auto *LDHi = cast<LoadSDNode>(Hi);
10514 return DAG.areNonVolatileConsecutiveLoads(
10515 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10516 }
10517 return false;
10518 };
10519 if (!IsCheapConcat(peekThroughBitcasts(V1), peekThroughBitcasts(V2)))
10520 continue;
10521 }
10522
10523 // As we're using both sources, we need to concat them together
10524 // and truncate from the double-sized src.
10525 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10526 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10527
10528 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10529 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10530 Src = DAG.getBitcast(SrcVT, Src);
10531
10532 // Shift the offset'd elements into place for the truncation.
10533 // TODO: Use getTargetVShiftByConstNode.
10534 if (Offset)
10535 Src = DAG.getNode(
10536 X86ISD::VSRLI, DL, SrcVT, Src,
10537 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10538
10539 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10540 }
10541 }
10542
10543 return SDValue();
10544}
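// Worked example of an offset truncation (illustrative only, and assuming
// V1/V2 form a cheap concat, e.g. the two halves of one wider vector): for
// VT = v8i16 and Mask = <1,3,5,7,9,11,13,15>, Scale=2 and Offset=1 match.
// V1 and V2 are concatenated to v16i16, bitcast to v8i32, shifted right by
// 16 bits with X86ISD::VSRLI, and then truncated back to v8i16.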
10545
10546/// Check whether a compaction lowering can be done by dropping even/odd
10547/// elements and compute how many times even/odd elements must be dropped.
10548///
10549/// This handles shuffles which take every Nth element where N is a power of
10550/// two. Example shuffle masks:
10551///
10552/// (even)
10553/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10554/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10555/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10556/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10557/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10558/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10559///
10560/// (odd)
10561/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10562/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10563///
10564/// Any of these lanes can of course be undef.
10565///
10566/// This routine only supports N <= 3.
10567/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10568/// for larger N.
10569///
10570/// \returns N above, or the number of times even/odd elements must be dropped
10571/// if there is such a number. Otherwise returns zero.
10572static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10573 bool IsSingleInput) {
10574 // The modulus for the shuffle vector entries is based on whether this is
10575 // a single input or not.
10576 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10577 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10578 "We should only be called with masks with a power-of-2 size!");
10579
10580 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10581 int Offset = MatchEven ? 0 : 1;
10582
10583 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10584 // and 2^3 simultaneously. This is because we may have ambiguity with
10585 // partially undef inputs.
10586 bool ViableForN[3] = {true, true, true};
10587
10588 for (int i = 0, e = Mask.size(); i < e; ++i) {
10589 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10590 // want.
10591 if (Mask[i] < 0)
10592 continue;
10593
10594 bool IsAnyViable = false;
10595 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10596 if (ViableForN[j]) {
10597 uint64_t N = j + 1;
10598
10599 // The shuffle mask entry, minus the even/odd offset, must equal (i * 2^N) % M.
10600 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10601 IsAnyViable = true;
10602 else
10603 ViableForN[j] = false;
10604 }
10605 // Early exit if we exhaust the possible powers of two.
10606 if (!IsAnyViable)
10607 break;
10608 }
10609
10610 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10611 if (ViableForN[j])
10612 return j + 1;
10613
10614 // Return 0 as there is no viable power of two.
10615 return 0;
10616}
10617
10618// X86 has dedicated pack instructions that can handle specific truncation
10619// operations: PACKSS and PACKUS.
10620// Checks for compaction shuffle masks if MaxStages > 1.
10621// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10622static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10623 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10624 const SelectionDAG &DAG,
10625 const X86Subtarget &Subtarget,
10626 unsigned MaxStages = 1) {
10627 unsigned NumElts = VT.getVectorNumElements();
10628 unsigned BitSize = VT.getScalarSizeInBits();
10629 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10630 "Illegal maximum compaction");
10631
10632 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10633 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10634 unsigned NumPackedBits = NumSrcBits - BitSize;
10635 N1 = peekThroughBitcasts(N1);
10636 N2 = peekThroughBitcasts(N2);
10637 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10638 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10639 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10640 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10641 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10642 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10643 return false;
10644 if (Subtarget.hasSSE41() || BitSize == 8) {
10645 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10646 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10647 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10648 V1 = N1;
10649 V2 = N2;
10650 SrcVT = PackVT;
10651 PackOpcode = X86ISD::PACKUS;
10652 return true;
10653 }
10654 }
10655 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10656 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10657 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10658 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10659 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10660 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10661 V1 = N1;
10662 V2 = N2;
10663 SrcVT = PackVT;
10664 PackOpcode = X86ISD::PACKSS;
10665 return true;
10666 }
10667 return false;
10668 };
10669
10670 // Attempt to match against wider and wider compaction patterns.
10671 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10672 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10673 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10674
10675 // Try binary shuffle.
10676 SmallVector<int, 32> BinaryMask;
10677 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10678 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10679 if (MatchPACK(V1, V2, PackVT))
10680 return true;
10681
10682 // Try unary shuffle.
10683 SmallVector<int, 32> UnaryMask;
10684 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10685 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10686 if (MatchPACK(V1, V1, PackVT))
10687 return true;
10688 }
10689
10690 return false;
10691}
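// For illustration: a v16i8 compaction mask <0,2,4,...,30> matches the binary
// pattern with PackVT = v8i16. If the high byte of every i16 element of both
// inputs is known zero the match is PACKUS; if instead each element has at
// least 9 sign bits (i.e. fits in a signed i8) the match is PACKSS.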
10692
10693 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10694 SDValue V2, ArrayRef<int> Mask,
10695 const X86Subtarget &Subtarget,
10696 SelectionDAG &DAG) {
10697 MVT PackVT;
10698 unsigned PackOpcode;
10699 unsigned SizeBits = VT.getSizeInBits();
10700 unsigned EltBits = VT.getScalarSizeInBits();
10701 unsigned MaxStages = Log2_32(64 / EltBits);
10702 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10703 Subtarget, MaxStages))
10704 return SDValue();
10705
10706 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10707 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10708
10709 // Don't lower multi-stage packs on AVX512, truncation is better.
10710 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10711 return SDValue();
10712
10713 // Pack to the largest type possible:
10714 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10715 unsigned MaxPackBits = 16;
10716 if (CurrentEltBits > 16 &&
10717 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10718 MaxPackBits = 32;
10719
10720 // Repeatedly pack down to the target size.
10721 SDValue Res;
10722 for (unsigned i = 0; i != NumStages; ++i) {
10723 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10724 unsigned NumSrcElts = SizeBits / SrcEltBits;
10725 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10726 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10727 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10728 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10729 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10730 DAG.getBitcast(SrcVT, V2));
10731 V1 = V2 = Res;
10732 CurrentEltBits /= 2;
10733 }
10734 assert(Res && Res.getValueType() == VT &&
10735 "Failed to lower compaction shuffle");
10736 return Res;
10737}
10738
10739/// Try to emit a bitmask instruction for a shuffle.
10740///
10741/// This handles cases where we can model a blend exactly as a bitmask due to
10742/// one of the inputs being zeroable.
10743 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10744 SDValue V2, ArrayRef<int> Mask,
10745 const APInt &Zeroable,
10746 const X86Subtarget &Subtarget,
10747 SelectionDAG &DAG) {
10748 MVT MaskVT = VT;
10749 MVT EltVT = VT.getVectorElementType();
10750 SDValue Zero, AllOnes;
10751 // Use f64 if i64 isn't legal.
10752 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10753 EltVT = MVT::f64;
10754 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10755 }
10756
10757 MVT LogicVT = VT;
10758 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10759 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10760 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
10761 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10762 LogicVT =
10763 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10764 } else {
10765 Zero = DAG.getConstant(0, DL, EltVT);
10766 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10767 }
10768
10769 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10770 SDValue V;
10771 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10772 if (Zeroable[i])
10773 continue;
10774 if (Mask[i] % Size != i)
10775 return SDValue(); // Not a blend.
10776 if (!V)
10777 V = Mask[i] < Size ? V1 : V2;
10778 else if (V != (Mask[i] < Size ? V1 : V2))
10779 return SDValue(); // Can only let one input through the mask.
10780
10781 VMaskOps[i] = AllOnes;
10782 }
10783 if (!V)
10784 return SDValue(); // No non-zeroable elements!
10785
10786 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10787 VMask = DAG.getBitcast(LogicVT, VMask);
10788 V = DAG.getBitcast(LogicVT, V);
10789 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10790 return DAG.getBitcast(VT, And);
10791}
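// For illustration: with VT = v4i32, Mask = <0,5,2,7> and V2 an all-zeros
// build vector (so elements 1 and 3 are zeroable), this lowers to
// AND(V1, <-1,0,-1,0>), i.e. a single bitmask instead of a blend.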
10792
10793/// Try to emit a blend instruction for a shuffle using bit math.
10794///
10795/// This is used as a fallback approach when first class blend instructions are
10796/// unavailable. Currently it is only suitable for integer vectors, but could
10797/// be generalized for floating point vectors if desirable.
10798 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10799 SDValue V2, ArrayRef<int> Mask,
10800 SelectionDAG &DAG) {
10801 assert(VT.isInteger() && "Only supports integer vector types!");
10802 MVT EltVT = VT.getVectorElementType();
10803 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10804 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10805 SmallVector<SDValue, 16> MaskOps;
10806 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10807 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10808 return SDValue(); // Shuffled input!
10809 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10810 }
10811
10812 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10813 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10814}
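// For illustration: with VT = v4i32 and Mask = <4,1,6,3>, MaskOps becomes
// <0,-1,0,-1>, so the bit-select takes V1 bits where the mask is all-ones
// (elements 1 and 3) and V2 bits elsewhere (elements 0 and 2).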
10815
10816 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10817 SDValue PreservedSrc,
10818 const X86Subtarget &Subtarget,
10819 SelectionDAG &DAG);
10820
10821 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10822 MutableArrayRef<int> Mask,
10823 const APInt &Zeroable, bool &ForceV1Zero,
10824 bool &ForceV2Zero, uint64_t &BlendMask) {
10825 bool V1IsZeroOrUndef =
10826 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10827 bool V2IsZeroOrUndef =
10828 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10829
10830 BlendMask = 0;
10831 ForceV1Zero = false, ForceV2Zero = false;
10832 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10833
10834 int NumElts = Mask.size();
10835 int NumLanes = VT.getSizeInBits() / 128;
10836 int NumEltsPerLane = NumElts / NumLanes;
10837 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10838
10839 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10840 // then ensure the blend mask part for that lane just references that input.
10841 bool ForceWholeLaneMasks =
10842 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10843
10844 // Attempt to generate the binary blend mask. If an input is zero then
10845 // we can use any lane.
10846 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10847 // Keep track of the inputs used per lane.
10848 bool LaneV1InUse = false;
10849 bool LaneV2InUse = false;
10850 uint64_t LaneBlendMask = 0;
10851 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10852 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10853 int M = Mask[Elt];
10854 if (M == SM_SentinelUndef)
10855 continue;
10856 if (M == Elt || (0 <= M && M < NumElts &&
10857 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10858 Mask[Elt] = Elt;
10859 LaneV1InUse = true;
10860 continue;
10861 }
10862 if (M == (Elt + NumElts) ||
10863 (NumElts <= M &&
10864 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10865 LaneBlendMask |= 1ull << LaneElt;
10866 Mask[Elt] = Elt + NumElts;
10867 LaneV2InUse = true;
10868 continue;
10869 }
10870 if (Zeroable[Elt]) {
10871 if (V1IsZeroOrUndef) {
10872 ForceV1Zero = true;
10873 Mask[Elt] = Elt;
10874 LaneV1InUse = true;
10875 continue;
10876 }
10877 if (V2IsZeroOrUndef) {
10878 ForceV2Zero = true;
10879 LaneBlendMask |= 1ull << LaneElt;
10880 Mask[Elt] = Elt + NumElts;
10881 LaneV2InUse = true;
10882 continue;
10883 }
10884 }
10885 return false;
10886 }
10887
10888 // If we only used V2 then splat the lane blend mask to avoid any demanded
10889 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10890 // blend mask bit).
10891 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10892 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10893
10894 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10895 }
10896 return true;
10897}
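// Worked example (illustrative only): for VT = v8i16 and
// Mask = <0,9,2,11,4,13,6,15>, the odd elements come from V2, so the
// resulting BlendMask is 0b10101010 (0xAA) - the immediate that
// lowerShuffleAsBlend below feeds to X86ISD::BLENDI/PBLENDW.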
10898
10899/// Try to emit a blend instruction for a shuffle.
10900///
10901/// This doesn't do any checks for the availability of instructions for blending
10902/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10903/// be matched in the backend with the type given. What it does check for is
10904/// that the shuffle mask is a blend, or convertible into a blend with zero.
10905 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10906 SDValue V2, ArrayRef<int> Original,
10907 const APInt &Zeroable,
10908 const X86Subtarget &Subtarget,
10909 SelectionDAG &DAG) {
10910 uint64_t BlendMask = 0;
10911 bool ForceV1Zero = false, ForceV2Zero = false;
10912 SmallVector<int, 64> Mask(Original);
10913 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10914 BlendMask))
10915 return SDValue();
10916
10917 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10918 if (ForceV1Zero)
10919 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10920 if (ForceV2Zero)
10921 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10922
10923 unsigned NumElts = VT.getVectorNumElements();
10924
10925 switch (VT.SimpleTy) {
10926 case MVT::v4i64:
10927 case MVT::v8i32:
10928 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10929 [[fallthrough]];
10930 case MVT::v4f64:
10931 case MVT::v8f32:
10932 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10933 [[fallthrough]];
10934 case MVT::v2f64:
10935 case MVT::v2i64:
10936 case MVT::v4f32:
10937 case MVT::v4i32:
10938 case MVT::v8i16:
10939 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10940 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10941 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10942 case MVT::v16i16: {
10943 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10944 SmallVector<int, 8> RepeatedMask;
10945 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10946 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10947 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10948 BlendMask = 0;
10949 for (int i = 0; i < 8; ++i)
10950 if (RepeatedMask[i] >= 8)
10951 BlendMask |= 1ull << i;
10952 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10953 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10954 }
10955 // Use PBLENDW for lower/upper lanes and then blend lanes.
10956 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10957 // merge to VSELECT where useful.
10958 uint64_t LoMask = BlendMask & 0xFF;
10959 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10960 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10961 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10962 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10963 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10964 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10965 return DAG.getVectorShuffle(
10966 MVT::v16i16, DL, Lo, Hi,
10967 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10968 }
10969 [[fallthrough]];
10970 }
10971 case MVT::v32i8:
10972 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10973 [[fallthrough]];
10974 case MVT::v16i8: {
10975 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10976
10977 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10978 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10979 Subtarget, DAG))
10980 return Masked;
10981
10982 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10983 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10984 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10985 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10986 }
10987
10988 // If we have VPTERNLOG, we can use that as a bit blend.
10989 if (Subtarget.hasVLX())
10990 if (SDValue BitBlend =
10991 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10992 return BitBlend;
10993
10994 // Scale the blend by the number of bytes per element.
10995 int Scale = VT.getScalarSizeInBits() / 8;
10996
10997 // This form of blend is always done on bytes. Compute the byte vector
10998 // type.
10999 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11000
11001 // x86 allows load folding with blendvb from the 2nd source operand. But
11002 // we are still using LLVM select here (see comment below), so that's V1.
11003 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11004 // allow that load-folding possibility.
11005 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11006 ShuffleVectorSDNode::commuteMask(Mask);
11007 std::swap(V1, V2);
11008 }
11009
11010 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11011 // mix of LLVM's code generator and the x86 backend. We tell the code
11012 // generator that boolean values in the elements of an x86 vector register
11013 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11014 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11015 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11016 // of the element (the remaining are ignored) and 0 in that high bit would
11017 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11018 // the LLVM model for boolean values in vector elements gets the relevant
11019 // bit set, it is set backwards and over constrained relative to x86's
11020 // actual model.
11021 SmallVector<SDValue, 32> VSELECTMask;
11022 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11023 for (int j = 0; j < Scale; ++j)
11024 VSELECTMask.push_back(
11025 Mask[i] < 0
11026 ? DAG.getUNDEF(MVT::i8)
11027 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11028
11029 V1 = DAG.getBitcast(BlendVT, V1);
11030 V2 = DAG.getBitcast(BlendVT, V2);
11031 return DAG.getBitcast(
11032 VT,
11033 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11034 V1, V2));
11035 }
11036 case MVT::v16f32:
11037 case MVT::v8f64:
11038 case MVT::v8i64:
11039 case MVT::v16i32:
11040 case MVT::v32i16:
11041 case MVT::v64i8: {
11042 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11043 bool OptForSize = DAG.shouldOptForSize();
11044 if (!OptForSize) {
11045 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11046 Subtarget, DAG))
11047 return Masked;
11048 }
11049
11050 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11051 // masked move.
11052 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11053 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11054 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11055 }
11056 default:
11057 llvm_unreachable("Not a supported integer vector type!");
11058 }
11059}
11060
11061/// Try to lower as a blend of elements from two inputs followed by
11062/// a single-input permutation.
11063///
11064/// This matches the pattern where we can blend elements from two inputs and
11065/// then reduce the shuffle to a single-input permutation.
11066 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11067 SDValue V1, SDValue V2,
11068 ArrayRef<int> Mask,
11069 SelectionDAG &DAG,
11070 bool ImmBlends = false) {
11071 // We build up the blend mask while checking whether a blend is a viable way
11072 // to reduce the shuffle.
11073 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11074 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11075
11076 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11077 if (Mask[i] < 0)
11078 continue;
11079
11080 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11081
11082 if (BlendMask[Mask[i] % Size] < 0)
11083 BlendMask[Mask[i] % Size] = Mask[i];
11084 else if (BlendMask[Mask[i] % Size] != Mask[i])
11085 return SDValue(); // Can't blend in the needed input!
11086
11087 PermuteMask[i] = Mask[i] % Size;
11088 }
11089
11090 // If only immediate blends, then bail if the blend mask can't be widened to
11091 // i16.
11092 unsigned EltSize = VT.getScalarSizeInBits();
11093 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11094 return SDValue();
11095
11096 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11097 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11098}
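// Worked example (illustrative only): for VT = v4i32 and Mask = <1,4,3,6>,
// the blend mask becomes <4,1,6,3> (a legal immediate blend) and the permute
// mask becomes <1,0,3,2>, so the shuffle is lowered as
// permute<1,0,3,2>(blend(V1, V2)).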
11099
11100/// Try to lower as an unpack of elements from two inputs followed by
11101/// a single-input permutation.
11102///
11103/// This matches the pattern where we can unpack elements from two inputs and
11104/// then reduce the shuffle to a single-input (wider) permutation.
11105 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11106 SDValue V1, SDValue V2,
11107 ArrayRef<int> Mask,
11108 SelectionDAG &DAG) {
11109 int NumElts = Mask.size();
11110 int NumLanes = VT.getSizeInBits() / 128;
11111 int NumLaneElts = NumElts / NumLanes;
11112 int NumHalfLaneElts = NumLaneElts / 2;
11113
11114 bool MatchLo = true, MatchHi = true;
11115 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11116
11117 // Determine UNPCKL/UNPCKH type and operand order.
11118 for (int Elt = 0; Elt != NumElts; ++Elt) {
11119 int M = Mask[Elt];
11120 if (M < 0)
11121 continue;
11122
11123 // Normalize the mask value depending on whether it's V1 or V2.
11124 int NormM = M;
11125 SDValue &Op = Ops[Elt & 1];
11126 if (M < NumElts && (Op.isUndef() || Op == V1))
11127 Op = V1;
11128 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11129 Op = V2;
11130 NormM -= NumElts;
11131 } else
11132 return SDValue();
11133
11134 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11135 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11136 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11137 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11138 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11139 if (MatchLoAnyLane || MatchHiAnyLane) {
11140 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11141 "Failed to match UNPCKLO/UNPCKHI");
11142 break;
11143 }
11144 }
11145 MatchLo &= MatchLoAnyLane;
11146 MatchHi &= MatchHiAnyLane;
11147 if (!MatchLo && !MatchHi)
11148 return SDValue();
11149 }
11150 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11151
11152 // Element indices have changed after unpacking. Calculate permute mask
11153 // so that they will be put back to the position as dictated by the
11154 // original shuffle mask indices.
11155 SmallVector<int, 32> PermuteMask(NumElts, -1);
11156 for (int Elt = 0; Elt != NumElts; ++Elt) {
11157 int M = Mask[Elt];
11158 if (M < 0)
11159 continue;
11160 int NormM = M;
11161 if (NumElts <= M)
11162 NormM -= NumElts;
11163 bool IsFirstOp = M < NumElts;
11164 int BaseMaskElt =
11165 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11166 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11167 PermuteMask[Elt] = BaseMaskElt;
11168 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11169 PermuteMask[Elt] = BaseMaskElt + 1;
11170 assert(PermuteMask[Elt] != -1 &&
11171 "Input mask element is defined but failed to assign permute mask");
11172 }
11173
11174 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11175 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11176 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11177}
11178
11179/// Try to lower a shuffle as a permute of the inputs followed by an
11180/// UNPCK instruction.
11181///
11182/// This specifically targets cases where we end up with alternating between
11183/// the two inputs, and so can permute them into something that feeds a single
11184/// UNPCK instruction. Note that this routine only targets integer vectors
11185/// because for floating point vectors we have a generalized SHUFPS lowering
11186/// strategy that handles everything that doesn't *exactly* match an unpack,
11187/// making this clever lowering unnecessary.
11188 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11189 SDValue V1, SDValue V2,
11190 ArrayRef<int> Mask,
11191 const X86Subtarget &Subtarget,
11192 SelectionDAG &DAG) {
11193 int Size = Mask.size();
11194 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11195
11196 // This routine only supports 128-bit integer dual input vectors.
11197 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11198 return SDValue();
11199
11200 int NumLoInputs =
11201 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11202 int NumHiInputs =
11203 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11204
11205 bool UnpackLo = NumLoInputs >= NumHiInputs;
11206
11207 auto TryUnpack = [&](int ScalarSize, int Scale) {
11208 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11209 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11210
11211 for (int i = 0; i < Size; ++i) {
11212 if (Mask[i] < 0)
11213 continue;
11214
11215 // Each element of the unpack contains Scale elements from this mask.
11216 int UnpackIdx = i / Scale;
11217
11218 // We only handle the case where V1 feeds the first slots of the unpack.
11219 // We rely on canonicalization to ensure this is the case.
11220 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11221 return SDValue();
11222
11223 // Setup the mask for this input. The indexing is tricky as we have to
11224 // handle the unpack stride.
11225 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11226 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11227 Mask[i] % Size;
11228 }
11229
11230 // If we will have to shuffle both inputs to use the unpack, check whether
11231 // we can just unpack first and shuffle the result. If so, skip this unpack.
11232 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11233 !isNoopShuffleMask(V2Mask))
11234 return SDValue();
11235
11236 // Shuffle the inputs into place.
11237 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11238 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11239
11240 // Cast the inputs to the type we will use to unpack them.
11241 MVT UnpackVT =
11242 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11243 V1 = DAG.getBitcast(UnpackVT, V1);
11244 V2 = DAG.getBitcast(UnpackVT, V2);
11245
11246 // Unpack the inputs and cast the result back to the desired type.
11247 return DAG.getBitcast(
11248 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11249 UnpackVT, V1, V2));
11250 };
11251
11252 // We try each unpack from the largest to the smallest to try and find one
11253 // that fits this mask.
11254 int OrigScalarSize = VT.getScalarSizeInBits();
11255 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11256 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11257 return Unpack;
11258
11259 // If we're shuffling with a zero vector then we're better off not doing
11260 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11261 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11262 ISD::isBuildVectorAllZeros(V2.getNode()))
11263 return SDValue();
11264
11265 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11266 // initial unpack.
11267 if (NumLoInputs == 0 || NumHiInputs == 0) {
11268 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11269 "We have to have *some* inputs!");
11270 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11271
11272 // FIXME: We could consider the total complexity of the permute of each
11273 // possible unpacking. Or at the least we should consider how many
11274 // half-crossings are created.
11275 // FIXME: We could consider commuting the unpacks.
11276
11277 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11278 for (int i = 0; i < Size; ++i) {
11279 if (Mask[i] < 0)
11280 continue;
11281
11282 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11283
11284 PermMask[i] =
11285 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11286 }
11287 return DAG.getVectorShuffle(
11288 VT, DL,
11289 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11290 V1, V2),
11291 DAG.getUNDEF(VT), PermMask);
11292 }
11293
11294 return SDValue();
11295}
11296
11297/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11298/// permuting the elements of the result in place.
11299 static SDValue lowerShuffleAsByteRotateAndPermute(
11300 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11301 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11302 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11303 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11304 (VT.is512BitVector() && !Subtarget.hasBWI()))
11305 return SDValue();
11306
11307 // We don't currently support lane crossing permutes.
11308 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11309 return SDValue();
11310
11311 int Scale = VT.getScalarSizeInBits() / 8;
11312 int NumLanes = VT.getSizeInBits() / 128;
11313 int NumElts = VT.getVectorNumElements();
11314 int NumEltsPerLane = NumElts / NumLanes;
11315
11316 // Determine range of mask elts.
11317 bool Blend1 = true;
11318 bool Blend2 = true;
11319 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11320 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11321 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11322 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11323 int M = Mask[Lane + Elt];
11324 if (M < 0)
11325 continue;
11326 if (M < NumElts) {
11327 Blend1 &= (M == (Lane + Elt));
11328 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11329 M = M % NumEltsPerLane;
11330 Range1.first = std::min(Range1.first, M);
11331 Range1.second = std::max(Range1.second, M);
11332 } else {
11333 M -= NumElts;
11334 Blend2 &= (M == (Lane + Elt));
11335 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11336 M = M % NumEltsPerLane;
11337 Range2.first = std::min(Range2.first, M);
11338 Range2.second = std::max(Range2.second, M);
11339 }
11340 }
11341 }
11342
11343 // Bail if we don't need both elements.
11344 // TODO - it might be worth doing this for unary shuffles if the permute
11345 // can be widened.
11346 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11347 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11348 return SDValue();
11349
11350 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11351 return SDValue();
11352
11353 // Rotate the 2 ops so we can access both ranges, then permute the result.
11354 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11355 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11356 SDValue Rotate = DAG.getBitcast(
11357 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11358 DAG.getBitcast(ByteVT, Lo),
11359 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11360 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11361 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11362 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11363 int M = Mask[Lane + Elt];
11364 if (M < 0)
11365 continue;
11366 if (M < NumElts)
11367 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11368 else
11369 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11370 }
11371 }
11372 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11373 };
11374
11375 // Check if the ranges are small enough to rotate from either direction.
11376 if (Range2.second < Range1.first)
11377 return RotateAndPermute(V1, V2, Range1.first, 0);
11378 if (Range1.second < Range2.first)
11379 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11380 return SDValue();
11381}
11382
11383 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11384 return isUndefOrEqual(Mask, 0);
11385}
11386
11387 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11388 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11389}
11390
11391/// Check if the Mask consists of the same element repeated multiple times.
11392 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11393 size_t NumUndefs = 0;
11394 std::optional<int> UniqueElt;
11395 for (int Elt : Mask) {
11396 if (Elt == SM_SentinelUndef) {
11397 NumUndefs++;
11398 continue;
11399 }
11400 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11401 return false;
11402 UniqueElt = Elt;
11403 }
11404 // Make sure the element is repeated enough times by checking the number of
11405 // undefs is small.
11406 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11407}
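// For illustration: <5,5,u,5,5,u,5,5> returns true (one unique element, few
// undefs), while <u,u,u,u,u,3,u,u> returns false because more than half the
// elements are undef.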
11408
11409/// Generic routine to decompose a shuffle and blend into independent
11410/// blends and permutes.
11411///
11412/// This matches the extremely common pattern for handling combined
11413/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11414/// operations. It will try to pick the best arrangement of shuffles and
11415/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11416 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11417 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11418 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11419 int NumElts = Mask.size();
11420 int NumLanes = VT.getSizeInBits() / 128;
11421 int NumEltsPerLane = NumElts / NumLanes;
11422
11423 // Shuffle the input elements into the desired positions in V1 and V2 and
11424 // unpack/blend them together.
11425 bool IsAlternating = true;
11426 bool V1Zero = true, V2Zero = true;
11427 SmallVector<int, 32> V1Mask(NumElts, -1);
11428 SmallVector<int, 32> V2Mask(NumElts, -1);
11429 SmallVector<int, 32> FinalMask(NumElts, -1);
11430 for (int i = 0; i < NumElts; ++i) {
11431 int M = Mask[i];
11432 if (M >= 0 && M < NumElts) {
11433 V1Mask[i] = M;
11434 FinalMask[i] = i;
11435 V1Zero &= Zeroable[i];
11436 IsAlternating &= (i & 1) == 0;
11437 } else if (M >= NumElts) {
11438 V2Mask[i] = M - NumElts;
11439 FinalMask[i] = i + NumElts;
11440 V2Zero &= Zeroable[i];
11441 IsAlternating &= (i & 1) == 1;
11442 }
11443 }
11444
11445 // If we effectively only demand the 0'th element of \p Input (though not
11446 // necessarily only at the 0'th position), then broadcast said input
11447 // and change \p InputMask to be a no-op (identity) mask.
11448 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11449 &DAG](SDValue &Input,
11450 MutableArrayRef<int> InputMask) {
11451 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11452 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11453 !X86::mayFoldLoad(Input, Subtarget)))
11454 return;
11455 if (isNoopShuffleMask(InputMask))
11456 return;
11457 assert(isBroadcastShuffleMask(InputMask) &&
11458 "Expected to demand only the 0'th element.");
11459 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11460 for (auto I : enumerate(InputMask)) {
11461 int &InputMaskElt = I.value();
11462 if (InputMaskElt >= 0)
11463 InputMaskElt = I.index();
11464 }
11465 };
11466
11467 // Currently, we may need to produce one shuffle per input, and blend results.
11468 // It is possible that the shuffle for one of the inputs is already a no-op.
11469 // See if we can simplify non-no-op shuffles into broadcasts,
11470 // which we consider to be strictly better than an arbitrary shuffle.
11471 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11472 isNoopOrBroadcastShuffleMask(V2Mask)) {
11473 canonicalizeBroadcastableInput(V1, V1Mask);
11474 canonicalizeBroadcastableInput(V2, V2Mask);
11475 }
11476
11477 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11478 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11479 // the shuffle may be able to fold with a load or other benefit. However, when
11480 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11481 // pre-shuffle first is a better strategy.
11482 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11483 // Only prefer immediate blends to unpack/rotate.
11484 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11485 DAG, true))
11486 return BlendPerm;
11487 // If either input vector provides only a single element which is repeated
11488 // multiple times, unpacking from both input vectors would generate worse
11489 // code. e.g. for
11490 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11491 // it is better to process t4 first to create a vector of t4[0], then unpack
11492 // that vector with t2.
11493 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11494 !isSingleElementRepeatedMask(V2Mask))
11495 if (SDValue UnpackPerm =
11496 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11497 return UnpackPerm;
11498 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11499 DL, VT, V1, V2, Mask, Subtarget, DAG))
11500 return RotatePerm;
11501 // Unpack/rotate failed - try again with variable blends.
11502 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11503 DAG))
11504 return BlendPerm;
11505 if (VT.getScalarSizeInBits() >= 32)
11506 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11507 DL, VT, V1, V2, Mask, Subtarget, DAG))
11508 return PermUnpack;
11509 }
11510
11511 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11512 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11513 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11514 // than half the elements coming from each source.
11515 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11516 V1Mask.assign(NumElts, -1);
11517 V2Mask.assign(NumElts, -1);
11518 FinalMask.assign(NumElts, -1);
11519 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11520 for (int j = 0; j != NumEltsPerLane; ++j) {
11521 int M = Mask[i + j];
11522 if (M >= 0 && M < NumElts) {
11523 V1Mask[i + (j / 2)] = M;
11524 FinalMask[i + j] = i + (j / 2);
11525 } else if (M >= NumElts) {
11526 V2Mask[i + (j / 2)] = M - NumElts;
11527 FinalMask[i + j] = i + (j / 2) + NumElts;
11528 }
11529 }
11530 }
11531
11532 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11533 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11534 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11535}
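// Worked example of the alternating case above (illustrative only): a v16i8
// shuffle <0,16,2,18,4,20,...,14,30> is rewritten so that V1Mask and V2Mask
// each gather the even source bytes into the low half, and FinalMask becomes
// the plain UNPCKL interleave <0,16,1,17,...,7,23> of the two shuffled inputs.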
11536
11537static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11538 const X86Subtarget &Subtarget,
11539 ArrayRef<int> Mask) {
11540 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11541 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11542
11543 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11544 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11545 int MaxSubElts = 64 / EltSizeInBits;
11546 unsigned RotateAmt, NumSubElts;
11547 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11548 MaxSubElts, NumSubElts, RotateAmt))
11549 return -1;
11550 unsigned NumElts = Mask.size();
11551 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11552 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11553 return RotateAmt;
11554}
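// For illustration: a v16i8 mask <1,0,3,2,5,4,...,15,14> swaps adjacent
// bytes, which is a rotate by 8 bits within each 16-bit group, so this
// returns RotateVT = v8i16 and RotateAmt = 8. With AVX512, MinSubElts is 4
// for i8 elements, so only 32/64-bit group rotations are matched there.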
11555
11556/// Lower shuffle using X86ISD::VROTLI rotations.
11557 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11558 ArrayRef<int> Mask,
11559 const X86Subtarget &Subtarget,
11560 SelectionDAG &DAG) {
11561 // Only XOP + AVX512 targets have bit rotation instructions.
11562 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11563 bool IsLegal =
11564 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11565 if (!IsLegal && Subtarget.hasSSE3())
11566 return SDValue();
11567
11568 MVT RotateVT;
11569 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11570 Subtarget, Mask);
11571 if (RotateAmt < 0)
11572 return SDValue();
11573
11574 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11575 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11576 // widen to vXi16 or more then the existing lowering will be better.
11577 if (!IsLegal) {
11578 if ((RotateAmt % 16) == 0)
11579 return SDValue();
11580 // TODO: Use getTargetVShiftByConstNode.
11581 unsigned ShlAmt = RotateAmt;
11582 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11583 V1 = DAG.getBitcast(RotateVT, V1);
11584 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11585 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11586 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11587 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11588 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11589 return DAG.getBitcast(VT, Rot);
11590 }
11591
11592 SDValue Rot =
11593 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11594 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11595 return DAG.getBitcast(VT, Rot);
11596}
11597
11598/// Try to match a vector shuffle as an element rotation.
11599///
11600 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11601 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11602 ArrayRef<int> Mask) {
11603 int NumElts = Mask.size();
11604
11605 // We need to detect various ways of spelling a rotation:
11606 // [11, 12, 13, 14, 15, 0, 1, 2]
11607 // [-1, 12, 13, 14, -1, -1, 1, -1]
11608 // [-1, -1, -1, -1, -1, -1, 1, 2]
11609 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11610 // [-1, 4, 5, 6, -1, -1, 9, -1]
11611 // [-1, 4, 5, 6, -1, -1, -1, -1]
11612 int Rotation = 0;
11613 SDValue Lo, Hi;
11614 for (int i = 0; i < NumElts; ++i) {
11615 int M = Mask[i];
11616 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11617 "Unexpected mask index.");
11618 if (M < 0)
11619 continue;
11620
11621 // Determine where a rotated vector would have started.
11622 int StartIdx = i - (M % NumElts);
11623 if (StartIdx == 0)
11624 // The identity rotation isn't interesting, stop.
11625 return -1;
11626
11627 // If we found the tail of a vector the rotation must be the missing
11628 // front. If we found the head of a vector, it must be how much of the
11629 // head.
11630 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11631
11632 if (Rotation == 0)
11633 Rotation = CandidateRotation;
11634 else if (Rotation != CandidateRotation)
11635 // The rotations don't match, so we can't match this mask.
11636 return -1;
11637
11638 // Compute which value this mask is pointing at.
11639 SDValue MaskV = M < NumElts ? V1 : V2;
11640
11641 // Compute which of the two target values this index should be assigned
11642 // to. This reflects whether the high elements are remaining or the low
11643 // elements are remaining.
11644 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11645
11646 // Either set up this value if we've not encountered it before, or check
11647 // that it remains consistent.
11648 if (!TargetV)
11649 TargetV = MaskV;
11650 else if (TargetV != MaskV)
11651 // This may be a rotation, but it pulls from the inputs in some
11652 // unsupported interleaving.
11653 return -1;
11654 }
11655
11656 // Check that we successfully analyzed the mask, and normalize the results.
11657 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11658 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11659 if (!Lo)
11660 Lo = Hi;
11661 else if (!Hi)
11662 Hi = Lo;
11663
11664 V1 = Lo;
11665 V2 = Hi;
11666
11667 return Rotation;
11668}
11669
11670/// Try to lower a vector shuffle as a byte rotation.
11671///
11672/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11673/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11674/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11675 /// try to generically lower a vector shuffle through such a pattern. It
11676/// does not check for the profitability of lowering either as PALIGNR or
11677/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11678/// This matches shuffle vectors that look like:
11679///
11680/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11681///
11682/// Essentially it concatenates V1 and V2, shifts right by some number of
11683/// elements, and takes the low elements as the result. Note that while this is
11684/// specified as a *right shift* because x86 is little-endian, it is a *left
11685/// rotate* of the vector lanes.
11686 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11687 ArrayRef<int> Mask) {
11688 // Don't accept any shuffles with zero elements.
11689 if (isAnyZero(Mask))
11690 return -1;
11691
11692 // PALIGNR works on 128-bit lanes.
11693 SmallVector<int, 16> RepeatedMask;
11694 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11695 return -1;
11696
11697 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11698 if (Rotation <= 0)
11699 return -1;
11700
11701 // PALIGNR rotates bytes, so we need to scale the
11702 // rotation based on how many bytes are in the vector lane.
11703 int NumElts = RepeatedMask.size();
11704 int Scale = 16 / NumElts;
11705 return Rotation * Scale;
11706}
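// For illustration, continuing the v8i16 example from the comment above:
// Mask = <11,12,13,14,15,0,1,2> gives an element rotation of 3, and with
// 2 bytes per element the returned byte rotation (PALIGNR immediate) is 6.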
11707
11708 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11709 SDValue V2, ArrayRef<int> Mask,
11710 const X86Subtarget &Subtarget,
11711 SelectionDAG &DAG) {
11712 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11713
11714 SDValue Lo = V1, Hi = V2;
11715 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11716 if (ByteRotation <= 0)
11717 return SDValue();
11718
11719 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11720 // PSLLDQ/PSRLDQ.
11721 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11722 Lo = DAG.getBitcast(ByteVT, Lo);
11723 Hi = DAG.getBitcast(ByteVT, Hi);
11724
11725 // SSSE3 targets can use the palignr instruction.
11726 if (Subtarget.hasSSSE3()) {
11727 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11728 "512-bit PALIGNR requires BWI instructions");
11729 return DAG.getBitcast(
11730 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11731 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11732 }
11733
11734 assert(VT.is128BitVector() &&
11735 "Rotate-based lowering only supports 128-bit lowering!");
11736 assert(Mask.size() <= 16 &&
11737 "Can shuffle at most 16 bytes in a 128-bit vector!");
11738 assert(ByteVT == MVT::v16i8 &&
11739 "SSE2 rotate lowering only needed for v16i8!");
11740
11741 // Default SSE2 implementation
11742 int LoByteShift = 16 - ByteRotation;
11743 int HiByteShift = ByteRotation;
11744
11745 SDValue LoShift =
11746 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11747 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11748 SDValue HiShift =
11749 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11750 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11751 return DAG.getBitcast(VT,
11752 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11753}
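// As a concrete illustration of the SSE2 fallback above: with a byte rotation
// of 6, Lo is shifted left by 16 - 6 = 10 bytes and Hi is shifted right by 6
// bytes before the two halves are OR'd together; on SSSE3 targets the same
// rotation would instead be a single PALIGNR with immediate 6.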
11754
11755/// Try to lower a vector shuffle as a dword/qword rotation.
11756///
11757 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11758 /// rotation of the concatenation of two vectors; this routine will
11759 /// try to generically lower a vector shuffle through such a pattern.
11760///
11761/// Essentially it concatenates V1 and V2, shifts right by some number of
11762/// elements, and takes the low elements as the result. Note that while this is
11763/// specified as a *right shift* because x86 is little-endian, it is a *left
11764/// rotate* of the vector lanes.
11765static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11766 SDValue V2, ArrayRef<int> Mask,
11767 const APInt &Zeroable,
11768 const X86Subtarget &Subtarget,
11769 SelectionDAG &DAG) {
11770 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11771 "Only 32-bit and 64-bit elements are supported!");
11772
11773 // 128/256-bit vectors are only supported with VLX.
11774 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11775 && "VLX required for 128/256-bit vectors");
11776
11777 SDValue Lo = V1, Hi = V2;
11778 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11779 if (0 < Rotation)
11780 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11781 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11782
11783 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11784 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11785 // TODO: We can probably make this more aggressive and use shift-pairs like
11786 // lowerShuffleAsByteShiftMask.
11787 unsigned NumElts = Mask.size();
11788 unsigned ZeroLo = Zeroable.countr_one();
11789 unsigned ZeroHi = Zeroable.countl_one();
11790 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11791 if (!ZeroLo && !ZeroHi)
11792 return SDValue();
11793
11794 if (ZeroLo) {
11795 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11796 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11797 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11798 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11799 getZeroVector(VT, Subtarget, DAG, DL),
11800 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11801 }
11802
11803 if (ZeroHi) {
11804 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11805 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11806 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11807 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11808 getZeroVector(VT, Subtarget, DAG, DL), Src,
11809 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11810 }
11811
11812 return SDValue();
11813}
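// Example of the zero-extending path above: for a v8i32 shuffle whose low two
// elements are zeroable and whose remaining elements are <0, 1, 2, 3, 4, 5>
// from V1, ZeroLo is 2 and the mask is sequential from 0, so this emits
// X86ISD::VALIGN of V1 and the zero vector with immediate NumElts - ZeroLo = 6.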
11814
11815/// Try to lower a vector shuffle as a byte shift sequence.
11816static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11817 SDValue V2, ArrayRef<int> Mask,
11818 const APInt &Zeroable,
11819 const X86Subtarget &Subtarget,
11820 SelectionDAG &DAG) {
11821 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11822 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11823
11824 // We need a shuffle that has zeros at one/both ends and a sequential
11825 // shuffle from one source within.
11826 unsigned ZeroLo = Zeroable.countr_one();
11827 unsigned ZeroHi = Zeroable.countl_one();
11828 if (!ZeroLo && !ZeroHi)
11829 return SDValue();
11830
11831 unsigned NumElts = Mask.size();
11832 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11833 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11834 return SDValue();
11835
11836 unsigned Scale = VT.getScalarSizeInBits() / 8;
11837 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11838 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11839 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11840 return SDValue();
11841
11842 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11843 Res = DAG.getBitcast(MVT::v16i8, Res);
11844
11845 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11846 // inner sequential set of elements, possibly offset:
11847 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11848 // 01234567 --> 4567zzzz --> zzzzz456
11849 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11850 if (ZeroLo == 0) {
11851 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11852 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11853 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11854 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11855 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11856 } else if (ZeroHi == 0) {
11857 unsigned Shift = Mask[ZeroLo] % NumElts;
11858 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11859 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11860 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11861 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11862 } else if (!Subtarget.hasSSSE3()) {
11863 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11864 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11865 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11866 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11867 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11868 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11869 Shift += Mask[ZeroLo] % NumElts;
11870 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11871 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11872 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11873 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11874 } else
11875 return SDValue();
11876
11877 return DAG.getBitcast(VT, Res);
11878}
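// Worked example of the three-shift case above: on a target without SSSE3, a
// v8i16 shuffle <zz, zz, 2, 3, 4, 5, zz, zz> (ZeroLo = ZeroHi = 2, Scale = 2
// bytes) becomes VSHLDQ by 4 bytes (giving <zz, zz, 0, 1, 2, 3, 4, 5>),
// VSRLDQ by 8 bytes (giving <2, 3, 4, 5, zz, zz, zz, zz>), and VSHLDQ by 4
// bytes again to re-insert the leading zeros.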
11879
11880/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11881///
11882/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11883/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11884/// matches elements from one of the input vectors shuffled to the left or
11885/// right with zeroable elements 'shifted in'. It handles both the strictly
11886/// bit-wise element shifts and the byte shift across an entire 128-bit double
11887/// quad word lane.
11888///
11889 /// PSLL : (little-endian) left bit shift.
11890/// [ zz, 0, zz, 2 ]
11891/// [ -1, 4, zz, -1 ]
11892/// PSRL : (little-endian) right bit shift.
11893/// [ 1, zz, 3, zz]
11894/// [ -1, -1, 7, zz]
11895/// PSLLDQ : (little-endian) left byte shift
11896/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11897/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11898/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11899/// PSRLDQ : (little-endian) right byte shift
11900/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11901/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11902/// [ 1, 2, -1, -1, -1, -1, zz, zz]
11903static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11904 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11905 int MaskOffset, const APInt &Zeroable,
11906 const X86Subtarget &Subtarget) {
11907 int Size = Mask.size();
11908 unsigned SizeInBits = Size * ScalarSizeInBits;
11909
11910 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11911 for (int i = 0; i < Size; i += Scale)
11912 for (int j = 0; j < Shift; ++j)
11913 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11914 return false;
11915
11916 return true;
11917 };
11918
11919 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11920 for (int i = 0; i != Size; i += Scale) {
11921 unsigned Pos = Left ? i + Shift : i;
11922 unsigned Low = Left ? i : i + Shift;
11923 unsigned Len = Scale - Shift;
11924 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11925 return -1;
11926 }
11927
11928 int ShiftEltBits = ScalarSizeInBits * Scale;
11929 bool ByteShift = ShiftEltBits > 64;
11930 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11931 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11932 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11933
11934 // Normalize the scale for byte shifts to still produce an i64 element
11935 // type.
11936 Scale = ByteShift ? Scale / 2 : Scale;
11937
11938 // We need to round trip through the appropriate type for the shift.
11939 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11940 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11941 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11942 return (int)ShiftAmt;
11943 };
11944
11945 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11946 // keep doubling the size of the integer elements up to that. We can
11947 // then shift the elements of the integer vector by whole multiples of
11948 // their width within the elements of the larger integer vector. Test each
11949 // multiple to see if we can find a match with the moved element indices
11950 // and that the shifted in elements are all zeroable.
11951 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11952 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11953 for (int Shift = 1; Shift != Scale; ++Shift)
11954 for (bool Left : {true, false})
11955 if (CheckZeros(Shift, Scale, Left)) {
11956 int ShiftAmt = MatchShift(Shift, Scale, Left);
11957 if (0 < ShiftAmt)
11958 return ShiftAmt;
11959 }
11960
11961 // no match
11962 return -1;
11963}
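// Worked example: for a v4i32 mask [zz, 0, zz, 2] (the left bit-shift case in
// the comment above, with elements 0 and 2 zeroable), Scale = 2 and Shift = 1
// match on the first attempt; ShiftEltBits = 64 so this is not a byte shift,
// and the match reports a shift amount of 32 with Opcode = X86ISD::VSHLI and
// ShiftVT = MVT::v2i64, i.e. a 64-bit element shift left by one 32-bit
// element.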
11964
11965static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11966 SDValue V2, ArrayRef<int> Mask,
11967 const APInt &Zeroable,
11968 const X86Subtarget &Subtarget,
11969 SelectionDAG &DAG, bool BitwiseOnly) {
11970 int Size = Mask.size();
11971 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11972
11973 MVT ShiftVT;
11974 SDValue V = V1;
11975 unsigned Opcode;
11976
11977 // Try to match shuffle against V1 shift.
11978 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11979 Mask, 0, Zeroable, Subtarget);
11980
11981 // If V1 failed, try to match shuffle against V2 shift.
11982 if (ShiftAmt < 0) {
11983 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11984 Mask, Size, Zeroable, Subtarget);
11985 V = V2;
11986 }
11987
11988 if (ShiftAmt < 0)
11989 return SDValue();
11990
11991 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11992 return SDValue();
11993
11994 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11995 "Illegal integer vector type");
11996 V = DAG.getBitcast(ShiftVT, V);
11997 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11998 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11999 return DAG.getBitcast(VT, V);
12000}
12001
12002// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12003// Remainder of lower half result is zero and upper half is all undef.
12004static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12005 ArrayRef<int> Mask, uint64_t &BitLen,
12006 uint64_t &BitIdx, const APInt &Zeroable) {
12007 int Size = Mask.size();
12008 int HalfSize = Size / 2;
12009 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12010 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12011
12012 // Upper half must be undefined.
12013 if (!isUndefUpperHalf(Mask))
12014 return false;
12015
12016 // Determine the extraction length from the part of the
12017 // lower half that isn't zeroable.
12018 int Len = HalfSize;
12019 for (; Len > 0; --Len)
12020 if (!Zeroable[Len - 1])
12021 break;
12022 assert(Len > 0 && "Zeroable shuffle mask");
12023
12024 // Attempt to match first Len sequential elements from the lower half.
12025 SDValue Src;
12026 int Idx = -1;
12027 for (int i = 0; i != Len; ++i) {
12028 int M = Mask[i];
12029 if (M == SM_SentinelUndef)
12030 continue;
12031 SDValue &V = (M < Size ? V1 : V2);
12032 M = M % Size;
12033
12034 // The extracted elements must start at a valid index and all mask
12035 // elements must be in the lower half.
12036 if (i > M || M >= HalfSize)
12037 return false;
12038
12039 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12040 Src = V;
12041 Idx = M - i;
12042 continue;
12043 }
12044 return false;
12045 }
12046
12047 if (!Src || Idx < 0)
12048 return false;
12049
12050 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12051 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12052 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12053 V1 = Src;
12054 return true;
12055}
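// Worked example: for a v8i16 mask <1, 2, 3, zz, -1, -1, -1, -1> (upper half
// undef, element 3 zeroable), the non-zeroable prefix has Len = 3 and every
// element is offset by Idx = 1 from its position, so the match succeeds with
// BitLen = 3 * 16 = 48 and BitIdx = 1 * 16 = 16.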
12056
12057// INSERTQ: Extract lowest Len elements from lower half of second source and
12058// insert over first source, starting at Idx.
12059// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12060static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12061 ArrayRef<int> Mask, uint64_t &BitLen,
12062 uint64_t &BitIdx) {
12063 int Size = Mask.size();
12064 int HalfSize = Size / 2;
12065 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12066
12067 // Upper half must be undefined.
12068 if (!isUndefUpperHalf(Mask))
12069 return false;
12070
12071 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12072 SDValue Base;
12073
12074 // Attempt to match first source from mask before insertion point.
12075 if (isUndefInRange(Mask, 0, Idx)) {
12076 /* EMPTY */
12077 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12078 Base = V1;
12079 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12080 Base = V2;
12081 } else {
12082 continue;
12083 }
12084
12085 // Extend the extraction length looking to match both the insertion of
12086 // the second source and the remaining elements of the first.
12087 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12088 SDValue Insert;
12089 int Len = Hi - Idx;
12090
12091 // Match insertion.
12092 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12093 Insert = V1;
12094 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12095 Insert = V2;
12096 } else {
12097 continue;
12098 }
12099
12100 // Match the remaining elements of the lower half.
12101 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12102 /* EMPTY */
12103 } else if ((!Base || (Base == V1)) &&
12104 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12105 Base = V1;
12106 } else if ((!Base || (Base == V2)) &&
12107 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12108 Size + Hi)) {
12109 Base = V2;
12110 } else {
12111 continue;
12112 }
12113
12114 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12115 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12116 V1 = Base;
12117 V2 = Insert;
12118 return true;
12119 }
12120 }
12121
12122 return false;
12123}
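// Worked example: for a v8i16 mask <0, 8, 9, 3, -1, -1, -1, -1>, the element
// before the insertion point comes from V1, elements 1..2 are the first two
// elements of V2, and element 3 resumes V1 in place, so the match succeeds
// with Base = V1, Insert = V2, BitLen = 2 * 16 = 32 and BitIdx = 1 * 16 = 16.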
12124
12125/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12126static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12127 SDValue V2, ArrayRef<int> Mask,
12128 const APInt &Zeroable, SelectionDAG &DAG) {
12129 uint64_t BitLen, BitIdx;
12130 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12131 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12132 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12133 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12134
12135 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12136 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12137 V2 ? V2 : DAG.getUNDEF(VT),
12138 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12139 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12140
12141 return SDValue();
12142}
12143
12144/// Lower a vector shuffle as a zero or any extension.
12145///
12146/// Given a specific number of elements, element bit width, and extension
12147/// stride, produce either a zero or any extension based on the available
12148/// features of the subtarget. The extended elements are consecutive and
12149 /// can start from an offset element index in the input; to
12150 /// avoid excess shuffling the offset must either be in the bottom lane
12151/// or at the start of a higher lane. All extended elements must be from
12152/// the same lane.
12153static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
12154 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
12155 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12156 assert(Scale > 1 && "Need a scale to extend.");
12157 int EltBits = VT.getScalarSizeInBits();
12158 int NumElements = VT.getVectorNumElements();
12159 int NumEltsPerLane = 128 / EltBits;
12160 int OffsetLane = Offset / NumEltsPerLane;
12161 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12162 "Only 8, 16, and 32 bit elements can be extended.");
12163 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12164 assert(0 <= Offset && "Extension offset must be positive.");
12165 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12166 "Extension offset must be in the first lane or start an upper lane.");
12167
12168 // Check that an index is in same lane as the base offset.
12169 auto SafeOffset = [&](int Idx) {
12170 return OffsetLane == (Idx / NumEltsPerLane);
12171 };
12172
12173 // Shift along an input so that the offset base moves to the first element.
12174 auto ShuffleOffset = [&](SDValue V) {
12175 if (!Offset)
12176 return V;
12177
12178 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12179 for (int i = 0; i * Scale < NumElements; ++i) {
12180 int SrcIdx = i + Offset;
12181 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12182 }
12183 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12184 };
12185
12186 // Found a valid a/zext mask! Try various lowering strategies based on the
12187 // input type and available ISA extensions.
12188 if (Subtarget.hasSSE41()) {
12189 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12190 // PUNPCK will catch this in a later shuffle match.
12191 if (Offset && Scale == 2 && VT.is128BitVector())
12192 return SDValue();
12193 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12194 NumElements / Scale);
12195 InputV = DAG.getBitcast(VT, InputV);
12196 InputV = ShuffleOffset(InputV);
12197 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
12198 DL, ExtVT, InputV, DAG);
12199 return DAG.getBitcast(VT, InputV);
12200 }
12201
12202 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12203 InputV = DAG.getBitcast(VT, InputV);
12204
12205 // For any extends we can cheat for larger element sizes and use shuffle
12206 // instructions that can fold with a load and/or copy.
12207 if (AnyExt && EltBits == 32) {
12208 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12209 -1};
12210 return DAG.getBitcast(
12211 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12212 DAG.getBitcast(MVT::v4i32, InputV),
12213 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12214 }
12215 if (AnyExt && EltBits == 16 && Scale > 2) {
12216 int PSHUFDMask[4] = {Offset / 2, -1,
12217 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12218 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12219 DAG.getBitcast(MVT::v4i32, InputV),
12220 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12221 int PSHUFWMask[4] = {1, -1, -1, -1};
12222 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12223 return DAG.getBitcast(
12224 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12225 DAG.getBitcast(MVT::v8i16, InputV),
12226 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12227 }
12228
12229 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12230 // to 64-bits.
12231 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12232 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12233 assert(VT.is128BitVector() && "Unexpected vector width!");
12234
12235 int LoIdx = Offset * EltBits;
12236 SDValue Lo = DAG.getBitcast(
12237 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12238 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12239 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12240
12241 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12242 return DAG.getBitcast(VT, Lo);
12243
12244 int HiIdx = (Offset + 1) * EltBits;
12245 SDValue Hi = DAG.getBitcast(
12246 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12247 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12248 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12249 return DAG.getBitcast(VT,
12250 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12251 }
12252
12253 // If this would require more than 2 unpack instructions to expand, use
12254 // pshufb when available. We can only use more than 2 unpack instructions
12255 // when zero extending i8 elements which also makes it easier to use pshufb.
12256 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12257 assert(NumElements == 16 && "Unexpected byte vector width!");
12258 SDValue PSHUFBMask[16];
12259 for (int i = 0; i < 16; ++i) {
12260 int Idx = Offset + (i / Scale);
12261 if ((i % Scale == 0 && SafeOffset(Idx))) {
12262 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12263 continue;
12264 }
12265 PSHUFBMask[i] =
12266 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12267 }
12268 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12269 return DAG.getBitcast(
12270 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12271 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12272 }
12273
12274 // If we are extending from an offset, ensure we start on a boundary that
12275 // we can unpack from.
12276 int AlignToUnpack = Offset % (NumElements / Scale);
12277 if (AlignToUnpack) {
12278 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12279 for (int i = AlignToUnpack; i < NumElements; ++i)
12280 ShMask[i - AlignToUnpack] = i;
12281 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12282 Offset -= AlignToUnpack;
12283 }
12284
12285 // Otherwise emit a sequence of unpacks.
12286 do {
12287 unsigned UnpackLoHi = X86ISD::UNPCKL;
12288 if (Offset >= (NumElements / 2)) {
12289 UnpackLoHi = X86ISD::UNPCKH;
12290 Offset -= (NumElements / 2);
12291 }
12292
12293 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12294 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12295 : getZeroVector(InputVT, Subtarget, DAG, DL);
12296 InputV = DAG.getBitcast(InputVT, InputV);
12297 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12298 Scale /= 2;
12299 EltBits *= 2;
12300 NumElements /= 2;
12301 } while (Scale > 1);
12302 return DAG.getBitcast(VT, InputV);
12303}
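// Example of the final unpack loop: zero-extending the low four i8 elements
// of a v16i8 input to i32 on a pre-SSE4.1 target (Scale = 4, Offset = 0)
// performs two rounds of X86ISD::UNPCKL against a zero vector, first on
// v16i8 and then on v8i16, doubling the element width each time until Scale
// reaches 1.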
12304
12305/// Try to lower a vector shuffle as a zero extension on any microarch.
12306///
12307/// This routine will try to do everything in its power to cleverly lower
12308/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12309/// check for the profitability of this lowering, it tries to aggressively
12310/// match this pattern. It will use all of the micro-architectural details it
12311/// can to emit an efficient lowering. It handles both blends with all-zero
12312/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12313/// masking out later).
12314///
12315/// The reason we have dedicated lowering for zext-style shuffles is that they
12316/// are both incredibly common and often quite performance sensitive.
12317static SDValue lowerShuffleAsZeroOrAnyExtend(
12318 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12319 const APInt &Zeroable, const X86Subtarget &Subtarget,
12320 SelectionDAG &DAG) {
12321 int Bits = VT.getSizeInBits();
12322 int NumLanes = Bits / 128;
12323 int NumElements = VT.getVectorNumElements();
12324 int NumEltsPerLane = NumElements / NumLanes;
12325 assert(VT.getScalarSizeInBits() <= 32 &&
12326 "Exceeds 32-bit integer zero extension limit");
12327 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12328
12329 // Define a helper function to check a particular ext-scale and lower to it if
12330 // valid.
12331 auto Lower = [&](int Scale) -> SDValue {
12332 SDValue InputV;
12333 bool AnyExt = true;
12334 int Offset = 0;
12335 int Matches = 0;
12336 for (int i = 0; i < NumElements; ++i) {
12337 int M = Mask[i];
12338 if (M < 0)
12339 continue; // Valid anywhere but doesn't tell us anything.
12340 if (i % Scale != 0) {
12341 // Each of the extended elements need to be zeroable.
12342 if (!Zeroable[i])
12343 return SDValue();
12344
12345 // We no longer are in the anyext case.
12346 AnyExt = false;
12347 continue;
12348 }
12349
12350 // Each of the base elements needs to be consecutive indices into the
12351 // same input vector.
12352 SDValue V = M < NumElements ? V1 : V2;
12353 M = M % NumElements;
12354 if (!InputV) {
12355 InputV = V;
12356 Offset = M - (i / Scale);
12357 } else if (InputV != V)
12358 return SDValue(); // Flip-flopping inputs.
12359
12360 // Offset must start in the lowest 128-bit lane or at the start of an
12361 // upper lane.
12362 // FIXME: Is it ever worth allowing a negative base offset?
12363 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12364 (Offset % NumEltsPerLane) == 0))
12365 return SDValue();
12366
12367 // If we are offsetting, all referenced entries must come from the same
12368 // lane.
12369 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12370 return SDValue();
12371
12372 if ((M % NumElements) != (Offset + (i / Scale)))
12373 return SDValue(); // Non-consecutive strided elements.
12374 Matches++;
12375 }
12376
12377 // If we fail to find an input, we have a zero-shuffle which should always
12378 // have already been handled.
12379 // FIXME: Maybe handle this here in case during blending we end up with one?
12380 if (!InputV)
12381 return SDValue();
12382
12383 // If we are offsetting, don't extend if we only match a single input, we
12384 // can always do better by using a basic PSHUF or PUNPCK.
12385 if (Offset != 0 && Matches < 2)
12386 return SDValue();
12387
12388 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12389 InputV, Mask, Subtarget, DAG);
12390 };
12391
12392 // The widest scale possible for extending is to a 64-bit integer.
12393 assert(Bits % 64 == 0 &&
12394 "The number of bits in a vector must be divisible by 64 on x86!");
12395 int NumExtElements = Bits / 64;
12396
12397 // Each iteration, try extending the elements half as much, but into twice as
12398 // many elements.
12399 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12400 assert(NumElements % NumExtElements == 0 &&
12401 "The input vector size must be divisible by the extended size.");
12402 if (SDValue V = Lower(NumElements / NumExtElements))
12403 return V;
12404 }
12405
12406 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12407 if (Bits != 128)
12408 return SDValue();
12409
12410 // Returns one of the source operands if the shuffle can be reduced to a
12411 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12412 auto CanZExtLowHalf = [&]() {
12413 for (int i = NumElements / 2; i != NumElements; ++i)
12414 if (!Zeroable[i])
12415 return SDValue();
12416 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12417 return V1;
12418 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12419 return V2;
12420 return SDValue();
12421 };
12422
12423 if (SDValue V = CanZExtLowHalf()) {
12424 V = DAG.getBitcast(MVT::v2i64, V);
12425 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12426 return DAG.getBitcast(VT, V);
12427 }
12428
12429 // No viable ext lowering found.
12430 return SDValue();
12431}
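// Worked example: for a v4i32 shuffle <0, 4, 1, 4> where V2 is known zero (so
// the odd elements are zeroable), the first scale tried (Scale = 2) already
// matches with InputV = V1, Offset = 0 and AnyExt = false, so this lowers as
// a zero extension of the low two i32 elements of V1 to i64.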
12432
12433/// Try to get a scalar value for a specific element of a vector.
12434///
12435/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12436static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12437 SelectionDAG &DAG) {
12438 MVT VT = V.getSimpleValueType();
12439 MVT EltVT = VT.getVectorElementType();
12440 V = peekThroughBitcasts(V);
12441
12442 // If the bitcasts shift the element size, we can't extract an equivalent
12443 // element from it.
12444 MVT NewVT = V.getSimpleValueType();
12445 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12446 return SDValue();
12447
12448 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12449 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12450 // Ensure the scalar operand is the same size as the destination.
12451 // FIXME: Add support for scalar truncation where possible.
12452 SDValue S = V.getOperand(Idx);
12453 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12454 return DAG.getBitcast(EltVT, S);
12455 }
12456
12457 return SDValue();
12458}
12459
12460/// Helper to test for a load that can be folded with x86 shuffles.
12461///
12462/// This is particularly important because the set of instructions varies
12463/// significantly based on whether the operand is a load or not.
12464static bool isShuffleFoldableLoad(SDValue V) {
12465 return V->hasOneUse() &&
12466 ISD::isNON_EXTLoad(V.getNode());
12467}
12468
12469template<typename T>
12470static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12471 T EltVT = VT.getScalarType();
12472 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12473 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12474}
12475
12476/// Try to lower insertion of a single element into a zero vector.
12477///
12478 /// This is a common pattern for which we have especially efficient lowerings
12479 /// across all subtarget feature sets.
12480static SDValue lowerShuffleAsElementInsertion(
12481 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12482 const APInt &Zeroable, const X86Subtarget &Subtarget,
12483 SelectionDAG &DAG) {
12484 MVT ExtVT = VT;
12485 MVT EltVT = VT.getVectorElementType();
12486 unsigned NumElts = VT.getVectorNumElements();
12487 unsigned EltBits = VT.getScalarSizeInBits();
12488
12489 if (isSoftF16(EltVT, Subtarget))
12490 return SDValue();
12491
12492 int V2Index =
12493 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12494 Mask.begin();
12495 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12496 bool IsV1Zeroable = true;
12497 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12498 if (i != V2Index && !Zeroable[i]) {
12499 IsV1Zeroable = false;
12500 break;
12501 }
12502
12503 // Bail if a non-zero V1 isn't used in place.
12504 if (!IsV1Zeroable) {
12505 SmallVector<int, 8> V1Mask(Mask);
12506 V1Mask[V2Index] = -1;
12507 if (!isNoopShuffleMask(V1Mask))
12508 return SDValue();
12509 }
12510
12511 // Check for a single input from a SCALAR_TO_VECTOR node.
12512 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12513 // all the smarts here sunk into that routine. However, the current
12514 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12515 // vector shuffle lowering is dead.
12516 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12517 DAG);
12518 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12519 // We need to zext the scalar if it is smaller than an i32.
12520 V2S = DAG.getBitcast(EltVT, V2S);
12521 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12522 // Using zext to expand a narrow element won't work for non-zero
12523 // insertions. But we can use a masked constant vector if we're
12524 // inserting V2 into the bottom of V1.
12525 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12526 return SDValue();
12527
12528 // Zero-extend directly to i32.
12529 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12530 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12531
12532 // If we're inserting into a constant, mask off the inserted index
12533 // and OR with the zero-extended scalar.
12534 if (!IsV1Zeroable) {
12535 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12536 Bits[V2Index] = APInt::getZero(EltBits);
12537 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12538 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12539 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12540 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12541 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12542 }
12543 }
12544 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12545 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12546 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12547 // Either not inserting from the low element of the input or the input
12548 // element size is too small to use VZEXT_MOVL to clear the high bits.
12549 return SDValue();
12550 }
12551
12552 if (!IsV1Zeroable) {
12553 // If V1 can't be treated as a zero vector we have fewer options to lower
12554 // this. We can't support integer vectors or non-zero targets cheaply.
12555 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12556 if (!VT.isFloatingPoint() || V2Index != 0)
12557 return SDValue();
12558 if (!VT.is128BitVector())
12559 return SDValue();
12560
12561 // Otherwise, use MOVSD, MOVSS or MOVSH.
12562 unsigned MovOpc = 0;
12563 if (EltVT == MVT::f16)
12564 MovOpc = X86ISD::MOVSH;
12565 else if (EltVT == MVT::f32)
12566 MovOpc = X86ISD::MOVSS;
12567 else if (EltVT == MVT::f64)
12568 MovOpc = X86ISD::MOVSD;
12569 else
12570 llvm_unreachable("Unsupported floating point element type to handle!");
12571 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12572 }
12573
12574 // This lowering only works for the low element with floating point vectors.
12575 if (VT.isFloatingPoint() && V2Index != 0)
12576 return SDValue();
12577
12578 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12579 if (ExtVT != VT)
12580 V2 = DAG.getBitcast(VT, V2);
12581
12582 if (V2Index != 0) {
12583 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12584 // the desired position. Otherwise it is more efficient to do a vector
12585 // shift left. We know that we can do a vector shift left because all
12586 // the inputs are zero.
12587 if (VT.isFloatingPoint() || NumElts <= 4) {
12588 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12589 V2Shuffle[V2Index] = 0;
12590 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12591 } else {
12592 V2 = DAG.getBitcast(MVT::v16i8, V2);
12593 V2 = DAG.getNode(
12594 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12595 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12596 V2 = DAG.getBitcast(VT, V2);
12597 }
12598 }
12599 return V2;
12600}
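// Example: for a v4i32 shuffle <4, zz, zz, zz> where V2's element 0 comes
// from a scalar (SCALAR_TO_VECTOR) and V1 is fully zeroable, V2Index is 0, so
// the scalar is re-wrapped in SCALAR_TO_VECTOR and the elements above it are
// cleared with X86ISD::VZEXT_MOVL; no shuffle of V1 is needed at all.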
12601
12602/// Try to lower broadcast of a single - truncated - integer element,
12603/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12604///
12605/// This assumes we have AVX2.
12606static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12607 int BroadcastIdx,
12608 const X86Subtarget &Subtarget,
12609 SelectionDAG &DAG) {
12610 assert(Subtarget.hasAVX2() &&
12611 "We can only lower integer broadcasts with AVX2!");
12612
12613 MVT EltVT = VT.getVectorElementType();
12614 MVT V0VT = V0.getSimpleValueType();
12615
12616 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12617 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12618
12619 MVT V0EltVT = V0VT.getVectorElementType();
12620 if (!V0EltVT.isInteger())
12621 return SDValue();
12622
12623 const unsigned EltSize = EltVT.getSizeInBits();
12624 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12625
12626 // This is only a truncation if the original element type is larger.
12627 if (V0EltSize <= EltSize)
12628 return SDValue();
12629
12630 assert(((V0EltSize % EltSize) == 0) &&
12631 "Scalar type sizes must all be powers of 2 on x86!");
12632
12633 const unsigned V0Opc = V0.getOpcode();
12634 const unsigned Scale = V0EltSize / EltSize;
12635 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12636
12637 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12638 V0Opc != ISD::BUILD_VECTOR)
12639 return SDValue();
12640
12641 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12642
12643 // If we're extracting non-least-significant bits, shift so we can truncate.
12644 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12645 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12646 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12647 if (const int OffsetIdx = BroadcastIdx % Scale)
12648 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12649 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12650
12651 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12652 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12653}
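// Example: broadcasting v8i32 element 3 from a v4i64 BUILD_VECTOR gives
// Scale = 2 and V0BroadcastIdx = 1; since BroadcastIdx % Scale == 1, the
// selected i64 operand is first shifted right by 32 bits, then truncated to
// i32 and broadcast with X86ISD::VBROADCAST.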
12654
12655/// Test whether this can be lowered with a single SHUFPS instruction.
12656///
12657/// This is used to disable more specialized lowerings when the shufps lowering
12658/// will happen to be efficient.
12659static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12660 // This routine only handles 128-bit shufps.
12661 assert(Mask.size() == 4 && "Unsupported mask size!");
12662 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12663 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12664 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12665 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12666
12667 // To lower with a single SHUFPS we need to have the low half and high half
12668 // each requiring a single input.
12669 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12670 return false;
12671 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12672 return false;
12673
12674 return true;
12675}
12676
12677/// Test whether the specified input (0 or 1) is in-place blended by the
12678/// given mask.
12679///
12680/// This returns true if the elements from a particular input are already in the
12681/// slot required by the given mask and require no permutation.
12682static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12683 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12684 int Size = Mask.size();
12685 for (int i = 0; i < Size; ++i)
12686 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12687 return false;
12688
12689 return true;
12690}
12691
12692/// If we are extracting two 128-bit halves of a vector and shuffling the
12693/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12694/// multi-shuffle lowering.
12695static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12696 SDValue N1, ArrayRef<int> Mask,
12697 SelectionDAG &DAG) {
12698 MVT VT = N0.getSimpleValueType();
12699 assert((VT.is128BitVector() &&
12700 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12701 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12702
12703 // Check that both sources are extracts of the same source vector.
12704 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12705 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12706 N0.getOperand(0) != N1.getOperand(0) ||
12707 !N0.hasOneUse() || !N1.hasOneUse())
12708 return SDValue();
12709
12710 SDValue WideVec = N0.getOperand(0);
12711 MVT WideVT = WideVec.getSimpleValueType();
12712 if (!WideVT.is256BitVector())
12713 return SDValue();
12714
12715 // Match extracts of each half of the wide source vector. Commute the shuffle
12716 // if the extract of the low half is N1.
12717 unsigned NumElts = VT.getVectorNumElements();
12718 SmallVector<int, 4> NewMask(Mask);
12719 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12720 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12721 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12722 ShuffleVectorSDNode::commuteMask(NewMask);
12723 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12724 return SDValue();
12725
12726 // Final bailout: if the mask is simple, we are better off using an extract
12727 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12728 // because that avoids a constant load from memory.
12729 if (NumElts == 4 &&
12730 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12731 return SDValue();
12732
12733 // Extend the shuffle mask with undef elements.
12734 NewMask.append(NumElts, -1);
12735
12736 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12737 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12738 NewMask);
12739 // This is free: ymm -> xmm.
12740 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12741 DAG.getVectorIdxConstant(0, DL));
12742}
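// Example: shuffling (extract_subvector X, 0) and (extract_subvector X, 4) of
// a v8f32 source with mask <0, 5, 2, 7> is rewritten as a single wide shuffle
// of X with mask <0, 5, 2, 7, -1, -1, -1, -1>, followed by a free ymm-to-xmm
// extract of the low half.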
12743
12744/// Try to lower broadcast of a single element.
12745///
12746/// For convenience, this code also bundles all of the subtarget feature set
12747/// filtering. While a little annoying to re-dispatch on type here, there isn't
12748/// a convenient way to factor it out.
12749static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12750 SDValue V2, ArrayRef<int> Mask,
12751 const X86Subtarget &Subtarget,
12752 SelectionDAG &DAG) {
12753 MVT EltVT = VT.getVectorElementType();
12754 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12755 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12756 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12757 return SDValue();
12758
12759 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12760 // we can only broadcast from a register with AVX2.
12761 unsigned NumEltBits = VT.getScalarSizeInBits();
12762 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12763 ? X86ISD::MOVDDUP
12764 : X86ISD::VBROADCAST;
12765 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12766
12767 // Check that the mask is a broadcast.
12768 int BroadcastIdx = getSplatIndex(Mask);
12769 if (BroadcastIdx < 0)
12770 return SDValue();
12771 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12772 "a sorted mask where the broadcast "
12773 "comes from V1.");
12774 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
12775
12776 // Go up the chain of (vector) values to find a scalar load that we can
12777 // combine with the broadcast.
12778 // TODO: Combine this logic with findEltLoadSrc() used by
12779 // EltsFromConsecutiveLoads().
12780 int BitOffset = BroadcastIdx * NumEltBits;
12781 SDValue V = V1;
12782 for (;;) {
12783 switch (V.getOpcode()) {
12784 case ISD::BITCAST: {
12785 V = V.getOperand(0);
12786 continue;
12787 }
12788 case ISD::CONCAT_VECTORS: {
12789 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12790 int OpIdx = BitOffset / OpBitWidth;
12791 V = V.getOperand(OpIdx);
12792 BitOffset %= OpBitWidth;
12793 continue;
12794 }
12795 case ISD::EXTRACT_SUBVECTOR: {
12796 // The extraction index adds to the existing offset.
12797 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12798 unsigned Idx = V.getConstantOperandVal(1);
12799 unsigned BeginOffset = Idx * EltBitWidth;
12800 BitOffset += BeginOffset;
12801 V = V.getOperand(0);
12802 continue;
12803 }
12804 case ISD::INSERT_SUBVECTOR: {
12805 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12806 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12807 int Idx = (int)V.getConstantOperandVal(2);
12808 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12809 int BeginOffset = Idx * EltBitWidth;
12810 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12811 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12812 BitOffset -= BeginOffset;
12813 V = VInner;
12814 } else {
12815 V = VOuter;
12816 }
12817 continue;
12818 }
12819 }
12820 break;
12821 }
12822 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12823 BroadcastIdx = BitOffset / NumEltBits;
12824
12825 // Do we need to bitcast the source to retrieve the original broadcast index?
12826 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12827
12828 // Check if this is a broadcast of a scalar. We special case lowering
12829 // for scalars so that we can more effectively fold with loads.
12830 // If the original value has a larger element type than the shuffle, the
12831 // broadcast element is in essence truncated. Make that explicit to ease
12832 // folding.
12833 if (BitCastSrc && VT.isInteger())
12834 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12835 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12836 return TruncBroadcast;
12837
12838 // Also check the simpler case, where we can directly reuse the scalar.
12839 if (!BitCastSrc &&
12840 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12841 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12842 V = V.getOperand(BroadcastIdx);
12843
12844 // If we can't broadcast from a register, check that the input is a load.
12845 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12846 return SDValue();
12847 } else if (ISD::isNormalLoad(V.getNode()) &&
12848 cast<LoadSDNode>(V)->isSimple()) {
12849 // We do not check for one-use of the vector load because a broadcast load
12850 // is expected to be a win for code size, register pressure, and possibly
12851 // uops even if the original vector load is not eliminated.
12852
12853 // Reduce the vector load and shuffle to a broadcasted scalar load.
12854 LoadSDNode *Ld = cast<LoadSDNode>(V);
12855 SDValue BaseAddr = Ld->getOperand(1);
12856 MVT SVT = VT.getScalarType();
12857 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12858 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12859 SDValue NewAddr =
12860 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12861
12862 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12863 // than MOVDDUP.
12864 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12865 if (Opcode == X86ISD::VBROADCAST) {
12866 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12867 SDValue Ops[] = {Ld->getChain(), NewAddr};
12868 V = DAG.getMemIntrinsicNode(
12869 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12870 DAG.getMachineFunction().getMachineMemOperand(
12871 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12872 DAG.makeEquivalentMemoryOrdering(Ld, V);
12873 return DAG.getBitcast(VT, V);
12874 }
12875 assert(SVT == MVT::f64 && "Unexpected VT!");
12876 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12877 DAG.getMachineFunction().getMachineMemOperand(
12878 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12879 DAG.makeEquivalentMemoryOrdering(Ld, V);
12880 } else if (!BroadcastFromReg) {
12881 // We can't broadcast from a vector register.
12882 return SDValue();
12883 } else if (BitOffset != 0) {
12884 // We can only broadcast from the zero-element of a vector register,
12885 // but it can be advantageous to broadcast from the zero-element of a
12886 // subvector.
12887 if (!VT.is256BitVector() && !VT.is512BitVector())
12888 return SDValue();
12889
12890 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12891 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12892 return SDValue();
12893
12894 // If we are broadcasting an element from the lowest 128-bit subvector, try
12895 // to move the element in position.
12896 if (BitOffset < 128 && NumActiveElts > 1 &&
12897 V.getScalarValueSizeInBits() == NumEltBits) {
12898 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12899 "Unexpected bit-offset");
12900 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
12901 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
12902 V = extractSubVector(V, 0, DAG, DL, 128);
12903 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
12904 } else {
12905 // Only broadcast the zero-element of a 128-bit subvector.
12906 if ((BitOffset % 128) != 0)
12907 return SDValue();
12908
12909 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12910 "Unexpected bit-offset");
12911 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12912 "Unexpected vector size");
12913 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12914 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12915 }
12916 }
12917
12918 // On AVX we can use VBROADCAST directly for scalar sources.
12919 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12920 V = DAG.getBitcast(MVT::f64, V);
12921 if (Subtarget.hasAVX()) {
12922 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12923 return DAG.getBitcast(VT, V);
12924 }
12925 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12926 }
12927
12928 // If this is a scalar, do the broadcast on this type and bitcast.
12929 if (!V.getValueType().isVector()) {
12930 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12931 "Unexpected scalar size");
12932 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12933 VT.getVectorNumElements());
12934 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12935 }
12936
12937 // We only support broadcasting from 128-bit vectors to minimize the
12938 // number of patterns we need to deal with in isel. So extract down to
12939 // 128-bits, removing as many bitcasts as possible.
12940 if (V.getValueSizeInBits() > 128)
12942
12943 // Otherwise cast V to a vector with the same element type as VT, but
12944 // possibly narrower than VT. Then perform the broadcast.
12945 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12946 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12947 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12948}
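// Example: broadcasting element 3 of a v8i32 AVX2 register (BitOffset = 96),
// assuming the source is a plain register rather than a load or build_vector,
// stays on the in-register path above because the offset is still inside the
// low 128-bit subvector: the element is shuffled down to position 0 of the
// extracted low half and then broadcast with X86ISD::VBROADCAST.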
12949
12950// Check for whether we can use INSERTPS to perform the shuffle. We only use
12951// INSERTPS when the V1 elements are already in the correct locations
12952// because otherwise we can just always use two SHUFPS instructions which
12953// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12954// perform INSERTPS if a single V1 element is out of place and all V2
12955// elements are zeroable.
12956static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12957 unsigned &InsertPSMask,
12958 const APInt &Zeroable,
12959 ArrayRef<int> Mask, SelectionDAG &DAG) {
12960 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12961 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12962 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12963
12964 // Attempt to match INSERTPS with one element from VA or VB being
12965 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12966 // are updated.
12967 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12968 ArrayRef<int> CandidateMask) {
12969 unsigned ZMask = 0;
12970 int VADstIndex = -1;
12971 int VBDstIndex = -1;
12972 bool VAUsedInPlace = false;
12973
12974 for (int i = 0; i < 4; ++i) {
12975 // Synthesize a zero mask from the zeroable elements (includes undefs).
12976 if (Zeroable[i]) {
12977 ZMask |= 1 << i;
12978 continue;
12979 }
12980
12981 // Flag if we use any VA inputs in place.
12982 if (i == CandidateMask[i]) {
12983 VAUsedInPlace = true;
12984 continue;
12985 }
12986
12987 // We can only insert a single non-zeroable element.
12988 if (VADstIndex >= 0 || VBDstIndex >= 0)
12989 return false;
12990
12991 if (CandidateMask[i] < 4) {
12992 // VA input out of place for insertion.
12993 VADstIndex = i;
12994 } else {
12995 // VB input for insertion.
12996 VBDstIndex = i;
12997 }
12998 }
12999
13000 // Don't bother if we have no (non-zeroable) element for insertion.
13001 if (VADstIndex < 0 && VBDstIndex < 0)
13002 return false;
13003
13004 // Determine element insertion src/dst indices. The src index is from the
13005 // start of the inserted vector, not the start of the concatenated vector.
13006 unsigned VBSrcIndex = 0;
13007 if (VADstIndex >= 0) {
13008 // If we have a VA input out of place, we use VA as the V2 element
13009 // insertion and don't use the original V2 at all.
13010 VBSrcIndex = CandidateMask[VADstIndex];
13011 VBDstIndex = VADstIndex;
13012 VB = VA;
13013 } else {
13014 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13015 }
13016
13017 // If no V1 inputs are used in place, then the result is created only from
13018 // the zero mask and the V2 insertion - so remove V1 dependency.
13019 if (!VAUsedInPlace)
13020 VA = DAG.getUNDEF(MVT::v4f32);
13021
13022 // Update V1, V2 and InsertPSMask accordingly.
13023 V1 = VA;
13024 V2 = VB;
13025
13026 // Insert the V2 element into the desired position.
13027 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13028 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13029 return true;
13030 };
13031
13032 if (matchAsInsertPS(V1, V2, Mask))
13033 return true;
13034
13035 // Commute and try again.
13036 SmallVector<int, 4> CommutedMask(Mask);
13037 ShuffleVectorSDNode::commuteMask(CommutedMask);
13038 if (matchAsInsertPS(V2, V1, CommutedMask))
13039 return true;
13040
13041 return false;
13042}
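// Worked example: for a mask <0, 1, 6, 3> with no zeroable elements, the V1
// elements at positions 0, 1 and 3 are already in place and position 2 takes
// element 2 of V2, so the match succeeds with VBSrcIndex = 2, VBDstIndex = 2
// and ZMask = 0, giving an INSERTPS immediate of (2 << 6) | (2 << 4) = 0xA0.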
13043
13044static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13045 ArrayRef<int> Mask, const APInt &Zeroable,
13046 SelectionDAG &DAG) {
13047 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13048 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13049
13050 // Attempt to match the insertps pattern.
13051 unsigned InsertPSMask = 0;
13052 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13053 return SDValue();
13054
13055 // Insert the V2 element into the desired position.
13056 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13057 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13058}
13059
13060/// Handle lowering of 2-lane 64-bit floating point shuffles.
13061///
13062/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13063/// support for floating point shuffles but not integer shuffles. These
13064/// instructions will incur a domain crossing penalty on some chips though so
13065/// it is better to avoid lowering through this for integer vectors where
13066/// possible.
13067static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13068 const APInt &Zeroable, SDValue V1, SDValue V2,
13069 const X86Subtarget &Subtarget,
13070 SelectionDAG &DAG) {
13071 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13072 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13073 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13074
13075 if (V2.isUndef()) {
13076 // Check for being able to broadcast a single element.
13077 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13078 Mask, Subtarget, DAG))
13079 return Broadcast;
13080
13081 // Straight shuffle of a single input vector. Simulate this by using the
13082 // single input as both of the "inputs" to this instruction.
13083 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13084
13085 if (Subtarget.hasAVX()) {
13086 // If we have AVX, we can use VPERMILPS which will allow folding a load
13087 // into the shuffle.
13088 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13089 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13090 }
13091
13092 return DAG.getNode(
13093 X86ISD::SHUFP, DL, MVT::v2f64,
13094 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13095 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13096 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13097 }
13098 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13099 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13100 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13101 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13102
13103 if (Subtarget.hasAVX2())
13104 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13105 return Extract;
13106
13107 // When loading a scalar and then shuffling it into a vector we can often do
13108 // the insertion cheaply.
13109 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13110 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13111 return Insertion;
13112 // Try inverting the insertion since for v2 masks it is easy to do and we
13113 // can't reliably sort the mask one way or the other.
13114 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13115 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13116 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13117 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13118 return Insertion;
13119
13120 // Try to use one of the special instruction patterns to handle two common
13121 // blend patterns if a zero-blend above didn't work.
13122 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13123 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13124 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13125 // We can either use a special instruction to load over the low double or
13126 // to move just the low double.
13127 return DAG.getNode(
13128 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13129 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13130
13131 if (Subtarget.hasSSE41())
13132 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13133 Zeroable, Subtarget, DAG))
13134 return Blend;
13135
13136 // Use dedicated unpack instructions for masks that match their pattern.
13137 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13138 return V;
13139
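// Fall back to SHUFPD: bit 0 of the immediate selects the V1 lane for result
// element 0 and bit 1 selects the V2 lane for result element 1. E.g.
// Mask = {1, 2} encodes the immediate 0b01 (take V1[1] and V2[0]).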
13140 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13141 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13142 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13143}
13144
13145/// Handle lowering of 2-lane 64-bit integer shuffles.
13146///
13147/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13148/// the integer unit to minimize domain crossing penalties. However, for blends
13149/// it falls back to the floating point shuffle operation with appropriate bit
13150/// casting.
13151 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13152 const APInt &Zeroable, SDValue V1, SDValue V2,
13153 const X86Subtarget &Subtarget,
13154 SelectionDAG &DAG) {
13155 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13156 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13157 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13158
13159 if (V2.isUndef()) {
13160 // Check for being able to broadcast a single element.
13161 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13162 Mask, Subtarget, DAG))
13163 return Broadcast;
13164
13165 // Straight shuffle of a single input vector. For everything from SSE2
13166 // onward this has a single fast instruction with no scary immediates.
13167 // We have to map the mask as it is actually a v4i32 shuffle instruction.
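// E.g. the v2i64 mask {1, 0} widens to the v4i32 mask {2, 3, 0, 1}.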
13168 V1 = DAG.getBitcast(MVT::v4i32, V1);
13169 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13170 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13171 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13172 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13173 return DAG.getBitcast(
13174 MVT::v2i64,
13175 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13176 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13177 }
13178 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13179 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13180 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13181 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13182
13183 if (Subtarget.hasAVX2())
13184 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13185 return Extract;
13186
13187 // Try to use shift instructions.
13188 if (SDValue Shift =
13189 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13190 DAG, /*BitwiseOnly*/ false))
13191 return Shift;
13192
13193 // When loading a scalar and then shuffling it into a vector we can often do
13194 // the insertion cheaply.
13195 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13196 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13197 return Insertion;
13198 // Try inverting the insertion since for v2 masks it is easy to do and we
13199 // can't reliably sort the mask one way or the other.
13200 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13201 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13202 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13203 return Insertion;
13204
13205 // We have different paths for blend lowering, but they all must use the
13206 // *exact* same predicate.
13207 bool IsBlendSupported = Subtarget.hasSSE41();
13208 if (IsBlendSupported)
13209 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13210 Zeroable, Subtarget, DAG))
13211 return Blend;
13212
13213 // Use dedicated unpack instructions for masks that match their pattern.
13214 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13215 return V;
13216
13217 // Try to use byte rotation instructions.
13218 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13219 if (Subtarget.hasSSSE3()) {
13220 if (Subtarget.hasVLX())
13221 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13222 Zeroable, Subtarget, DAG))
13223 return Rotate;
13224
13225 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13226 Subtarget, DAG))
13227 return Rotate;
13228 }
13229
13230 // If we have direct support for blends, we should lower by decomposing into
13231 // a permute. That will be faster than the domain cross.
13232 if (IsBlendSupported)
13233 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13234 Zeroable, Subtarget, DAG);
13235
13236 // We implement this with SHUFPD which is pretty lame because it will likely
13237 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13238 // However, all the alternatives are still more cycles and newer chips don't
13239 // have this problem. It would be really nice if x86 had better shuffles here.
13240 V1 = DAG.getBitcast(MVT::v2f64, V1);
13241 V2 = DAG.getBitcast(MVT::v2f64, V2);
13242 return DAG.getBitcast(MVT::v2i64,
13243 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13244}
13245
13246/// Lower a vector shuffle using the SHUFPS instruction.
13247///
13248/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13249 /// It makes no assumptions about whether this is the *best* lowering; it simply
13250/// uses it.
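/// SHUFPS builds the two low elements of its result from the first operand and
/// the two high elements from the second operand, each picked by a 2-bit field
/// of the immediate, which is why LowV and HighV below track which input feeds
/// which half of the result.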
13251 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13252 ArrayRef<int> Mask, SDValue V1,
13253 SDValue V2, SelectionDAG &DAG) {
13254 SDValue LowV = V1, HighV = V2;
13255 SmallVector<int, 4> NewMask(Mask);
13256 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13257
13258 if (NumV2Elements == 1) {
13259 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13260
13261 // Compute the index adjacent to V2Index and in the same half by toggling
13262 // the low bit.
13263 int V2AdjIndex = V2Index ^ 1;
13264
13265 if (Mask[V2AdjIndex] < 0) {
13266 // Handles all the cases where we have a single V2 element and an undef.
13267 // This will only ever happen in the high lanes because we commute the
13268 // vector otherwise.
13269 if (V2Index < 2)
13270 std::swap(LowV, HighV);
13271 NewMask[V2Index] -= 4;
13272 } else {
13273 // Handle the case where the V2 element ends up adjacent to a V1 element.
13274 // To make this work, blend them together as the first step.
13275 int V1Index = V2AdjIndex;
13276 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13277 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13278 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13279
13280 // Now proceed to reconstruct the final blend as we have the necessary
13281 // high or low half formed.
13282 if (V2Index < 2) {
13283 LowV = V2;
13284 HighV = V1;
13285 } else {
13286 HighV = V2;
13287 }
13288 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13289 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13290 }
13291 } else if (NumV2Elements == 2) {
13292 if (Mask[0] < 4 && Mask[1] < 4) {
13293 // Handle the easy case where we have V1 in the low lanes and V2 in the
13294 // high lanes.
13295 NewMask[2] -= 4;
13296 NewMask[3] -= 4;
13297 } else if (Mask[2] < 4 && Mask[3] < 4) {
13298 // We also handle the reversed case because this utility may get called
13299 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13300 // arrange things in the right direction.
13301 NewMask[0] -= 4;
13302 NewMask[1] -= 4;
13303 HighV = V1;
13304 LowV = V2;
13305 } else {
13306 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13307 // trying to place elements directly, just blend them and set up the final
13308 // shuffle to place them.
13309
13310 // The first two blend mask elements are for V1, the second two are for
13311 // V2.
13312 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13313 Mask[2] < 4 ? Mask[2] : Mask[3],
13314 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13315 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13316 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13317 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13318
13319 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13320 // a blend.
13321 LowV = HighV = V1;
13322 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13323 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13324 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13325 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13326 }
13327 } else if (NumV2Elements == 3) {
13328 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13329 // we can get here due to other paths (e.g. repeated mask matching) that we
13330 // don't want to do another round of lowerVECTOR_SHUFFLE.
13331 ShuffleVectorSDNode::commuteShuffleMask(NewMask, 4);
13332 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13333 }
13334 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13335 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13336}
13337
13338/// Lower 4-lane 32-bit floating point shuffles.
13339///
13340/// Uses instructions exclusively from the floating point unit to minimize
13341/// domain crossing penalties, as these are sufficient to implement all v4f32
13342/// shuffles.
13343 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13344 const APInt &Zeroable, SDValue V1, SDValue V2,
13345 const X86Subtarget &Subtarget,
13346 SelectionDAG &DAG) {
13347 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13348 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13349 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13350
13351 if (Subtarget.hasSSE41())
13352 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13353 Zeroable, Subtarget, DAG))
13354 return Blend;
13355
13356 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13357
13358 if (NumV2Elements == 0) {
13359 // Check for being able to broadcast a single element.
13360 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13361 Mask, Subtarget, DAG))
13362 return Broadcast;
13363
13364 // Use even/odd duplicate instructions for masks that match their pattern.
13365 if (Subtarget.hasSSE3()) {
13366 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13367 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13368 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13369 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13370 }
13371
13372 if (Subtarget.hasAVX()) {
13373 // If we have AVX, we can use VPERMILPS which will allow folding a load
13374 // into the shuffle.
13375 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13376 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13377 }
13378
13379 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13380 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13381 if (!Subtarget.hasSSE2()) {
13382 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13383 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13384 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13385 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13386 }
13387
13388 // Otherwise, use a straight shuffle of a single input vector. We pass the
13389 // input vector to both operands to simulate this with a SHUFPS.
13390 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13391 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13392 }
13393
13394 if (Subtarget.hasSSE2())
13395 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13396 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13397 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13398 return ZExt;
13399 }
13400
13401 if (Subtarget.hasAVX2())
13402 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13403 return Extract;
13404
13405 // There are special ways we can lower some single-element blends. However, we
13406 // have custom ways we can lower more complex single-element blends below that
13407 // we defer to if both this and BLENDPS fail to match, so restrict this to
13408 // when the V2 input is targeting element 0 of the mask -- that is the fast
13409 // case here.
13410 if (NumV2Elements == 1 && Mask[0] >= 4)
13411 if (SDValue V = lowerShuffleAsElementInsertion(
13412 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13413 return V;
13414
13415 if (Subtarget.hasSSE41()) {
13416 // Use INSERTPS if we can complete the shuffle efficiently.
13417 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13418 return V;
13419
13420 if (!isSingleSHUFPSMask(Mask))
13421 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13422 V2, Mask, DAG))
13423 return BlendPerm;
13424 }
13425
13426 // Use low/high mov instructions. These are only valid in SSE1 because
13427 // otherwise they are widened to v2f64 and never get here.
13428 if (!Subtarget.hasSSE2()) {
13429 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13430 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13431 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13432 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13433 }
13434
13435 // Use dedicated unpack instructions for masks that match their pattern.
13436 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13437 return V;
13438
13439 // Otherwise fall back to a SHUFPS lowering strategy.
13440 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13441}
13442
13443/// Lower 4-lane i32 vector shuffles.
13444///
13445/// We try to handle these with integer-domain shuffles where we can, but for
13446/// blends we use the floating point domain blend instructions.
13447 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13448 const APInt &Zeroable, SDValue V1, SDValue V2,
13449 const X86Subtarget &Subtarget,
13450 SelectionDAG &DAG) {
13451 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13452 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13453 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13454
13455 // Whenever we can lower this as a zext, that instruction is strictly faster
13456 // than any alternative. It also allows us to fold memory operands into the
13457 // shuffle in many cases.
13458 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13459 Zeroable, Subtarget, DAG))
13460 return ZExt;
13461
13462 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13463
13464 // Try to use shift instructions if fast.
13465 if (Subtarget.preferLowerShuffleAsShift()) {
13466 if (SDValue Shift =
13467 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13468 Subtarget, DAG, /*BitwiseOnly*/ true))
13469 return Shift;
13470 if (NumV2Elements == 0)
13471 if (SDValue Rotate =
13472 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13473 return Rotate;
13474 }
13475
13476 if (NumV2Elements == 0) {
13477 // Try to use broadcast unless the mask only has one non-undef element.
13478 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13479 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13480 Mask, Subtarget, DAG))
13481 return Broadcast;
13482 }
13483
13484 // Straight shuffle of a single input vector. For everything from SSE2
13485 // onward this has a single fast instruction with no scary immediates.
13486 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13487 // but we aren't actually going to use the UNPCK instruction because doing
13488 // so prevents folding a load into this instruction or making a copy.
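// E.g. {0, 0, 1, 1} is emitted as PSHUFD with immediate 0x50 rather than as
// UNPCKLDQ of V1 with itself.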
13489 const int UnpackLoMask[] = {0, 0, 1, 1};
13490 const int UnpackHiMask[] = {2, 2, 3, 3};
13491 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13492 Mask = UnpackLoMask;
13493 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13494 Mask = UnpackHiMask;
13495
13496 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13497 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13498 }
13499
13500 if (Subtarget.hasAVX2())
13501 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13502 return Extract;
13503
13504 // Try to use shift instructions.
13505 if (SDValue Shift =
13506 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13507 DAG, /*BitwiseOnly*/ false))
13508 return Shift;
13509
13510 // There are special ways we can lower some single-element blends.
13511 if (NumV2Elements == 1)
13512 if (SDValue V = lowerShuffleAsElementInsertion(
13513 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13514 return V;
13515
13516 // We have different paths for blend lowering, but they all must use the
13517 // *exact* same predicate.
13518 bool IsBlendSupported = Subtarget.hasSSE41();
13519 if (IsBlendSupported)
13520 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13521 Zeroable, Subtarget, DAG))
13522 return Blend;
13523
13524 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13525 Zeroable, Subtarget, DAG))
13526 return Masked;
13527
13528 // Use dedicated unpack instructions for masks that match their pattern.
13529 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13530 return V;
13531
13532 // Try to use byte rotation instructions.
13533 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13534 if (Subtarget.hasSSSE3()) {
13535 if (Subtarget.hasVLX())
13536 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13537 Zeroable, Subtarget, DAG))
13538 return Rotate;
13539
13540 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13541 Subtarget, DAG))
13542 return Rotate;
13543 }
13544
13545 // Assume that a single SHUFPS is faster than an alternative sequence of
13546 // multiple instructions (even if the CPU has a domain penalty).
13547 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13548 if (!isSingleSHUFPSMask(Mask)) {
13549 // If we have direct support for blends, we should lower by decomposing into
13550 // a permute. That will be faster than the domain cross.
13551 if (IsBlendSupported)
13552 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13553 Zeroable, Subtarget, DAG);
13554
13555 // Try to lower by permuting the inputs into an unpack instruction.
13556 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13557 Mask, Subtarget, DAG))
13558 return Unpack;
13559 }
13560
13561 // We implement this with SHUFPS because it can blend from two vectors.
13562 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13563 // up the inputs, bypassing domain shift penalties that we would incur if we
13564 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13565 // relevant.
13566 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13567 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13568 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13569 return DAG.getBitcast(MVT::v4i32, ShufPS);
13570}
13571
13572/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13573/// shuffle lowering, and the most complex part.
13574///
13575/// The lowering strategy is to try to form pairs of input lanes which are
13576/// targeted at the same half of the final vector, and then use a dword shuffle
13577/// to place them onto the right half, and finally unpack the paired lanes into
13578/// their final position.
13579///
13580/// The exact breakdown of how to form these dword pairs and align them on the
13581/// correct sides is really tricky. See the comments within the function for
13582/// more of the details.
13583///
13584/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13585/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13586/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13587/// vector, form the analogous 128-bit 8-element Mask.
13588 static SDValue lowerV8I16GeneralSingleInputShuffle(
13589 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13590 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13591 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13592 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13593
13594 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13595 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13596 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13597
13598 // Attempt to directly match PSHUFLW or PSHUFHW.
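// E.g. Mask = {2, 1, 3, 0, 4, 5, 6, 7} matches PSHUFLW directly, while
// Mask = {0, 1, 2, 3, 7, 6, 5, 4} matches PSHUFHW once its high half is
// rebased to {3, 2, 1, 0}.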
13599 if (isUndefOrInRange(LoMask, 0, 4) &&
13600 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13601 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13602 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13603 }
13604 if (isUndefOrInRange(HiMask, 4, 8) &&
13605 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13606 for (int i = 0; i != 4; ++i)
13607 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13608 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13609 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13610 }
13611
13612 SmallVector<int, 4> LoInputs;
13613 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13614 array_pod_sort(LoInputs.begin(), LoInputs.end());
13615 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13616 SmallVector<int, 4> HiInputs;
13617 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13618 array_pod_sort(HiInputs.begin(), HiInputs.end());
13619 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13620 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13621 int NumHToL = LoInputs.size() - NumLToL;
13622 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13623 int NumHToH = HiInputs.size() - NumLToH;
13624 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13625 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13626 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13627 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13628
13629 // If we are shuffling values from one half, check how many different DWORD
13630 // pairs we need to create. If only 1 or 2 then we can perform this as a
13631 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
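// E.g. Mask = {1, 0, 1, 0, 3, 2, 3, 2} needs only the pairs (1,0) and (3,2):
// PSHUFLW {1, 0, 3, 2} forms them in dwords 0 and 1, and PSHUFD {0, 0, 1, 1}
// then broadcasts each pair to its two destination dwords.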
13632 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13633 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13634 V = DAG.getNode(ShufWOp, DL, VT, V,
13635 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13636 V = DAG.getBitcast(PSHUFDVT, V);
13637 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13638 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13639 return DAG.getBitcast(VT, V);
13640 };
13641
13642 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13643 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13644 SmallVector<std::pair<int, int>, 4> DWordPairs;
13645 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13646
13647 // Collect the different DWORD pairs.
13648 for (int DWord = 0; DWord != 4; ++DWord) {
13649 int M0 = Mask[2 * DWord + 0];
13650 int M1 = Mask[2 * DWord + 1];
13651 M0 = (M0 >= 0 ? M0 % 4 : M0);
13652 M1 = (M1 >= 0 ? M1 % 4 : M1);
13653 if (M0 < 0 && M1 < 0)
13654 continue;
13655
13656 bool Match = false;
13657 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13658 auto &DWordPair = DWordPairs[j];
13659 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13660 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13661 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13662 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13663 PSHUFDMask[DWord] = DOffset + j;
13664 Match = true;
13665 break;
13666 }
13667 }
13668 if (!Match) {
13669 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13670 DWordPairs.push_back(std::make_pair(M0, M1));
13671 }
13672 }
13673
13674 if (DWordPairs.size() <= 2) {
13675 DWordPairs.resize(2, std::make_pair(-1, -1));
13676 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13677 DWordPairs[1].first, DWordPairs[1].second};
13678 if ((NumHToL + NumHToH) == 0)
13679 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13680 if ((NumLToL + NumLToH) == 0)
13681 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13682 }
13683 }
13684
13685 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13686 // such inputs we can swap two of the dwords across the half mark and end up
13687 // with <=2 inputs to each half in each half. Once there, we can fall through
13688 // to the generic code below. For example:
13689 //
13690 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13691 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13692 //
13693 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13694 // and an existing 2-into-2 on the other half. In this case we may have to
13695 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13696 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13697 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13698 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13699 // half than the one we target for fixing) will be fixed when we re-enter this
13700 // path. We will also combine any resulting sequence of PSHUFD instructions
13701 // into a single instruction. Here is an example of the tricky case:
13702 //
13703 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13704 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13705 //
13706 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13707 //
13708 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13709 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13710 //
13711 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13712 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13713 //
13714 // The result is fine to be handled by the generic logic.
13715 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13716 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13717 int AOffset, int BOffset) {
13718 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13719 "Must call this with A having 3 or 1 inputs from the A half.");
13720 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13721 "Must call this with B having 1 or 3 inputs from the B half.");
13722 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13723 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13724
13725 bool ThreeAInputs = AToAInputs.size() == 3;
13726
13727 // Compute the index of dword with only one word among the three inputs in
13728 // a half by taking the sum of the half with three inputs and subtracting
13729 // the sum of the actual three inputs. The difference is the remaining
13730 // slot.
13731 int ADWord = 0, BDWord = 0;
13732 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13733 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13734 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13735 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13736 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13737 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13738 int TripleNonInputIdx =
13739 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13740 TripleDWord = TripleNonInputIdx / 2;
13741
13742 // We use xor with one to compute the adjacent DWord to whichever one the
13743 // OneInput is in.
13744 OneInputDWord = (OneInput / 2) ^ 1;
13745
13746 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13747 // and BToA inputs. If there is also such a problem with the BToB and AToB
13748 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13749 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13750 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13751 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13752 // Compute how many inputs will be flipped by swapping these DWords.
13753 // We need to balance this to ensure we don't form a 3-1 shuffle in
13754 // the other half.
13755
13756 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13757 llvm::count(AToBInputs, 2 * ADWord + 1);
13758 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13759 llvm::count(BToBInputs, 2 * BDWord + 1);
13760 if ((NumFlippedAToBInputs == 1 &&
13761 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13762 (NumFlippedBToBInputs == 1 &&
13763 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13764 // We choose whether to fix the A half or B half based on whether that
13765 // half has zero flipped inputs. At zero, we may not be able to fix it
13766 // with that half. We also bias towards fixing the B half because that
13767 // will more commonly be the high half, and we have to bias one way.
13768 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13769 ArrayRef<int> Inputs) {
13770 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13771 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13772 // Determine whether the free index is in the flipped dword or the
13773 // unflipped dword based on where the pinned index is. We use this bit
13774 // in an xor to conditionally select the adjacent dword.
13775 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13776 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13777 if (IsFixIdxInput == IsFixFreeIdxInput)
13778 FixFreeIdx += 1;
13779 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13780 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13781 "We need to be changing the number of flipped inputs!");
13782 int PSHUFHalfMask[] = {0, 1, 2, 3};
13783 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13784 V = DAG.getNode(
13785 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13786 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13787 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13788
13789 for (int &M : Mask)
13790 if (M >= 0 && M == FixIdx)
13791 M = FixFreeIdx;
13792 else if (M >= 0 && M == FixFreeIdx)
13793 M = FixIdx;
13794 };
13795 if (NumFlippedBToBInputs != 0) {
13796 int BPinnedIdx =
13797 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13798 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13799 } else {
13800 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13801 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13802 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13803 }
13804 }
13805 }
13806
13807 int PSHUFDMask[] = {0, 1, 2, 3};
13808 PSHUFDMask[ADWord] = BDWord;
13809 PSHUFDMask[BDWord] = ADWord;
13810 V = DAG.getBitcast(
13811 VT,
13812 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13813 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13814
13815 // Adjust the mask to match the new locations of A and B.
13816 for (int &M : Mask)
13817 if (M >= 0 && M/2 == ADWord)
13818 M = 2 * BDWord + M % 2;
13819 else if (M >= 0 && M/2 == BDWord)
13820 M = 2 * ADWord + M % 2;
13821
13822 // Recurse back into this routine to re-compute state now that this isn't
13823 // a 3 and 1 problem.
13824 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13825 };
13826 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13827 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13828 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13829 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13830
13831 // At this point there are at most two inputs to the low and high halves from
13832 // each half. That means the inputs can always be grouped into dwords and
13833 // those dwords can then be moved to the correct half with a dword shuffle.
13834 // We use at most one low and one high word shuffle to collect these paired
13835 // inputs into dwords, and finally a dword shuffle to place them.
13836 int PSHUFLMask[4] = {-1, -1, -1, -1};
13837 int PSHUFHMask[4] = {-1, -1, -1, -1};
13838 int PSHUFDMask[4] = {-1, -1, -1, -1};
13839
13840 // First fix the masks for all the inputs that are staying in their
13841 // original halves. This will then dictate the targets of the cross-half
13842 // shuffles.
13843 auto fixInPlaceInputs =
13844 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13845 MutableArrayRef<int> SourceHalfMask,
13846 MutableArrayRef<int> HalfMask, int HalfOffset) {
13847 if (InPlaceInputs.empty())
13848 return;
13849 if (InPlaceInputs.size() == 1) {
13850 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13851 InPlaceInputs[0] - HalfOffset;
13852 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13853 return;
13854 }
13855 if (IncomingInputs.empty()) {
13856 // Just fix all of the in place inputs.
13857 for (int Input : InPlaceInputs) {
13858 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13859 PSHUFDMask[Input / 2] = Input / 2;
13860 }
13861 return;
13862 }
13863
13864 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13865 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13866 InPlaceInputs[0] - HalfOffset;
13867 // Put the second input next to the first so that they are packed into
13868 // a dword. We find the adjacent index by toggling the low bit.
13869 int AdjIndex = InPlaceInputs[0] ^ 1;
13870 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13871 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13872 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13873 };
13874 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13875 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13876
13877 // Now gather the cross-half inputs and place them into a free dword of
13878 // their target half.
13879 // FIXME: This operation could almost certainly be simplified dramatically to
13880 // look more like the 3-1 fixing operation.
13881 auto moveInputsToRightHalf = [&PSHUFDMask](
13882 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13883 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13884 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13885 int DestOffset) {
13886 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13887 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13888 };
13889 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13890 int Word) {
13891 int LowWord = Word & ~1;
13892 int HighWord = Word | 1;
13893 return isWordClobbered(SourceHalfMask, LowWord) ||
13894 isWordClobbered(SourceHalfMask, HighWord);
13895 };
13896
13897 if (IncomingInputs.empty())
13898 return;
13899
13900 if (ExistingInputs.empty()) {
13901 // Map any dwords with inputs from them into the right half.
13902 for (int Input : IncomingInputs) {
13903 // If the source half mask maps over the inputs, turn those into
13904 // swaps and use the swapped lane.
13905 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13906 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13907 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13908 Input - SourceOffset;
13909 // We have to swap the uses in our half mask in one sweep.
13910 for (int &M : HalfMask)
13911 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13912 M = Input;
13913 else if (M == Input)
13914 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13915 } else {
13916 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13917 Input - SourceOffset &&
13918 "Previous placement doesn't match!");
13919 }
13920 // Note that this correctly re-maps both when we do a swap and when
13921 // we observe the other side of the swap above. We rely on that to
13922 // avoid swapping the members of the input list directly.
13923 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13924 }
13925
13926 // Map the input's dword into the correct half.
13927 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13928 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13929 else
13930 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13931 Input / 2 &&
13932 "Previous placement doesn't match!");
13933 }
13934
13935 // And just directly shift any other-half mask elements to be same-half
13936 // as we will have mirrored the dword containing the element into the
13937 // same position within that half.
13938 for (int &M : HalfMask)
13939 if (M >= SourceOffset && M < SourceOffset + 4) {
13940 M = M - SourceOffset + DestOffset;
13941 assert(M >= 0 && "This should never wrap below zero!");
13942 }
13943 return;
13944 }
13945
13946 // Ensure we have the input in a viable dword of its current half. This
13947 // is particularly tricky because the original position may be clobbered
13948 // by inputs being moved and *staying* in that half.
13949 if (IncomingInputs.size() == 1) {
13950 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13951 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13952 SourceOffset;
13953 SourceHalfMask[InputFixed - SourceOffset] =
13954 IncomingInputs[0] - SourceOffset;
13955 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13956 InputFixed);
13957 IncomingInputs[0] = InputFixed;
13958 }
13959 } else if (IncomingInputs.size() == 2) {
13960 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13961 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13962 // We have two non-adjacent or clobbered inputs we need to extract from
13963 // the source half. To do this, we need to map them into some adjacent
13964 // dword slot in the source mask.
13965 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13966 IncomingInputs[1] - SourceOffset};
13967
13968 // If there is a free slot in the source half mask adjacent to one of
13969 // the inputs, place the other input in it. We use (Index XOR 1) to
13970 // compute an adjacent index.
13971 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13972 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13973 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13974 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13975 InputsFixed[1] = InputsFixed[0] ^ 1;
13976 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13977 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13978 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13979 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13980 InputsFixed[0] = InputsFixed[1] ^ 1;
13981 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13982 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13983 // The two inputs are in the same DWord but it is clobbered and the
13984 // adjacent DWord isn't used at all. Move both inputs to the free
13985 // slot.
13986 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13987 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13988 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13989 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13990 } else {
13991 // The only way we hit this point is if there is no clobbering
13992 // (because there are no off-half inputs to this half) and there is no
13993 // free slot adjacent to one of the inputs. In this case, we have to
13994 // swap an input with a non-input.
13995 for (int i = 0; i < 4; ++i)
13996 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13997 "We can't handle any clobbers here!");
13998 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13999 "Cannot have adjacent inputs here!");
14000
14001 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14002 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14003
14004 // We also have to update the final source mask in this case because
14005 // it may need to undo the above swap.
14006 for (int &M : FinalSourceHalfMask)
14007 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14008 M = InputsFixed[1] + SourceOffset;
14009 else if (M == InputsFixed[1] + SourceOffset)
14010 M = (InputsFixed[0] ^ 1) + SourceOffset;
14011
14012 InputsFixed[1] = InputsFixed[0] ^ 1;
14013 }
14014
14015 // Point everything at the fixed inputs.
14016 for (int &M : HalfMask)
14017 if (M == IncomingInputs[0])
14018 M = InputsFixed[0] + SourceOffset;
14019 else if (M == IncomingInputs[1])
14020 M = InputsFixed[1] + SourceOffset;
14021
14022 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14023 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14024 }
14025 } else {
14026 llvm_unreachable("Unhandled input size!");
14027 }
14028
14029 // Now hoist the DWord down to the right half.
14030 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14031 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14032 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14033 for (int &M : HalfMask)
14034 for (int Input : IncomingInputs)
14035 if (M == Input)
14036 M = FreeDWord * 2 + Input % 2;
14037 };
14038 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14039 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14040 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14041 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14042
14043 // Now enact all the shuffles we've computed to move the inputs into their
14044 // target half.
14045 if (!isNoopShuffleMask(PSHUFLMask))
14046 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14047 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14048 if (!isNoopShuffleMask(PSHUFHMask))
14049 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14050 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14051 if (!isNoopShuffleMask(PSHUFDMask))
14052 V = DAG.getBitcast(
14053 VT,
14054 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14055 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14056
14057 // At this point, each half should contain all its inputs, and we can then
14058 // just shuffle them into their final position.
14059 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
14060 "Failed to lift all the high half inputs to the low mask!");
14061 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14062 "Failed to lift all the low half inputs to the high mask!");
14063
14064 // Do a half shuffle for the low mask.
14065 if (!isNoopShuffleMask(LoMask))
14066 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14067 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14068
14069 // Do a half shuffle with the high mask after shifting its values down.
14070 for (int &M : HiMask)
14071 if (M >= 0)
14072 M -= 4;
14073 if (!isNoopShuffleMask(HiMask))
14074 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14075 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14076
14077 return V;
14078}
14079
14080/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14081/// blend if only one input is used.
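/// Bytes taken from the other input get the 0x80 "zero" index in their PSHUFB
/// control byte, so when both inputs are used the two shuffled values can
/// simply be OR'd together.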
14082 static SDValue lowerShuffleAsBlendOfPSHUFBs(
14083 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14084 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14086 "Lane crossing shuffle masks not supported");
14087
14088 int NumBytes = VT.getSizeInBits() / 8;
14089 int Size = Mask.size();
14090 int Scale = NumBytes / Size;
14091
14092 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14093 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14094 V1InUse = false;
14095 V2InUse = false;
14096
14097 for (int i = 0; i < NumBytes; ++i) {
14098 int M = Mask[i / Scale];
14099 if (M < 0)
14100 continue;
14101
14102 const int ZeroMask = 0x80;
14103 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14104 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14105 if (Zeroable[i / Scale])
14106 V1Idx = V2Idx = ZeroMask;
14107
14108 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14109 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14110 V1InUse |= (ZeroMask != V1Idx);
14111 V2InUse |= (ZeroMask != V2Idx);
14112 }
14113
14114 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14115 if (V1InUse)
14116 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14117 DAG.getBuildVector(ShufVT, DL, V1Mask));
14118 if (V2InUse)
14119 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14120 DAG.getBuildVector(ShufVT, DL, V2Mask));
14121
14122 // If we need shuffled inputs from both, blend the two.
14123 SDValue V;
14124 if (V1InUse && V2InUse)
14125 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14126 else
14127 V = V1InUse ? V1 : V2;
14128
14129 // Cast the result back to the correct type.
14130 return DAG.getBitcast(VT, V);
14131}
14132
14133/// Generic lowering of 8-lane i16 shuffles.
14134///
14135/// This handles both single-input shuffles and combined shuffle/blends with
14136/// two inputs. The single input shuffles are immediately delegated to
14137/// a dedicated lowering routine.
14138///
14139/// The blends are lowered in one of three fundamental ways. If there are few
14140/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14141/// of the input is significantly cheaper when lowered as an interleaving of
14142/// the two inputs, try to interleave them. Otherwise, blend the low and high
14143/// halves of the inputs separately (making them have relatively few inputs)
14144/// and then concatenate them.
14145 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14146 const APInt &Zeroable, SDValue V1, SDValue V2,
14147 const X86Subtarget &Subtarget,
14148 SelectionDAG &DAG) {
14149 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14150 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14151 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14152
14153 // Whenever we can lower this as a zext, that instruction is strictly faster
14154 // than any alternative.
14155 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14156 Zeroable, Subtarget, DAG))
14157 return ZExt;
14158
14159 // Try to lower using a truncation.
14160 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14161 Subtarget, DAG))
14162 return V;
14163
14164 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14165
14166 if (NumV2Inputs == 0) {
14167 // Try to use shift instructions.
14168 if (SDValue Shift =
14169 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14170 Subtarget, DAG, /*BitwiseOnly*/ false))
14171 return Shift;
14172
14173 // Check for being able to broadcast a single element.
14174 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14175 Mask, Subtarget, DAG))
14176 return Broadcast;
14177
14178 // Try to use bit rotation instructions.
14179 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14180 Subtarget, DAG))
14181 return Rotate;
14182
14183 // Use dedicated unpack instructions for masks that match their pattern.
14184 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14185 return V;
14186
14187 // Use dedicated pack instructions for masks that match their pattern.
14188 if (SDValue V =
14189 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14190 return V;
14191
14192 // Try to use byte rotation instructions.
14193 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14194 Subtarget, DAG))
14195 return Rotate;
14196
14197 // Make a copy of the mask so it can be modified.
14198 SmallVector<int, 8> MutableMask(Mask);
14199 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14200 Subtarget, DAG);
14201 }
14202
14203 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14204 "All single-input shuffles should be canonicalized to be V1-input "
14205 "shuffles.");
14206
14207 // Try to use shift instructions.
14208 if (SDValue Shift =
14209 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14210 DAG, /*BitwiseOnly*/ false))
14211 return Shift;
14212
14213 // See if we can use SSE4A Extraction / Insertion.
14214 if (Subtarget.hasSSE4A())
14215 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14216 Zeroable, DAG))
14217 return V;
14218
14219 // There are special ways we can lower some single-element blends.
14220 if (NumV2Inputs == 1)
14221 if (SDValue V = lowerShuffleAsElementInsertion(
14222 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14223 return V;
14224
14225 // We have different paths for blend lowering, but they all must use the
14226 // *exact* same predicate.
14227 bool IsBlendSupported = Subtarget.hasSSE41();
14228 if (IsBlendSupported)
14229 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14230 Zeroable, Subtarget, DAG))
14231 return Blend;
14232
14233 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14234 Zeroable, Subtarget, DAG))
14235 return Masked;
14236
14237 // Use dedicated unpack instructions for masks that match their pattern.
14238 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14239 return V;
14240
14241 // Use dedicated pack instructions for masks that match their pattern.
14242 if (SDValue V =
14243 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14244 return V;
14245
14246 // Try to lower using a truncation.
14247 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14248 Subtarget, DAG))
14249 return V;
14250
14251 // Try to use byte rotation instructions.
14252 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14253 Subtarget, DAG))
14254 return Rotate;
14255
14256 if (SDValue BitBlend =
14257 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14258 return BitBlend;
14259
14260 // Try to use byte shift instructions to mask.
14261 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14262 Zeroable, Subtarget, DAG))
14263 return V;
14264
14265 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
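// E.g. Mask = {0, 2, 4, 6, 8, 10, 12, 14} keeps only the even words; masking
// each dword down to its low 16 bits and then applying PACKUSDW produces
// exactly that compaction.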
14266 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14267 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14268 !Subtarget.hasVLX()) {
14269 // Check if this is part of a 256-bit vector truncation.
14270 unsigned PackOpc = 0;
14271 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14272 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14273 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14274 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14275 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14276 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14277 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14278 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14279 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14280 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14281 PackOpc = X86ISD::PACKUS;
14282 } else if (Subtarget.hasSSE41()) {
14283 SmallVector<SDValue, 4> DWordClearOps(4,
14284 DAG.getConstant(0, DL, MVT::i32));
14285 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14286 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14287 SDValue DWordClearMask =
14288 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14289 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14290 DWordClearMask);
14291 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14292 DWordClearMask);
14293 PackOpc = X86ISD::PACKUS;
14294 } else if (!Subtarget.hasSSSE3()) {
14295 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14296 V1 = DAG.getBitcast(MVT::v4i32, V1);
14297 V2 = DAG.getBitcast(MVT::v4i32, V2);
14298 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14299 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14300 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14301 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14302 PackOpc = X86ISD::PACKSS;
14303 }
14304 if (PackOpc) {
14305 // Now pack things back together.
14306 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14307 if (NumEvenDrops == 2) {
14308 Result = DAG.getBitcast(MVT::v4i32, Result);
14309 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14310 }
14311 return Result;
14312 }
14313 }
14314
14315 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
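// E.g. for Mask = {1, 3, 5, 7, 9, 11, 13, 15}, shifting every dword right by
// 16 moves the odd words into the low word of each dword so a single
// PACKUSDW/PACKSSDW can compact both inputs.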
14316 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14317 if (NumOddDrops == 1) {
14318 bool HasSSE41 = Subtarget.hasSSE41();
14319 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14320 DAG.getBitcast(MVT::v4i32, V1),
14321 DAG.getTargetConstant(16, DL, MVT::i8));
14322 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14323 DAG.getBitcast(MVT::v4i32, V2),
14324 DAG.getTargetConstant(16, DL, MVT::i8));
14325 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14326 MVT::v8i16, V1, V2);
14327 }
14328
14329 // Try to lower by permuting the inputs into an unpack instruction.
14330 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14331 Mask, Subtarget, DAG))
14332 return Unpack;
14333
14334 // If we can't directly blend but can use PSHUFB, that will be better as it
14335 // can both shuffle and set up the inefficient blend.
14336 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14337 bool V1InUse, V2InUse;
14338 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14339 Zeroable, DAG, V1InUse, V2InUse);
14340 }
14341
14342 // We can always bit-blend if we have to so the fallback strategy is to
14343 // decompose into single-input permutes and blends/unpacks.
14344 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14345 Zeroable, Subtarget, DAG);
14346}
14347
14348/// Lower 8-lane 16-bit floating point shuffles.
14349 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14350 const APInt &Zeroable, SDValue V1, SDValue V2,
14351 const X86Subtarget &Subtarget,
14352 SelectionDAG &DAG) {
14353 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14354 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14355 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14356 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14357
14358 if (Subtarget.hasFP16()) {
14359 if (NumV2Elements == 0) {
14360 // Check for being able to broadcast a single element.
14361 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14362 Mask, Subtarget, DAG))
14363 return Broadcast;
14364 }
14365 if (NumV2Elements == 1 && Mask[0] >= 8)
14366 if (SDValue V = lowerShuffleAsElementInsertion(
14367 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14368 return V;
14369 }
14370
14371 V1 = DAG.getBitcast(MVT::v8i16, V1);
14372 V2 = DAG.getBitcast(MVT::v8i16, V2);
14373 return DAG.getBitcast(MVT::v8f16,
14374 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14375}
14376
14377// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14378// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14379// the active subvector is extracted.
14380 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14381 ArrayRef<int> OriginalMask, SDValue V1,
14382 SDValue V2, const X86Subtarget &Subtarget,
14383 SelectionDAG &DAG) {
14384 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14385 SmallVector<int, 32> Mask(OriginalMask);
14386 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14387 !isShuffleFoldableLoad(V2)) {
14388 ShuffleVectorSDNode::commuteShuffleMask(Mask, Mask.size());
14389 std::swap(V1, V2);
14390 }
14391
14392 MVT MaskVT = VT.changeTypeToInteger();
14393 SDValue MaskNode;
14394 MVT ShuffleVT = VT;
14395 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14396 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14397 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14398 ShuffleVT = V1.getSimpleValueType();
14399
14400 // Adjust mask to correct indices for the second input.
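// E.g. a v8i32 shuffle widened to v16i32 has Scale == 2, so mask index 9
// (element 1 of V2) becomes 9 + 8 == 17, i.e. element 1 of the widened V2.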
14401 int NumElts = VT.getVectorNumElements();
14402 unsigned Scale = 512 / VT.getSizeInBits();
14403 SmallVector<int, 32> AdjustedMask(Mask);
14404 for (int &M : AdjustedMask)
14405 if (NumElts <= M)
14406 M += (Scale - 1) * NumElts;
14407 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14408 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14409 } else {
14410 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14411 }
14412
14413 SDValue Result;
14414 if (V2.isUndef())
14415 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14416 else
14417 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14418
14419 if (VT != ShuffleVT)
14420 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14421
14422 return Result;
14423}
14424
14425/// Generic lowering of v16i8 shuffles.
14426///
14427/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14428/// detect any complexity reducing interleaving. If that doesn't help, it uses
14429/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14430/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14431/// back together.
14432static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14433                                 const APInt &Zeroable, SDValue V1, SDValue V2,
14434 const X86Subtarget &Subtarget,
14435 SelectionDAG &DAG) {
14436 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14437 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14438 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14439
14440 // Try to use shift instructions.
14441 if (SDValue Shift =
14442 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14443 DAG, /*BitwiseOnly*/ false))
14444 return Shift;
14445
14446 // Try to use byte rotation instructions.
14447 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14448 Subtarget, DAG))
14449 return Rotate;
14450
14451 // Use dedicated pack instructions for masks that match their pattern.
14452 if (SDValue V =
14453 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14454 return V;
14455
14456 // Try to use a zext lowering.
14457 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14458 Zeroable, Subtarget, DAG))
14459 return ZExt;
14460
14461  // Try to lower using a truncation.
14462 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14463 Subtarget, DAG))
14464 return V;
14465
14466 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14467 Subtarget, DAG))
14468 return V;
14469
14470 // See if we can use SSE4A Extraction / Insertion.
14471 if (Subtarget.hasSSE4A())
14472 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14473 Zeroable, DAG))
14474 return V;
14475
14476 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14477
14478 // For single-input shuffles, there are some nicer lowering tricks we can use.
14479 if (NumV2Elements == 0) {
14480 // Check for being able to broadcast a single element.
14481 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14482 Mask, Subtarget, DAG))
14483 return Broadcast;
14484
14485 // Try to use bit rotation instructions.
14486 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14487 Subtarget, DAG))
14488 return Rotate;
14489
14490 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14491 return V;
14492
14493 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14494 // Notably, this handles splat and partial-splat shuffles more efficiently.
14495 // However, it only makes sense if the pre-duplication shuffle simplifies
14496 // things significantly. Currently, this means we need to be able to
14497 // express the pre-duplication shuffle as an i16 shuffle.
14498 //
14499 // FIXME: We should check for other patterns which can be widened into an
14500 // i16 shuffle as well.
14501 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14502 for (int i = 0; i < 16; i += 2)
14503 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14504 return false;
14505
14506 return true;
14507 };
14508 auto tryToWidenViaDuplication = [&]() -> SDValue {
14509 if (!canWidenViaDuplication(Mask))
14510 return SDValue();
14511 SmallVector<int, 4> LoInputs;
14512 copy_if(Mask, std::back_inserter(LoInputs),
14513 [](int M) { return M >= 0 && M < 8; });
14514 array_pod_sort(LoInputs.begin(), LoInputs.end());
14515 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14516 SmallVector<int, 4> HiInputs;
14517 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14518 array_pod_sort(HiInputs.begin(), HiInputs.end());
14519 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14520
14521 bool TargetLo = LoInputs.size() >= HiInputs.size();
14522 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14523 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14524
14525 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14526      SmallDenseMap<int, int, 8> LaneMap;
14527      for (int I : InPlaceInputs) {
14528 PreDupI16Shuffle[I/2] = I/2;
14529 LaneMap[I] = I;
14530 }
14531 int j = TargetLo ? 0 : 4, je = j + 4;
14532 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14533 // Check if j is already a shuffle of this input. This happens when
14534 // there are two adjacent bytes after we move the low one.
14535 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14536 // If we haven't yet mapped the input, search for a slot into which
14537 // we can map it.
14538 while (j < je && PreDupI16Shuffle[j] >= 0)
14539 ++j;
14540
14541 if (j == je)
14542 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14543 return SDValue();
14544
14545 // Map this input with the i16 shuffle.
14546 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14547 }
14548
14549 // Update the lane map based on the mapping we ended up with.
14550 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14551 }
14552 V1 = DAG.getBitcast(
14553 MVT::v16i8,
14554 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14555 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14556
14557 // Unpack the bytes to form the i16s that will be shuffled into place.
14558 bool EvenInUse = false, OddInUse = false;
14559 for (int i = 0; i < 16; i += 2) {
14560 EvenInUse |= (Mask[i + 0] >= 0);
14561 OddInUse |= (Mask[i + 1] >= 0);
14562 if (EvenInUse && OddInUse)
14563 break;
14564 }
14565 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14566 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14567 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14568
14569 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14570 for (int i = 0; i < 16; ++i)
14571 if (Mask[i] >= 0) {
14572 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14573 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14574 if (PostDupI16Shuffle[i / 2] < 0)
14575 PostDupI16Shuffle[i / 2] = MappedMask;
14576 else
14577 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14578 "Conflicting entries in the original shuffle!");
14579 }
14580 return DAG.getBitcast(
14581 MVT::v16i8,
14582 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14583 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14584 };
14585 if (SDValue V = tryToWidenViaDuplication())
14586 return V;
14587 }
14588
14589 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14590 Zeroable, Subtarget, DAG))
14591 return Masked;
14592
14593 // Use dedicated unpack instructions for masks that match their pattern.
14594 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14595 return V;
14596
14597 // Try to use byte shift instructions to mask.
14598 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14599 Zeroable, Subtarget, DAG))
14600 return V;
14601
14602 // Check for compaction patterns.
14603 bool IsSingleInput = V2.isUndef();
14604 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14605
14606 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14607 // with PSHUFB. It is important to do this before we attempt to generate any
14608 // blends but after all of the single-input lowerings. If the single input
14609 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14610 // want to preserve that and we can DAG combine any longer sequences into
14611 // a PSHUFB in the end. But once we start blending from multiple inputs,
14612 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14613 // and there are *very* few patterns that would actually be faster than the
14614 // PSHUFB approach because of its ability to zero lanes.
14615 //
14616 // If the mask is a binary compaction, we can more efficiently perform this
14617 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14618 //
14619 // FIXME: The only exceptions to the above are blends which are exact
14620 // interleavings with direct instructions supporting them. We currently don't
14621 // handle those well here.
14622 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14623 bool V1InUse = false;
14624 bool V2InUse = false;
14625
14626    SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14627        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14628
14629 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14630 // do so. This avoids using them to handle blends-with-zero which is
14631 // important as a single pshufb is significantly faster for that.
14632 if (V1InUse && V2InUse) {
14633 if (Subtarget.hasSSE41())
14634 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14635 Zeroable, Subtarget, DAG))
14636 return Blend;
14637
14638 // We can use an unpack to do the blending rather than an or in some
14639 // cases. Even though the or may be (very minorly) more efficient, we
14640      // prefer this lowering because there are common cases where part of
14641 // the complexity of the shuffles goes away when we do the final blend as
14642 // an unpack.
14643 // FIXME: It might be worth trying to detect if the unpack-feeding
14644 // shuffles will both be pshufb, in which case we shouldn't bother with
14645 // this.
14646      if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14647              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14648 return Unpack;
14649
14650 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14651 if (Subtarget.hasVBMI())
14652 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14653 DAG);
14654
14655 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14656 if (Subtarget.hasXOP()) {
14657 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14658 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14659 }
14660
14661 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14662 // PALIGNR will be cheaper than the second PSHUFB+OR.
14663      if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14664              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14665 return V;
14666 }
14667
14668 return PSHUFB;
14669 }
14670
14671 // There are special ways we can lower some single-element blends.
14672 if (NumV2Elements == 1)
14673    if (SDValue V = lowerShuffleAsElementInsertion(
14674            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14675 return V;
14676
14677 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14678 return Blend;
14679
14680 // Check whether a compaction lowering can be done. This handles shuffles
14681 // which take every Nth element for some even N. See the helper function for
14682 // details.
14683 //
14684 // We special case these as they can be particularly efficiently handled with
14685  // the PACKUSWB instruction on x86 and they show up in common patterns of
14686 // rearranging bytes to truncate wide elements.
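  // Worked example (illustrative): a truncating mask such as <0,2,4,...,30>
  // gives NumEvenDrops == 1; every i16 word of each input is masked with
  // 0x00FF to clear the dropped (odd) bytes, and a single PACKUS then
  // compacts the 16 surviving even bytes into the result.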
14687 if (NumEvenDrops) {
14688 // NumEvenDrops is the power of two stride of the elements. Another way of
14689 // thinking about it is that we need to drop the even elements this many
14690 // times to get the original input.
14691
14692 // First we need to zero all the dropped bytes.
14693 assert(NumEvenDrops <= 3 &&
14694 "No support for dropping even elements more than 3 times.");
14695 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14696 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14697 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14698 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14699 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14700 WordClearMask);
14701 if (!IsSingleInput)
14702 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14703 WordClearMask);
14704
14705 // Now pack things back together.
14706 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14707 IsSingleInput ? V1 : V2);
14708 for (int i = 1; i < NumEvenDrops; ++i) {
14709 Result = DAG.getBitcast(MVT::v8i16, Result);
14710 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14711 }
14712 return Result;
14713 }
14714
14715 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14716 if (NumOddDrops == 1) {
14717 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14718 DAG.getBitcast(MVT::v8i16, V1),
14719 DAG.getTargetConstant(8, DL, MVT::i8));
14720 if (!IsSingleInput)
14721 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14722 DAG.getBitcast(MVT::v8i16, V2),
14723 DAG.getTargetConstant(8, DL, MVT::i8));
14724 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14725 IsSingleInput ? V1 : V2);
14726 }
14727
14728 // Handle multi-input cases by blending/unpacking single-input shuffles.
14729 if (NumV2Elements > 0)
14730 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14731 Zeroable, Subtarget, DAG);
14732
14733 // The fallback path for single-input shuffles widens this into two v8i16
14734 // vectors with unpacks, shuffles those, and then pulls them back together
14735 // with a pack.
14736 SDValue V = V1;
14737
14738 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14739 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14740 for (int i = 0; i < 16; ++i)
14741 if (Mask[i] >= 0)
14742 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14743
14744 SDValue VLoHalf, VHiHalf;
14745 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14746 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14747 // i16s.
14748 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14749 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14750 // Use a mask to drop the high bytes.
14751 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14752 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14753 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14754
14755 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14756 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14757
14758 // Squash the masks to point directly into VLoHalf.
14759 for (int &M : LoBlendMask)
14760 if (M >= 0)
14761 M /= 2;
14762 for (int &M : HiBlendMask)
14763 if (M >= 0)
14764 M /= 2;
14765 } else {
14766 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14767 // VHiHalf so that we can blend them as i16s.
14768 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14769
14770 VLoHalf = DAG.getBitcast(
14771 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14772 VHiHalf = DAG.getBitcast(
14773 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14774 }
14775
14776 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14777 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14778
14779 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14780}
14781
14782/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14783///
14784/// This routine breaks down the specific type of 128-bit shuffle and
14785/// dispatches to the lowering routines accordingly.
14786static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14787                                  MVT VT, SDValue V1, SDValue V2,
14788 const APInt &Zeroable,
14789 const X86Subtarget &Subtarget,
14790 SelectionDAG &DAG) {
14791 if (VT == MVT::v8bf16) {
14792 V1 = DAG.getBitcast(MVT::v8i16, V1);
14793 V2 = DAG.getBitcast(MVT::v8i16, V2);
14794 return DAG.getBitcast(VT,
14795 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14796 }
14797
14798 switch (VT.SimpleTy) {
14799 case MVT::v2i64:
14800 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14801 case MVT::v2f64:
14802 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14803 case MVT::v4i32:
14804 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14805 case MVT::v4f32:
14806 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14807 case MVT::v8i16:
14808 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14809 case MVT::v8f16:
14810 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14811 case MVT::v16i8:
14812 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14813
14814 default:
14815 llvm_unreachable("Unimplemented!");
14816 }
14817}
14818
14819/// Generic routine to split vector shuffle into half-sized shuffles.
14820///
14821/// This routine just extracts two subvectors, shuffles them independently, and
14822/// then concatenates them back together. This should work effectively with all
14823/// AVX vector shuffle types.
14824static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14825                                    SDValue V2, ArrayRef<int> Mask,
14826 SelectionDAG &DAG, bool SimpleOnly) {
14827 assert(VT.getSizeInBits() >= 256 &&
14828 "Only for 256-bit or wider vector shuffles!");
14829 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14830 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14831
14832 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14833 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14834
14835 int NumElements = VT.getVectorNumElements();
14836 int SplitNumElements = NumElements / 2;
14837 MVT ScalarVT = VT.getVectorElementType();
14838 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14839
14840 // Use splitVector/extractSubVector so that split build-vectors just build two
14841 // narrower build vectors. This helps shuffling with splats and zeros.
14842 auto SplitVector = [&](SDValue V) {
14843 SDValue LoV, HiV;
14844 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14845 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14846 DAG.getBitcast(SplitVT, HiV));
14847 };
14848
14849 SDValue LoV1, HiV1, LoV2, HiV2;
14850 std::tie(LoV1, HiV1) = SplitVector(V1);
14851 std::tie(LoV2, HiV2) = SplitVector(V2);
14852
14853 // Now create two 4-way blends of these half-width vectors.
14854 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14855 bool &UseHiV1, bool &UseLoV2,
14856 bool &UseHiV2) {
14857 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14858 for (int i = 0; i < SplitNumElements; ++i) {
14859 int M = HalfMask[i];
14860 if (M >= NumElements) {
14861 if (M >= NumElements + SplitNumElements)
14862 UseHiV2 = true;
14863 else
14864 UseLoV2 = true;
14865 } else if (M >= 0) {
14866 if (M >= SplitNumElements)
14867 UseHiV1 = true;
14868 else
14869 UseLoV1 = true;
14870 }
14871 }
14872 };
14873
14874 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14875 if (!SimpleOnly)
14876 return true;
14877
14878 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14879 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14880
14881 return !(UseHiV1 || UseHiV2);
14882 };
14883
14884 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14885 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14886 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14887 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14888 for (int i = 0; i < SplitNumElements; ++i) {
14889 int M = HalfMask[i];
14890 if (M >= NumElements) {
14891 V2BlendMask[i] = M - NumElements;
14892 BlendMask[i] = SplitNumElements + i;
14893 } else if (M >= 0) {
14894 V1BlendMask[i] = M;
14895 BlendMask[i] = i;
14896 }
14897 }
14898
14899 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14900 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14901
14902 // Because the lowering happens after all combining takes place, we need to
14903 // manually combine these blend masks as much as possible so that we create
14904 // a minimal number of high-level vector shuffle nodes.
14905 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14906
14907 // First try just blending the halves of V1 or V2.
14908 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14909 return DAG.getUNDEF(SplitVT);
14910 if (!UseLoV2 && !UseHiV2)
14911 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14912 if (!UseLoV1 && !UseHiV1)
14913 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14914
14915 SDValue V1Blend, V2Blend;
14916 if (UseLoV1 && UseHiV1) {
14917 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14918 } else {
14919 // We only use half of V1 so map the usage down into the final blend mask.
14920 V1Blend = UseLoV1 ? LoV1 : HiV1;
14921 for (int i = 0; i < SplitNumElements; ++i)
14922 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14923 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14924 }
14925 if (UseLoV2 && UseHiV2) {
14926 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14927 } else {
14928 // We only use half of V2 so map the usage down into the final blend mask.
14929 V2Blend = UseLoV2 ? LoV2 : HiV2;
14930 for (int i = 0; i < SplitNumElements; ++i)
14931 if (BlendMask[i] >= SplitNumElements)
14932 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14933 }
14934 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14935 };
14936
14937 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14938 return SDValue();
14939
14940 SDValue Lo = HalfBlend(LoMask);
14941 SDValue Hi = HalfBlend(HiMask);
14942 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14943}
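// Worked example (illustrative): for a v8i32 interleave such as
// <0,8,1,9,4,12,5,13>, each half mask only needs one half of each input, so
// this emits shuffle(LoV1, LoV2, <0,4,1,5>) and shuffle(HiV1, HiV2, <0,4,1,5>)
// and concatenates the two v4i32 results back together.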
14944
14945/// Either split a vector in halves or decompose the shuffles and the
14946/// blend/unpack.
14947///
14948/// This is provided as a good fallback for many lowerings of non-single-input
14949/// shuffles with more than one 128-bit lane. In those cases, we want to select
14950/// between splitting the shuffle into 128-bit components and stitching those
14951/// back together vs. extracting the single-input shuffles and blending those
14952/// results.
14953static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14954                                          SDValue V2, ArrayRef<int> Mask,
14955 const APInt &Zeroable,
14956 const X86Subtarget &Subtarget,
14957 SelectionDAG &DAG) {
14958 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14959 "shuffles as it could then recurse on itself.");
14960 int Size = Mask.size();
14961
14962 // If this can be modeled as a broadcast of two elements followed by a blend,
14963 // prefer that lowering. This is especially important because broadcasts can
14964 // often fold with memory operands.
14965 auto DoBothBroadcast = [&] {
14966 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14967 for (int M : Mask)
14968 if (M >= Size) {
14969 if (V2BroadcastIdx < 0)
14970 V2BroadcastIdx = M - Size;
14971 else if (M - Size != V2BroadcastIdx)
14972 return false;
14973 } else if (M >= 0) {
14974 if (V1BroadcastIdx < 0)
14975 V1BroadcastIdx = M;
14976 else if (M != V1BroadcastIdx)
14977 return false;
14978 }
14979 return true;
14980 };
14981 if (DoBothBroadcast())
14982 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
14983 Subtarget, DAG);
14984
14985 // If the inputs all stem from a single 128-bit lane of each input, then we
14986 // split them rather than blending because the split will decompose to
14987 // unusually few instructions.
14988 int LaneCount = VT.getSizeInBits() / 128;
14989 int LaneSize = Size / LaneCount;
14990 SmallBitVector LaneInputs[2];
14991 LaneInputs[0].resize(LaneCount, false);
14992 LaneInputs[1].resize(LaneCount, false);
14993 for (int i = 0; i < Size; ++i)
14994 if (Mask[i] >= 0)
14995 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14996 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14997 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14998 /*SimpleOnly*/ false);
14999
15000 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15001 // requires that the decomposed single-input shuffles don't end up here.
15002 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15003 Subtarget, DAG);
15004}
15005
15006// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15007// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15008static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15009                                                 SDValue V1, SDValue V2,
15010 ArrayRef<int> Mask,
15011 SelectionDAG &DAG) {
15012 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15013
15014 int LHSMask[4] = {-1, -1, -1, -1};
15015 int RHSMask[4] = {-1, -1, -1, -1};
15016 int SHUFPDMask[4] = {-1, -1, -1, -1};
15017
15018 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15019 // perform the shuffle once the lanes have been shuffled in place.
15020 for (int i = 0; i != 4; ++i) {
15021 int M = Mask[i];
15022 if (M < 0)
15023 continue;
15024 int LaneBase = i & ~1;
15025 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15026 LaneMask[LaneBase + (M & 1)] = M;
15027 SHUFPDMask[i] = M & 1;
15028 }
15029
15030 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15031 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15032 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15033 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15034}
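// Worked example (illustrative): for a v4f64 mask <2,5,0,7> the loop builds
// LHSMask = <2,u,0,u>, RHSMask = <u,5,u,7> and SHUFPDMask = <0,1,0,1>, so the
// final SHUFPD takes its even result elements from the LHS permute and its
// odd result elements from the RHS permute.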
15035
15036/// Lower a vector shuffle crossing multiple 128-bit lanes as
15037/// a lane permutation followed by a per-lane permutation.
15038///
15039/// This is mainly for cases where we can have non-repeating permutes
15040/// in each lane.
15041///
15042/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask;
15043/// we should investigate merging them.
15044static SDValue lowerShuffleAsLanePermuteAndPermute(
15045    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15046 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15047 int NumElts = VT.getVectorNumElements();
15048 int NumLanes = VT.getSizeInBits() / 128;
15049 int NumEltsPerLane = NumElts / NumLanes;
15050 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15051
15052 /// Attempts to find a sublane permute with the given size
15053 /// that gets all elements into their target lanes.
15054 ///
15055  /// If successful, fills CrossLaneMask and InLaneMask and returns the shuffle.
15056  /// If unsuccessful, returns SDValue() and may overwrite InLaneMask.
15057 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15058 int NumSublanesPerLane = NumSublanes / NumLanes;
15059 int NumEltsPerSublane = NumElts / NumSublanes;
15060
15061 SmallVector<int, 16> CrossLaneMask;
15062 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15063 // CrossLaneMask but one entry == one sublane.
15064 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15065 APInt DemandedCrossLane = APInt::getZero(NumElts);
15066
15067 for (int i = 0; i != NumElts; ++i) {
15068 int M = Mask[i];
15069 if (M < 0)
15070 continue;
15071
15072 int SrcSublane = M / NumEltsPerSublane;
15073 int DstLane = i / NumEltsPerLane;
15074
15075 // We only need to get the elements into the right lane, not sublane.
15076 // So search all sublanes that make up the destination lane.
15077 bool Found = false;
15078 int DstSubStart = DstLane * NumSublanesPerLane;
15079 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15080 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15081 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15082 continue;
15083
15084 Found = true;
15085 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15086 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15087 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15088 DemandedCrossLane.setBit(InLaneMask[i]);
15089 break;
15090 }
15091 if (!Found)
15092 return SDValue();
15093 }
15094
15095 // Fill CrossLaneMask using CrossLaneMaskLarge.
15096 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15097
15098 if (!CanUseSublanes) {
15099 // If we're only shuffling a single lowest lane and the rest are identity
15100 // then don't bother.
15101 // TODO - isShuffleMaskInputInPlace could be extended to something like
15102 // this.
15103 int NumIdentityLanes = 0;
15104 bool OnlyShuffleLowestLane = true;
15105 for (int i = 0; i != NumLanes; ++i) {
15106 int LaneOffset = i * NumEltsPerLane;
15107 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15108 i * NumEltsPerLane))
15109 NumIdentityLanes++;
15110 else if (CrossLaneMask[LaneOffset] != 0)
15111 OnlyShuffleLowestLane = false;
15112 }
15113 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15114 return SDValue();
15115 }
15116
15117 // Avoid returning the same shuffle operation. For example,
15118 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15119 // undef:v16i16
15120 if (CrossLaneMask == Mask || InLaneMask == Mask)
15121 return SDValue();
15122
15123 // Simplify CrossLaneMask based on the actual demanded elements.
15124 if (V1.hasOneUse())
15125 for (int i = 0; i != NumElts; ++i)
15126 if (!DemandedCrossLane[i])
15127 CrossLaneMask[i] = SM_SentinelUndef;
15128
15129 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15130 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15131 InLaneMask);
15132 };
15133
15134 // First attempt a solution with full lanes.
15135 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15136 return V;
15137
15138 // The rest of the solutions use sublanes.
15139 if (!CanUseSublanes)
15140 return SDValue();
15141
15142 // Then attempt a solution with 64-bit sublanes (vpermq).
15143 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15144 return V;
15145
15146 // If that doesn't work and we have fast variable cross-lane shuffle,
15147 // attempt 32-bit sublanes (vpermd).
15148 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15149 return SDValue();
15150
15151 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15152}
15153
15154/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15155static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15156 SmallVector<int> &InLaneMask) {
15157 int Size = Mask.size();
15158 InLaneMask.assign(Mask.begin(), Mask.end());
15159 for (int i = 0; i < Size; ++i) {
15160 int &M = InLaneMask[i];
15161 if (M < 0)
15162 continue;
15163 if (((M % Size) / LaneSize) != (i / LaneSize))
15164 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15165 }
15166}
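// Worked example (illustrative): with Size = 8 and LaneSize = 4, a cross-lane
// element such as M = 6 at position 0 is rewritten to
// (6 % 4) + (0 / 4) * 4 + 8 = 10, i.e. it is redirected to the second shuffle
// operand, which the caller below populates with a lane-flipped copy of V1.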
15167
15168/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15169/// source with a lane permutation.
15170///
15171/// This lowering strategy results in four instructions in the worst case for a
15172/// single-input cross lane shuffle which is lower than any other fully general
15173/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15174/// shuffle pattern should be handled prior to trying this lowering.
15175static SDValue lowerShuffleAsLanePermuteAndShuffle(
15176    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15177 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15178 // FIXME: This should probably be generalized for 512-bit vectors as well.
15179 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15180 int Size = Mask.size();
15181 int LaneSize = Size / 2;
15182
15183 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15184 // Only do this if the elements aren't all from the lower lane,
15185 // otherwise we're (probably) better off doing a split.
15186 if (VT == MVT::v4f64 &&
15187 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15188 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15189
15190 // If there are only inputs from one 128-bit lane, splitting will in fact be
15191 // less expensive. The flags track whether the given lane contains an element
15192 // that crosses to another lane.
15193 bool AllLanes;
15194 if (!Subtarget.hasAVX2()) {
15195 bool LaneCrossing[2] = {false, false};
15196 for (int i = 0; i < Size; ++i)
15197 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15198 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15199 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15200 } else {
15201 bool LaneUsed[2] = {false, false};
15202 for (int i = 0; i < Size; ++i)
15203 if (Mask[i] >= 0)
15204 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15205 AllLanes = LaneUsed[0] && LaneUsed[1];
15206 }
15207
15208 // TODO - we could support shuffling V2 in the Flipped input.
15209 assert(V2.isUndef() &&
15210 "This last part of this routine only works on single input shuffles");
15211
15212 SmallVector<int> InLaneMask;
15213 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15214
15215 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15216 "In-lane shuffle mask expected");
15217
15218 // If we're not using both lanes in each lane and the inlane mask is not
15219 // repeating, then we're better off splitting.
15220 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15221 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15222 /*SimpleOnly*/ false);
15223
15224 // Flip the lanes, and shuffle the results which should now be in-lane.
15225 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15226 SDValue Flipped = DAG.getBitcast(PVT, V1);
15227 Flipped =
15228 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15229 Flipped = DAG.getBitcast(VT, Flipped);
15230 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15231}
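// Worked example (illustrative): for a single-input v8f32 mask such as
// <6,7,4,5,2,3,0,1>, the lanes of V1 are flipped with a <2,3,0,1> v4f64
// permute and the computed in-lane mask <10,11,8,9,14,15,12,13> then picks
// every element from the flipped copy using only in-lane movement,
// reproducing the original shuffle.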
15232
15233/// Handle lowering 2-lane 128-bit shuffles.
15234static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15235                                  SDValue V2, ArrayRef<int> Mask,
15236 const APInt &Zeroable,
15237 const X86Subtarget &Subtarget,
15238 SelectionDAG &DAG) {
15239 if (V2.isUndef()) {
15240 // Attempt to match VBROADCAST*128 subvector broadcast load.
15241 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15242 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15243 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15244        X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15245      MVT MemVT = VT.getHalfNumVectorElementsVT();
15246 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15247 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15248      if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15249                                             VT, MemVT, Ld, Ofs, DAG))
15250 return BcstLd;
15251 }
15252
15253 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15254 if (Subtarget.hasAVX2())
15255 return SDValue();
15256 }
15257
15258 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15259
15260 SmallVector<int, 4> WidenedMask;
15261 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15262 return SDValue();
15263
15264 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15265 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15266
15267 // Try to use an insert into a zero vector.
15268 if (WidenedMask[0] == 0 && IsHighZero) {
15269 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15270 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15271 DAG.getVectorIdxConstant(0, DL));
15272 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15273 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15274 DAG.getVectorIdxConstant(0, DL));
15275 }
15276
15277 // TODO: If minimizing size and one of the inputs is a zero vector and the
15278  // zero vector has only one use, we could use a VPERM2X128 to save the
15279 // instruction bytes needed to explicitly generate the zero vector.
15280
15281 // Blends are faster and handle all the non-lane-crossing cases.
15282 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15283 Subtarget, DAG))
15284 return Blend;
15285
15286 // If either input operand is a zero vector, use VPERM2X128 because its mask
15287 // allows us to replace the zero input with an implicit zero.
15288 if (!IsLowZero && !IsHighZero) {
15289 // Check for patterns which can be matched with a single insert of a 128-bit
15290 // subvector.
15291 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15292 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15293
15294 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15295 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15296 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15297 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15298 SDValue SubVec =
15299 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15300 DAG.getVectorIdxConstant(0, DL));
15301 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15302 DAG.getVectorIdxConstant(2, DL));
15303 }
15304 }
15305
15306 // Try to use SHUF128 if possible.
15307 if (Subtarget.hasVLX()) {
15308 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15309 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15310 ((WidenedMask[1] % 2) << 1);
15311 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15312 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15313 }
15314 }
15315 }
15316
15317 // Otherwise form a 128-bit permutation. After accounting for undefs,
15318 // convert the 64-bit shuffle mask selection values into 128-bit
15319 // selection bits by dividing the indexes by 2 and shifting into positions
15320 // defined by a vperm2*128 instruction's immediate control byte.
15321
15322 // The immediate permute control byte looks like this:
15323 // [1:0] - select 128 bits from sources for low half of destination
15324 // [2] - ignore
15325 // [3] - zero low half of destination
15326 // [5:4] - select 128 bits from sources for high half of destination
15327 // [6] - ignore
15328 // [7] - zero high half of destination
15329
15330 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15331 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15332
15333 unsigned PermMask = 0;
15334 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15335 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15336
15337 // Check the immediate mask and replace unused sources with undef.
15338 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15339 V1 = DAG.getUNDEF(VT);
15340 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15341 V2 = DAG.getUNDEF(VT);
15342
15343 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15344 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15345}
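// Worked example (illustrative): a v4f64 mask <2,3,6,7> widens to <1,3>
// (upper half of V1, upper half of V2), giving PermMask = (1 << 0) | (3 << 4)
// = 0x31, the classic VPERM2F128 $0x31 pattern; a zeroable half would instead
// set bit 3 (0x08) or bit 7 (0x80) to use the implicit zero input.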
15346
15347/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15348/// shuffling each lane.
15349///
15350/// This attempts to create a repeated lane shuffle where each lane uses one
15351/// or two of the lanes of the inputs. The lanes of the input vectors are
15352/// shuffled in one or two independent shuffles to get the lanes into the
15353/// position needed by the final shuffle.
15354static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15355    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15356 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15357 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15358
15359 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15360 return SDValue();
15361
15362 int NumElts = Mask.size();
15363 int NumLanes = VT.getSizeInBits() / 128;
15364 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15365 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15366 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15367
15368 // First pass will try to fill in the RepeatMask from lanes that need two
15369 // sources.
15370 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15371 int Srcs[2] = {-1, -1};
15372 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15373 for (int i = 0; i != NumLaneElts; ++i) {
15374 int M = Mask[(Lane * NumLaneElts) + i];
15375 if (M < 0)
15376 continue;
15377 // Determine which of the possible input lanes (NumLanes from each source)
15378 // this element comes from. Assign that as one of the sources for this
15379      // lane. We can assign up to 2 sources for this lane. If we run out of
15380 // sources we can't do anything.
15381 int LaneSrc = M / NumLaneElts;
15382 int Src;
15383 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15384 Src = 0;
15385 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15386 Src = 1;
15387 else
15388 return SDValue();
15389
15390 Srcs[Src] = LaneSrc;
15391 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15392 }
15393
15394 // If this lane has two sources, see if it fits with the repeat mask so far.
15395 if (Srcs[1] < 0)
15396 continue;
15397
15398 LaneSrcs[Lane][0] = Srcs[0];
15399 LaneSrcs[Lane][1] = Srcs[1];
15400
15401 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15402 assert(M1.size() == M2.size() && "Unexpected mask size");
15403 for (int i = 0, e = M1.size(); i != e; ++i)
15404 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15405 return false;
15406 return true;
15407 };
15408
15409 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15410 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15411 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15412 int M = Mask[i];
15413 if (M < 0)
15414 continue;
15415 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15416 "Unexpected mask element");
15417 MergedMask[i] = M;
15418 }
15419 };
15420
15421 if (MatchMasks(InLaneMask, RepeatMask)) {
15422 // Merge this lane mask into the final repeat mask.
15423 MergeMasks(InLaneMask, RepeatMask);
15424 continue;
15425 }
15426
15427 // Didn't find a match. Swap the operands and try again.
15428 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15429    ShuffleVectorSDNode::commuteMask(InLaneMask);
15430
15431 if (MatchMasks(InLaneMask, RepeatMask)) {
15432 // Merge this lane mask into the final repeat mask.
15433 MergeMasks(InLaneMask, RepeatMask);
15434 continue;
15435 }
15436
15437 // Couldn't find a match with the operands in either order.
15438 return SDValue();
15439 }
15440
15441 // Now handle any lanes with only one source.
15442 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15443 // If this lane has already been processed, skip it.
15444 if (LaneSrcs[Lane][0] >= 0)
15445 continue;
15446
15447 for (int i = 0; i != NumLaneElts; ++i) {
15448 int M = Mask[(Lane * NumLaneElts) + i];
15449 if (M < 0)
15450 continue;
15451
15452      // If RepeatMask isn't defined yet, we can define it ourselves.
15453 if (RepeatMask[i] < 0)
15454 RepeatMask[i] = M % NumLaneElts;
15455
15456 if (RepeatMask[i] < NumElts) {
15457 if (RepeatMask[i] != M % NumLaneElts)
15458 return SDValue();
15459 LaneSrcs[Lane][0] = M / NumLaneElts;
15460 } else {
15461 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15462 return SDValue();
15463 LaneSrcs[Lane][1] = M / NumLaneElts;
15464 }
15465 }
15466
15467 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15468 return SDValue();
15469 }
15470
15471 SmallVector<int, 16> NewMask(NumElts, -1);
15472 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15473 int Src = LaneSrcs[Lane][0];
15474 for (int i = 0; i != NumLaneElts; ++i) {
15475 int M = -1;
15476 if (Src >= 0)
15477 M = Src * NumLaneElts + i;
15478 NewMask[Lane * NumLaneElts + i] = M;
15479 }
15480 }
15481 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15482 // Ensure we didn't get back the shuffle we started with.
15483 // FIXME: This is a hack to make up for some splat handling code in
15484 // getVectorShuffle.
15485 if (isa<ShuffleVectorSDNode>(NewV1) &&
15486 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15487 return SDValue();
15488
15489 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15490 int Src = LaneSrcs[Lane][1];
15491 for (int i = 0; i != NumLaneElts; ++i) {
15492 int M = -1;
15493 if (Src >= 0)
15494 M = Src * NumLaneElts + i;
15495 NewMask[Lane * NumLaneElts + i] = M;
15496 }
15497 }
15498 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15499 // Ensure we didn't get back the shuffle we started with.
15500 // FIXME: This is a hack to make up for some splat handling code in
15501 // getVectorShuffle.
15502 if (isa<ShuffleVectorSDNode>(NewV2) &&
15503 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15504 return SDValue();
15505
15506 for (int i = 0; i != NumElts; ++i) {
15507 if (Mask[i] < 0) {
15508 NewMask[i] = -1;
15509 continue;
15510 }
15511 NewMask[i] = RepeatMask[i % NumLaneElts];
15512 if (NewMask[i] < 0)
15513 continue;
15514
15515 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15516 }
15517 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15518}
15519
15520/// If the input shuffle mask results in a vector that is undefined in all upper
15521/// or lower half elements and that mask accesses only 2 halves of the
15522/// shuffle's operands, return true. A mask of half the width with mask indexes
15523/// adjusted to access the extracted halves of the original shuffle operands is
15524/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15525/// lower half of each input operand is accessed.
15526static bool
15527getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15528                   int &HalfIdx1, int &HalfIdx2) {
15529 assert((Mask.size() == HalfMask.size() * 2) &&
15530 "Expected input mask to be twice as long as output");
15531
15532 // Exactly one half of the result must be undef to allow narrowing.
15533 bool UndefLower = isUndefLowerHalf(Mask);
15534 bool UndefUpper = isUndefUpperHalf(Mask);
15535 if (UndefLower == UndefUpper)
15536 return false;
15537
15538 unsigned HalfNumElts = HalfMask.size();
15539 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15540 HalfIdx1 = -1;
15541 HalfIdx2 = -1;
15542 for (unsigned i = 0; i != HalfNumElts; ++i) {
15543 int M = Mask[i + MaskIndexOffset];
15544 if (M < 0) {
15545 HalfMask[i] = M;
15546 continue;
15547 }
15548
15549 // Determine which of the 4 half vectors this element is from.
15550 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15551 int HalfIdx = M / HalfNumElts;
15552
15553 // Determine the element index into its half vector source.
15554 int HalfElt = M % HalfNumElts;
15555
15556 // We can shuffle with up to 2 half vectors, set the new 'half'
15557 // shuffle mask accordingly.
15558 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15559 HalfMask[i] = HalfElt;
15560 HalfIdx1 = HalfIdx;
15561 continue;
15562 }
15563 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15564 HalfMask[i] = HalfElt + HalfNumElts;
15565 HalfIdx2 = HalfIdx;
15566 continue;
15567 }
15568
15569 // Too many half vectors referenced.
15570 return false;
15571 }
15572
15573 return true;
15574}
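// Worked example (illustrative): for a v8i32 mask <u,u,u,u,8,9,2,3> the upper
// half is the only defined half, HalfIdx1 becomes 2 (lower half of V2),
// HalfIdx2 becomes 0 (lower half of V1), and HalfMask is <0,1,6,7> addressing
// the two extracted 128-bit halves.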
15575
15576/// Given the output values from getHalfShuffleMask(), create a half width
15577/// shuffle of extracted vectors followed by an insert back to full width.
15578static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15579                                     ArrayRef<int> HalfMask, int HalfIdx1,
15580 int HalfIdx2, bool UndefLower,
15581 SelectionDAG &DAG, bool UseConcat = false) {
15582 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15583 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15584
15585 MVT VT = V1.getSimpleValueType();
15586 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15587 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15588
15589 auto getHalfVector = [&](int HalfIdx) {
15590 if (HalfIdx < 0)
15591 return DAG.getUNDEF(HalfVT);
15592 SDValue V = (HalfIdx < 2 ? V1 : V2);
15593 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15594 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15595 DAG.getVectorIdxConstant(HalfIdx, DL));
15596 };
15597
15598 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15599 SDValue Half1 = getHalfVector(HalfIdx1);
15600 SDValue Half2 = getHalfVector(HalfIdx2);
15601 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15602 if (UseConcat) {
15603 SDValue Op0 = V;
15604 SDValue Op1 = DAG.getUNDEF(HalfVT);
15605 if (UndefLower)
15606 std::swap(Op0, Op1);
15607 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15608 }
15609
15610 unsigned Offset = UndefLower ? HalfNumElts : 0;
15611 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15612                     DAG.getVectorIdxConstant(Offset, DL));
15613}
15614
15615/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15616/// This allows for fast cases such as subvector extraction/insertion
15617/// or shuffling smaller vector types which can lower more efficiently.
15618static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15619                                         SDValue V2, ArrayRef<int> Mask,
15620 const X86Subtarget &Subtarget,
15621 SelectionDAG &DAG) {
15622 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15623 "Expected 256-bit or 512-bit vector");
15624
15625 bool UndefLower = isUndefLowerHalf(Mask);
15626 if (!UndefLower && !isUndefUpperHalf(Mask))
15627 return SDValue();
15628
15629 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15630 "Completely undef shuffle mask should have been simplified already");
15631
15632 // Upper half is undef and lower half is whole upper subvector.
15633 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15634 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15635 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15636 if (!UndefLower &&
15637 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15638 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15639 DAG.getVectorIdxConstant(HalfNumElts, DL));
15640 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15641 DAG.getVectorIdxConstant(0, DL));
15642 }
15643
15644 // Lower half is undef and upper half is whole lower subvector.
15645 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15646 if (UndefLower &&
15647 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15648 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15649 DAG.getVectorIdxConstant(0, DL));
15650 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15651 DAG.getVectorIdxConstant(HalfNumElts, DL));
15652 }
15653
15654 int HalfIdx1, HalfIdx2;
15655 SmallVector<int, 8> HalfMask(HalfNumElts);
15656 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15657 return SDValue();
15658
15659 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15660
15661 // Only shuffle the halves of the inputs when useful.
15662 unsigned NumLowerHalves =
15663 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15664 unsigned NumUpperHalves =
15665 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15666 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15667
15668 // Determine the larger pattern of undef/halves, then decide if it's worth
15669 // splitting the shuffle based on subtarget capabilities and types.
15670 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15671 if (!UndefLower) {
15672 // XXXXuuuu: no insert is needed.
15673 // Always extract lowers when setting lower - these are all free subreg ops.
15674 if (NumUpperHalves == 0)
15675 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15676 UndefLower, DAG);
15677
15678 if (NumUpperHalves == 1) {
15679 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15680 if (Subtarget.hasAVX2()) {
15681 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
15682 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15683 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15684 (!isSingleSHUFPSMask(HalfMask) ||
15685 Subtarget.hasFastVariableCrossLaneShuffle()))
15686 return SDValue();
15687        // If this is a unary shuffle (assume that the 2nd operand is
15688 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15689 // are better off extracting the upper half of 1 operand and using a
15690 // narrow shuffle.
15691 if (EltWidth == 64 && V2.isUndef())
15692 return SDValue();
15693        // If this is a unary vXi8 shuffle with in-place halves, then perform as
15694 // full width pshufb, and then merge.
15695 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
15696 return SDValue();
15697 }
15698 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15699 if (Subtarget.hasAVX512() && VT.is512BitVector())
15700 return SDValue();
15701 // Extract + narrow shuffle is better than the wide alternative.
15702 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15703 UndefLower, DAG);
15704 }
15705
15706 // Don't extract both uppers, instead shuffle and then extract.
15707 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15708 return SDValue();
15709 }
15710
15711 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15712 if (NumUpperHalves == 0) {
15713 // AVX2 has efficient 64-bit element cross-lane shuffles.
15714 // TODO: Refine to account for unary shuffle, splat, and other masks?
15715 if (Subtarget.hasAVX2() && EltWidth == 64)
15716 return SDValue();
15717 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15718 if (Subtarget.hasAVX512() && VT.is512BitVector())
15719 return SDValue();
15720 // Narrow shuffle + insert is better than the wide alternative.
15721 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15722 UndefLower, DAG);
15723 }
15724
15725 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15726 return SDValue();
15727}
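// Worked example (illustrative): a v8f32 mask <4,5,6,7,u,u,u,u> hits the
// first special case above: the upper 128-bit half of V1 is extracted and
// re-inserted at position 0, leaving the (undef) upper result half alone.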
15728
15729/// Handle case where shuffle sources are coming from the same 128-bit lane and
15730/// every lane can be represented as the same repeating mask - allowing us to
15731/// shuffle the sources with the repeating shuffle and then permute the result
15732/// to the destination lanes.
15733static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15734    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15735 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15736 int NumElts = VT.getVectorNumElements();
15737 int NumLanes = VT.getSizeInBits() / 128;
15738 int NumLaneElts = NumElts / NumLanes;
15739
15740 // On AVX2 we may be able to just shuffle the lowest elements and then
15741 // broadcast the result.
15742 if (Subtarget.hasAVX2()) {
15743 for (unsigned BroadcastSize : {16, 32, 64}) {
15744 if (BroadcastSize <= VT.getScalarSizeInBits())
15745 continue;
15746 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15747
15748 // Attempt to match a repeating pattern every NumBroadcastElts,
15749      // accounting for UNDEFs but only referencing the lowest 128-bit
15750 // lane of the inputs.
15751 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15752 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15753 for (int j = 0; j != NumBroadcastElts; ++j) {
15754 int M = Mask[i + j];
15755 if (M < 0)
15756 continue;
15757 int &R = RepeatMask[j];
15758 if (0 != ((M % NumElts) / NumLaneElts))
15759 return false;
15760 if (0 <= R && R != M)
15761 return false;
15762 R = M;
15763 }
15764 return true;
15765 };
15766
15767 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15768 if (!FindRepeatingBroadcastMask(RepeatMask))
15769 continue;
15770
15771 // Shuffle the (lowest) repeated elements in place for broadcast.
15772 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15773
15774 // Shuffle the actual broadcast.
15775 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15776 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15777 for (int j = 0; j != NumBroadcastElts; ++j)
15778 BroadcastMask[i + j] = j;
15779
15780 // Avoid returning the same shuffle operation. For example,
15781 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15782 if (BroadcastMask == Mask)
15783 return SDValue();
15784
15785 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15786 BroadcastMask);
15787 }
15788 }
15789
15790 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15791 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15792 return SDValue();
15793
15794 // Bail if we already have a repeated lane shuffle mask.
15795 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15796 return SDValue();
15797
15798 // Helper to look for repeated mask in each split sublane, and that those
15799 // sublanes can then be permuted into place.
15800 auto ShuffleSubLanes = [&](int SubLaneScale) {
15801 int NumSubLanes = NumLanes * SubLaneScale;
15802 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15803
15804 // Check that all the sources are coming from the same lane and see if we
15805 // can form a repeating shuffle mask (local to each sub-lane). At the same
15806 // time, determine the source sub-lane for each destination sub-lane.
15807 int TopSrcSubLane = -1;
15808 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15809 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15810 SubLaneScale,
15811 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15812
15813 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15814 // Extract the sub-lane mask, check that it all comes from the same lane
15815 // and normalize the mask entries to come from the first lane.
15816 int SrcLane = -1;
15817 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15818 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15819 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15820 if (M < 0)
15821 continue;
15822 int Lane = (M % NumElts) / NumLaneElts;
15823 if ((0 <= SrcLane) && (SrcLane != Lane))
15824 return SDValue();
15825 SrcLane = Lane;
15826 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15827 SubLaneMask[Elt] = LocalM;
15828 }
15829
15830 // Whole sub-lane is UNDEF.
15831 if (SrcLane < 0)
15832 continue;
15833
15834 // Attempt to match against the candidate repeated sub-lane masks.
15835 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15836 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15837 for (int i = 0; i != NumSubLaneElts; ++i) {
15838 if (M1[i] < 0 || M2[i] < 0)
15839 continue;
15840 if (M1[i] != M2[i])
15841 return false;
15842 }
15843 return true;
15844 };
15845
15846 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15847 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15848 continue;
15849
15850 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15851 for (int i = 0; i != NumSubLaneElts; ++i) {
15852 int M = SubLaneMask[i];
15853 if (M < 0)
15854 continue;
15855 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15856 "Unexpected mask element");
15857 RepeatedSubLaneMask[i] = M;
15858 }
15859
15860 // Track the topmost source sub-lane - by setting the remaining to
15861 // UNDEF we can greatly simplify shuffle matching.
15862 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15863 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15864 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15865 break;
15866 }
15867
15868 // Bail if we failed to find a matching repeated sub-lane mask.
15869 if (Dst2SrcSubLanes[DstSubLane] < 0)
15870 return SDValue();
15871 }
15872 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15873 "Unexpected source lane");
15874
15875 // Create a repeating shuffle mask for the entire vector.
15876 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15877 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15878 int Lane = SubLane / SubLaneScale;
15879 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15880 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15881 int M = RepeatedSubLaneMask[Elt];
15882 if (M < 0)
15883 continue;
15884 int Idx = (SubLane * NumSubLaneElts) + Elt;
15885 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15886 }
15887 }
15888
15889 // Shuffle each source sub-lane to its destination.
15890 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15891 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15892 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15893 if (SrcSubLane < 0)
15894 continue;
15895 for (int j = 0; j != NumSubLaneElts; ++j)
15896 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15897 }
15898
15899 // Avoid returning the same shuffle operation.
15900 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15901 if (RepeatedMask == Mask || SubLaneMask == Mask)
15902 return SDValue();
15903
15904 SDValue RepeatedShuffle =
15905 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15906
15907 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15908 SubLaneMask);
15909 };
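 // For example (illustrative only): a v8i32 mask <6,7,2,3,6,7,2,3> with
 // SubLaneScale == 2 repeats <2,3> in every referenced 64-bit sub-lane, so it
 // can be lowered as an in-lane shuffle with RepeatedMask <2,3,u,u,6,7,u,u>
 // followed by a sub-lane permute with SubLaneMask <4,5,0,1,4,5,0,1>.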
15910
15911 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15912 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15913 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15914 // Otherwise we can only permute whole 128-bit lanes.
15915 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15916 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15917 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15918 MinSubLaneScale = 2;
15919 MaxSubLaneScale =
15920 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15921 }
15922 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15923 MinSubLaneScale = MaxSubLaneScale = 4;
15924
15925 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15926 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15927 return Shuffle;
15928
15929 return SDValue();
15930}
15931
15932static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15933 bool &ForceV1Zero, bool &ForceV2Zero,
15934 unsigned &ShuffleImm, ArrayRef<int> Mask,
15935 const APInt &Zeroable) {
15936 int NumElts = VT.getVectorNumElements();
15937 assert(VT.getScalarSizeInBits() == 64 &&
15938 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15939 "Unexpected data type for VSHUFPD");
15940 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15941 "Illegal shuffle mask");
15942
15943 bool ZeroLane[2] = { true, true };
15944 for (int i = 0; i < NumElts; ++i)
15945 ZeroLane[i & 1] &= Zeroable[i];
15946
15947 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15948 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
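 // For example (illustrative only): the v4f64 mask <1,4,3,7> selects
 // { V1[1], V2[0], V1[3], V2[3] }, giving SHUFPDMask <1,0,1,1> and an
 // immediate of 0b1101.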
15949 bool IsSHUFPD = true;
15950 bool IsCommutable = true;
15951 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
15952 for (int i = 0; i < NumElts; ++i) {
15953 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15954 continue;
15955 if (Mask[i] < 0)
15956 return false;
15957 int Val = (i & 6) + NumElts * (i & 1);
15958 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15959 if (Mask[i] < Val || Mask[i] > Val + 1)
15960 IsSHUFPD = false;
15961 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15962 IsCommutable = false;
15963 SHUFPDMask[i] = Mask[i] % 2;
15964 }
15965
15966 if (!IsSHUFPD && !IsCommutable)
15967 return false;
15968
15969 if (!IsSHUFPD && IsCommutable)
15970 std::swap(V1, V2);
15971
15972 ForceV1Zero = ZeroLane[0];
15973 ForceV2Zero = ZeroLane[1];
15974 ShuffleImm = getSHUFPDImm(SHUFPDMask);
15975 return true;
15976}
15977
15978static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15979 SDValue V2, ArrayRef<int> Mask,
15980 const APInt &Zeroable,
15981 const X86Subtarget &Subtarget,
15982 SelectionDAG &DAG) {
15983 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15984 "Unexpected data type for VSHUFPD");
15985
15986 unsigned Immediate = 0;
15987 bool ForceV1Zero = false, ForceV2Zero = false;
15988 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15989 Mask, Zeroable))
15990 return SDValue();
15991
15992 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15993 if (ForceV1Zero)
15994 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15995 if (ForceV2Zero)
15996 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15997
15998 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15999 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16000}
16001
16002// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16003// by zeroable elements in the remaining 24 elements. Turn this into two
16004// vmovqb instructions shuffled together.
16005static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16006 SDValue V1, SDValue V2,
16007 ArrayRef<int> Mask,
16008 const APInt &Zeroable,
16009 SelectionDAG &DAG) {
16010 assert(VT == MVT::v32i8 && "Unexpected type!");
16011
16012 // The first 8 indices should be every 8th element.
16013 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16014 return SDValue();
16015
16016 // Remaining elements need to be zeroable.
16017 if (Zeroable.countl_one() < (Mask.size() - 8))
16018 return SDValue();
16019
16020 V1 = DAG.getBitcast(MVT::v4i64, V1);
16021 V2 = DAG.getBitcast(MVT::v4i64, V2);
16022
16023 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16024 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16025
16026 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16027 // the upper bits of the result using an unpckldq.
16028 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16029 { 0, 1, 2, 3, 16, 17, 18, 19,
16030 4, 5, 6, 7, 20, 21, 22, 23 });
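 // i.e. the low 16 bytes become { V1 bytes 0..3, V2 bytes 0..3, 8 zero bytes },
 // since each VTRUNC above leaves only its first 4 truncated bytes non-zero.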
16031 // Insert the unpckldq into a zero vector to widen to v32i8.
16032 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16033 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16034 DAG.getVectorIdxConstant(0, DL));
16035}
16036
16037// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16038// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16039// =>
16040// ul = unpckl v1, v2
16041// uh = unpckh v1, v2
16042// a = vperm ul, uh
16043// b = vperm ul, uh
16044//
16045// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16046// and permute. We cannot directly match v3 because it is split into two
16047// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16048// pair of 256-bit shuffles and makes sure the masks are consecutive.
16049//
16050// Once unpck and permute nodes are created, the permute corresponding to this
16051// shuffle is returned, while the other permute replaces the other half of the
16052// shuffle in the selection dag.
16053static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16054 SDValue V1, SDValue V2,
16055 ArrayRef<int> Mask,
16056 SelectionDAG &DAG) {
16057 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16058 VT != MVT::v32i8)
16059 return SDValue();
16060 // <B0, B1, B0+1, B1+1, ..., >
16061 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16062 unsigned Begin1) {
16063 size_t Size = Mask.size();
16064 assert(Size % 2 == 0 && "Expected even mask size");
16065 for (unsigned I = 0; I < Size; I += 2) {
16066 if (Mask[I] != (int)(Begin0 + I / 2) ||
16067 Mask[I + 1] != (int)(Begin1 + I / 2))
16068 return false;
16069 }
16070 return true;
16071 };
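 // For example (illustrative only): for v8i32 the two halves of the interleave
 // are <0,8,1,9,2,10,3,11> (Begin0 = 0, Begin1 = NumElts) and
 // <4,12,5,13,6,14,7,15> (Begin0 = FirstQtr, Begin1 = ThirdQtr).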
16072 // Check which half this shuffle node is
16073 int NumElts = VT.getVectorNumElements();
16074 size_t FirstQtr = NumElts / 2;
16075 size_t ThirdQtr = NumElts + NumElts / 2;
16076 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16077 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16078 if (!IsFirstHalf && !IsSecondHalf)
16079 return SDValue();
16080
16081 // Find the intersection between shuffle users of V1 and V2.
16082 SmallVector<SDNode *, 2> Shuffles;
16083 for (SDNode *User : V1->users())
16084 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16085 User->getOperand(1) == V2)
16086 Shuffles.push_back(User);
16087 // Limit user size to two for now.
16088 if (Shuffles.size() != 2)
16089 return SDValue();
16090 // Find out which half of the 512-bit shuffle each smaller shuffle is
16091 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16092 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16093 SDNode *FirstHalf;
16094 SDNode *SecondHalf;
16095 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16096 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16097 FirstHalf = Shuffles[0];
16098 SecondHalf = Shuffles[1];
16099 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16100 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16101 FirstHalf = Shuffles[1];
16102 SecondHalf = Shuffles[0];
16103 } else {
16104 return SDValue();
16105 }
16106 // Lower into unpck and perm. Return the perm of this shuffle and replace
16107 // the other.
16108 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16109 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16110 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16111 DAG.getTargetConstant(0x20, DL, MVT::i8));
16112 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16113 DAG.getTargetConstant(0x31, DL, MVT::i8));
16114 if (IsFirstHalf) {
16115 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16116 return Perm1;
16117 }
16118 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16119 return Perm2;
16120}
16121
16122/// Handle lowering of 4-lane 64-bit floating point shuffles.
16123///
16124/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16125/// isn't available.
16126static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16127 const APInt &Zeroable, SDValue V1, SDValue V2,
16128 const X86Subtarget &Subtarget,
16129 SelectionDAG &DAG) {
16130 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16131 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16132 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16133
16134 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16135 Subtarget, DAG))
16136 return V;
16137
16138 if (V2.isUndef()) {
16139 // Check for being able to broadcast a single element.
16140 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16141 Mask, Subtarget, DAG))
16142 return Broadcast;
16143
16144 // Use low duplicate instructions for masks that match their pattern.
16145 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16146 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16147
16148 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16149 // Non-half-crossing single input shuffles can be lowered with an
16150 // interleaved permutation.
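 // For example (illustrative only): the v4f64 mask <1,0,3,2> sets bits 0
 // and 2 only, giving a VPERMILPD immediate of 0b0101.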
16151 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16152 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16153 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16154 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16155 }
16156
16157 // With AVX2 we have direct support for this permutation.
16158 if (Subtarget.hasAVX2())
16159 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16160 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16161
16162 // Try to create an in-lane repeating shuffle mask and then shuffle the
16163 // results into the target lanes.
16164 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16165 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16166 return V;
16167
16168 // Try to permute the lanes and then use a per-lane permute.
16169 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16170 Mask, DAG, Subtarget))
16171 return V;
16172
16173 // Otherwise, fall back.
16174 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16175 DAG, Subtarget);
16176 }
16177
16178 // Use dedicated unpack instructions for masks that match their pattern.
16179 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16180 return V;
16181
16182 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16183 Zeroable, Subtarget, DAG))
16184 return Blend;
16185
16186 // Check if the blend happens to exactly fit that of SHUFPD.
16187 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16188 Zeroable, Subtarget, DAG))
16189 return Op;
16190
16191 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16192 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16193
16194 // If we have lane crossing shuffles AND they don't all come from the lower
16195 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16196 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16197 // canonicalize to a blend of splat which isn't necessary for this combine.
16198 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16199 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16200 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16201 (V2.getOpcode() != ISD::BUILD_VECTOR))
16202 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16203
16204 // If we have one input in place, then we can permute the other input and
16205 // blend the result.
16206 if (V1IsInPlace || V2IsInPlace)
16207 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16208 Zeroable, Subtarget, DAG);
16209
16210 // Try to create an in-lane repeating shuffle mask and then shuffle the
16211 // results into the target lanes.
16212 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16213 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16214 return V;
16215
16216 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16217 // shuffle. However, if we have AVX2 and either input is already in place,
16218 // we will be able to shuffle the other input even across lanes in a single
16219 // instruction, so skip this pattern.
16220 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16221 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16222 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16223 return V;
16224
16225 // If we have VLX support, we can use VEXPAND.
16226 if (Subtarget.hasVLX())
16227 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16228 Zeroable, Subtarget, DAG))
16229 return V;
16230
16231 // If we have AVX2 then we always want to lower with a blend because at v4 we
16232 // can fully permute the elements.
16233 if (Subtarget.hasAVX2())
16234 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16235 Zeroable, Subtarget, DAG);
16236
16237 // Otherwise fall back on generic lowering.
16238 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16239 Subtarget, DAG);
16240}
16241
16242/// Handle lowering of 4-lane 64-bit integer shuffles.
16243///
16244/// This routine is only called when we have AVX2 and thus a reasonable
16245/// instruction set for v4i64 shuffling.
16246static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16247 const APInt &Zeroable, SDValue V1, SDValue V2,
16248 const X86Subtarget &Subtarget,
16249 SelectionDAG &DAG) {
16250 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16251 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16252 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16253 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16254
16255 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16256 Subtarget, DAG))
16257 return V;
16258
16259 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16260 Zeroable, Subtarget, DAG))
16261 return Blend;
16262
16263 // Check for being able to broadcast a single element.
16264 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16265 Subtarget, DAG))
16266 return Broadcast;
16267
16268 // Try to use shift instructions if fast.
16269 if (Subtarget.preferLowerShuffleAsShift())
16270 if (SDValue Shift =
16271 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16272 Subtarget, DAG, /*BitwiseOnly*/ true))
16273 return Shift;
16274
16275 if (V2.isUndef()) {
16276 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16277 // can use lower latency instructions that will operate on both lanes.
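 // For example (illustrative only): the v4i64 mask <1,0,3,2> repeats <1,0> in
 // both lanes and becomes the v8i32 PSHUFD mask <2,3,0,1> below.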
16278 SmallVector<int, 2> RepeatedMask;
16279 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16280 SmallVector<int, 4> PSHUFDMask;
16281 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16282 return DAG.getBitcast(
16283 MVT::v4i64,
16284 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16285 DAG.getBitcast(MVT::v8i32, V1),
16286 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16287 }
16288
16289 // AVX2 provides a direct instruction for permuting a single input across
16290 // lanes.
16291 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16292 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16293 }
16294
16295 // Try to use shift instructions.
16296 if (SDValue Shift =
16297 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16298 DAG, /*BitwiseOnly*/ false))
16299 return Shift;
16300
16301 // If we have VLX support, we can use VALIGN or VEXPAND.
16302 if (Subtarget.hasVLX()) {
16303 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16304 Zeroable, Subtarget, DAG))
16305 return Rotate;
16306
16307 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16308 Zeroable, Subtarget, DAG))
16309 return V;
16310 }
16311
16312 // Try to use PALIGNR.
16313 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16314 Subtarget, DAG))
16315 return Rotate;
16316
16317 // Use dedicated unpack instructions for masks that match their pattern.
16318 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16319 return V;
16320
16321 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16322 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16323
16324 // If we have one input in place, then we can permute the other input and
16325 // blend the result.
16326 if (V1IsInPlace || V2IsInPlace)
16327 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16328 Zeroable, Subtarget, DAG);
16329
16330 // Try to create an in-lane repeating shuffle mask and then shuffle the
16331 // results into the target lanes.
16332 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16333 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16334 return V;
16335
16336 // Try to lower to PERMQ(BLENDD(V1,V2)).
16337 if (SDValue V =
16338 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16339 return V;
16340
16341 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16342 // shuffle. However, if we have AVX2 and either input is already in place,
16343 // we will be able to shuffle the other input even across lanes in a single
16344 // instruction, so skip this pattern.
16345 if (!V1IsInPlace && !V2IsInPlace)
16346 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16347 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16348 return Result;
16349
16350 // Otherwise fall back on generic blend lowering.
16351 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16352 Zeroable, Subtarget, DAG);
16353}
16354
16355/// Handle lowering of 8-lane 32-bit floating point shuffles.
16356///
16357/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16358/// isn't available.
16359static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16360 const APInt &Zeroable, SDValue V1, SDValue V2,
16361 const X86Subtarget &Subtarget,
16362 SelectionDAG &DAG) {
16363 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16364 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16365 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16366
16367 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16368 Zeroable, Subtarget, DAG))
16369 return Blend;
16370
16371 // Check for being able to broadcast a single element.
16372 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16373 Subtarget, DAG))
16374 return Broadcast;
16375
16376 if (!Subtarget.hasAVX2()) {
16377 SmallVector<int> InLaneMask;
16378 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16379
16380 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16381 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16382 /*SimpleOnly*/ true))
16383 return R;
16384 }
16385 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16386 Zeroable, Subtarget, DAG))
16387 return DAG.getBitcast(MVT::v8f32, ZExt);
16388
16389 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16390 // options to efficiently lower the shuffle.
16391 SmallVector<int, 4> RepeatedMask;
16392 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16393 assert(RepeatedMask.size() == 4 &&
16394 "Repeated masks must be half the mask width!");
16395
16396 // Use even/odd duplicate instructions for masks that match their pattern.
16397 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16398 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16399 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16400 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16401
16402 if (V2.isUndef())
16403 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16404 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16405
16406 // Use dedicated unpack instructions for masks that match their pattern.
16407 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16408 return V;
16409
16410 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16411 // have already handled any direct blends.
16412 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16413 }
16414
16415 // Try to create an in-lane repeating shuffle mask and then shuffle the
16416 // results into the target lanes.
16417 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16418 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16419 return V;
16420
16421 // If we have a single input shuffle with different shuffle patterns in the
16422 // two 128-bit lanes use the variable mask to VPERMILPS.
16423 if (V2.isUndef()) {
16424 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16425 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16426 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16427 }
16428 if (Subtarget.hasAVX2()) {
16429 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16430 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16431 }
16432 // Otherwise, fall back.
16433 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16434 DAG, Subtarget);
16435 }
16436
16437 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16438 // shuffle.
16439 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16440 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16441 return Result;
16442
16443 // If we have VLX support, we can use VEXPAND.
16444 if (Subtarget.hasVLX())
16445 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16446 Zeroable, Subtarget, DAG))
16447 return V;
16448
16449 // Try to match an interleave of two v8f32s and lower them as unpck and
16450 // permutes using ymms. This needs to go before we try to split the vectors.
16451 //
16452 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
16453 // this path inadvertently.
16454 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16455 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16456 Mask, DAG))
16457 return V;
16458
16459 // For non-AVX512, if the mask uses 16-bit elements within each lane, try to
16460 // split, since after splitting we get more efficient code using vpunpcklwd
16461 // and vpunpckhwd than with vblend.
16462 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16463 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16464 Subtarget, DAG);
16465
16466 // If we have AVX2 then we always want to lower with a blend because at v8 we
16467 // can fully permute the elements.
16468 if (Subtarget.hasAVX2())
16469 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16470 Zeroable, Subtarget, DAG);
16471
16472 // Otherwise fall back on generic lowering.
16473 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16474 Subtarget, DAG);
16475}
16476
16477/// Handle lowering of 8-lane 32-bit integer shuffles.
16478///
16479/// This routine is only called when we have AVX2 and thus a reasonable
16480/// instruction set for v8i32 shuffling.
16481static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16482 const APInt &Zeroable, SDValue V1, SDValue V2,
16483 const X86Subtarget &Subtarget,
16484 SelectionDAG &DAG) {
16485 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16486 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16487 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16488 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16489
16490 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16491
16492 // Whenever we can lower this as a zext, that instruction is strictly faster
16493 // than any alternative. It also allows us to fold memory operands into the
16494 // shuffle in many cases.
16495 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16496 Zeroable, Subtarget, DAG))
16497 return ZExt;
16498
16499 // Try to match an interleave of two v8i32s and lower them as unpck and
16500 // permutes using ymms. This needs to go before we try to split the vectors.
16501 if (!Subtarget.hasAVX512())
16502 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16503 Mask, DAG))
16504 return V;
16505
16506 // For non-AVX512, if the mask uses 16-bit elements within each lane, try to
16507 // split, since after splitting we get more efficient code than vblend by
16508 // using vpunpcklwd and vpunpckhwd.
16509 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16510 !Subtarget.hasAVX512())
16511 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16512 Subtarget, DAG);
16513
16514 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16515 Zeroable, Subtarget, DAG))
16516 return Blend;
16517
16518 // Check for being able to broadcast a single element.
16519 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16520 Subtarget, DAG))
16521 return Broadcast;
16522
16523 // Try to use shift instructions if fast.
16524 if (Subtarget.preferLowerShuffleAsShift()) {
16525 if (SDValue Shift =
16526 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16527 Subtarget, DAG, /*BitwiseOnly*/ true))
16528 return Shift;
16529 if (NumV2Elements == 0)
16530 if (SDValue Rotate =
16531 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16532 return Rotate;
16533 }
16534
16535 // If the shuffle mask is repeated in each 128-bit lane we can use more
16536 // efficient instructions that mirror the shuffles across the two 128-bit
16537 // lanes.
16538 SmallVector<int, 4> RepeatedMask;
16539 bool Is128BitLaneRepeatedShuffle =
16540 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16541 if (Is128BitLaneRepeatedShuffle) {
16542 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16543 if (V2.isUndef())
16544 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16545 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16546
16547 // Use dedicated unpack instructions for masks that match their pattern.
16548 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16549 return V;
16550 }
16551
16552 // Try to use shift instructions.
16553 if (SDValue Shift =
16554 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16555 DAG, /*BitwiseOnly*/ false))
16556 return Shift;
16557
16558 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16559 if (SDValue Rotate =
16560 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16561 return Rotate;
16562
16563 // If we have VLX support, we can use VALIGN or EXPAND.
16564 if (Subtarget.hasVLX()) {
16565 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16566 Zeroable, Subtarget, DAG))
16567 return Rotate;
16568
16569 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16570 Zeroable, Subtarget, DAG))
16571 return V;
16572 }
16573
16574 // Try to use byte rotation instructions.
16575 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16576 Subtarget, DAG))
16577 return Rotate;
16578
16579 // Try to create an in-lane repeating shuffle mask and then shuffle the
16580 // results into the target lanes.
16581 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16582 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16583 return V;
16584
16585 if (V2.isUndef()) {
16586 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16587 // because that should be faster than the variable permute alternatives.
16588 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16589 return V;
16590
16591 // If the shuffle patterns aren't repeated but it's a single input, directly
16592 // generate a cross-lane VPERMD instruction.
16593 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16594 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16595 }
16596
16597 // Assume that a single SHUFPS is faster than an alternative sequence of
16598 // multiple instructions (even if the CPU has a domain penalty).
16599 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16600 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16601 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16602 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16603 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16604 CastV1, CastV2, DAG);
16605 return DAG.getBitcast(MVT::v8i32, ShufPS);
16606 }
16607
16608 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16609 // shuffle.
16610 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16611 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16612 return Result;
16613
16614 // Otherwise fall back on generic blend lowering.
16615 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16616 Zeroable, Subtarget, DAG);
16617}
16618
16619/// Handle lowering of 16-lane 16-bit integer shuffles.
16620///
16621/// This routine is only called when we have AVX2 and thus a reasonable
16622/// instruction set for v16i16 shuffling.
16623static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16624 const APInt &Zeroable, SDValue V1, SDValue V2,
16625 const X86Subtarget &Subtarget,
16626 SelectionDAG &DAG) {
16627 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16628 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16629 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16630 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16631
16632 // Whenever we can lower this as a zext, that instruction is strictly faster
16633 // than any alternative. It also allows us to fold memory operands into the
16634 // shuffle in many cases.
16635 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16636 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16637 return ZExt;
16638
16639 // Check for being able to broadcast a single element.
16640 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16641 Subtarget, DAG))
16642 return Broadcast;
16643
16644 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16645 Zeroable, Subtarget, DAG))
16646 return Blend;
16647
16648 // Use dedicated unpack instructions for masks that match their pattern.
16649 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16650 return V;
16651
16652 // Use dedicated pack instructions for masks that match their pattern.
16653 if (SDValue V =
16654 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16655 return V;
16656
16657 // Try to lower using a truncation.
16658 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16659 Subtarget, DAG))
16660 return V;
16661
16662 // Try to use shift instructions.
16663 if (SDValue Shift =
16664 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16665 Subtarget, DAG, /*BitwiseOnly*/ false))
16666 return Shift;
16667
16668 // Try to use byte rotation instructions.
16669 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16670 Subtarget, DAG))
16671 return Rotate;
16672
16673 // Try to create an in-lane repeating shuffle mask and then shuffle the
16674 // results into the target lanes.
16675 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16676 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16677 return V;
16678
16679 if (V2.isUndef()) {
16680 // Try to use bit rotation instructions.
16681 if (SDValue Rotate =
16682 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16683 return Rotate;
16684
16685 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16686 // because that should be faster than the variable permute alternatives.
16687 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
16688 return V;
16689
16690 // There are no generalized cross-lane shuffle operations available on i16
16691 // element types.
16692 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16693 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16694 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16695 return V;
16696
16697 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16698 DAG, Subtarget);
16699 }
16700
16701 SmallVector<int, 8> RepeatedMask;
16702 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16703 // As this is a single-input shuffle, the repeated mask should be
16704 // a strictly valid v8i16 mask that we can pass through to the v8i16
16705 // lowering to handle even the v16 case.
16706 return lowerV8I16GeneralSingleInputShuffle(
16707 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16708 }
16709 }
16710
16711 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16712 Zeroable, Subtarget, DAG))
16713 return PSHUFB;
16714
16715 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16716 if (Subtarget.hasBWI())
16717 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16718
16719 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16720 // shuffle.
16721 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16722 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16723 return Result;
16724
16725 // Try to permute the lanes and then use a per-lane permute.
16726 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16727 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16728 return V;
16729
16730 // Try to match an interleave of two v16i16s and lower them as unpck and
16731 // permutes using ymms.
16732 if (!Subtarget.hasAVX512())
16733 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16734 Mask, DAG))
16735 return V;
16736
16737 // Otherwise fall back on generic lowering.
16738 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16739 Subtarget, DAG);
16740}
16741
16742/// Handle lowering of 32-lane 8-bit integer shuffles.
16743///
16744/// This routine is only called when we have AVX2 and thus a reasonable
16745/// instruction set for v32i8 shuffling.
16746static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16747 const APInt &Zeroable, SDValue V1, SDValue V2,
16748 const X86Subtarget &Subtarget,
16749 SelectionDAG &DAG) {
16750 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16751 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16752 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16753 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16754
16755 // Whenever we can lower this as a zext, that instruction is strictly faster
16756 // than any alternative. It also allows us to fold memory operands into the
16757 // shuffle in many cases.
16758 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16759 Zeroable, Subtarget, DAG))
16760 return ZExt;
16761
16762 // Check for being able to broadcast a single element.
16763 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16764 Subtarget, DAG))
16765 return Broadcast;
16766
16767 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16768 Zeroable, Subtarget, DAG))
16769 return Blend;
16770
16771 // Use dedicated unpack instructions for masks that match their pattern.
16772 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
16773 return V;
16774
16775 // Use dedicated pack instructions for masks that match their pattern.
16776 if (SDValue V =
16777 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16778 return V;
16779
16780 // Try to lower using a truncation.
16781 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16782 Subtarget, DAG))
16783 return V;
16784
16785 // Try to use shift instructions.
16786 if (SDValue Shift =
16787 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16788 DAG, /*BitwiseOnly*/ false))
16789 return Shift;
16790
16791 // Try to use byte rotation instructions.
16792 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16793 Subtarget, DAG))
16794 return Rotate;
16795
16796 // Try to use bit rotation instructions.
16797 if (V2.isUndef())
16798 if (SDValue Rotate =
16799 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16800 return Rotate;
16801
16802 // Try to create an in-lane repeating shuffle mask and then shuffle the
16803 // results into the target lanes.
16804 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16805 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16806 return V;
16807
16808 // There are no generalized cross-lane shuffle operations available on i8
16809 // element types.
16810 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16811 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16812 // because that should be faster than the variable permute alternatives.
16813 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
16814 return V;
16815
16816 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16817 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16818 return V;
16819
16820 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16821 DAG, Subtarget);
16822 }
16823
16824 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16825 Zeroable, Subtarget, DAG))
16826 return PSHUFB;
16827
16828 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16829 if (Subtarget.hasVBMI())
16830 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16831
16832 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16833 // shuffle.
16834 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16835 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16836 return Result;
16837
16838 // Try to permute the lanes and then use a per-lane permute.
16839 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16840 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16841 return V;
16842
16843 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16844 // by zeroable elements in the remaining 24 elements. Turn this into two
16845 // vmovqb instructions shuffled together.
16846 if (Subtarget.hasVLX())
16847 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16848 Mask, Zeroable, DAG))
16849 return V;
16850
16851 // Try to match an interleave of two v32i8s and lower them as unpck and
16852 // permutes using ymms.
16853 if (!Subtarget.hasAVX512())
16854 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16855 Mask, DAG))
16856 return V;
16857
16858 // Otherwise fall back on generic lowering.
16859 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16860 Subtarget, DAG);
16861}
16862
16863/// High-level routine to lower various 256-bit x86 vector shuffles.
16864///
16865/// This routine either breaks down the specific type of a 256-bit x86 vector
16866/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16867/// together based on the available instructions.
16868static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16869 SDValue V1, SDValue V2, const APInt &Zeroable,
16870 const X86Subtarget &Subtarget,
16871 SelectionDAG &DAG) {
16872 // If we have a single input to the zero element, insert that into V1 if we
16873 // can do so cheaply.
16874 int NumElts = VT.getVectorNumElements();
16875 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16876
16877 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16878 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16879 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16880 return Insertion;
16881
16882 // Handle special cases where the lower or upper half is UNDEF.
16883 if (SDValue V =
16884 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16885 return V;
16886
16887 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16888 // can check for those subtargets here and avoid much of the subtarget
16889 // querying in the per-vector-type lowering routines. With AVX1 we have
16890 // essentially *zero* ability to manipulate a 256-bit vector with integer
16891 // types. Since we'll use floating point types there eventually, just
16892 // immediately cast everything to a float and operate entirely in that domain.
16893 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16894 int ElementBits = VT.getScalarSizeInBits();
16895 if (ElementBits < 32) {
16896 // No floating point type available, if we can't use the bit operations
16897 // for masking/blending then decompose into 128-bit vectors.
16898 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16899 Subtarget, DAG))
16900 return V;
16901 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16902 return V;
16903 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16904 }
16905
16906 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16907 VT.getVectorNumElements());
16908 V1 = DAG.getBitcast(FpVT, V1);
16909 V2 = DAG.getBitcast(FpVT, V2);
16910 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16911 }
16912
16913 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16914 V1 = DAG.getBitcast(MVT::v16i16, V1);
16915 V2 = DAG.getBitcast(MVT::v16i16, V2);
16916 return DAG.getBitcast(VT,
16917 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16918 }
16919
16920 switch (VT.SimpleTy) {
16921 case MVT::v4f64:
16922 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16923 case MVT::v4i64:
16924 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16925 case MVT::v8f32:
16926 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16927 case MVT::v8i32:
16928 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16929 case MVT::v16i16:
16930 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16931 case MVT::v32i8:
16932 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16933
16934 default:
16935 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16936 }
16937}
16938
16939/// Try to lower a vector shuffle as a 128-bit shuffles.
16940static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16941 const APInt &Zeroable, SDValue V1, SDValue V2,
16942 const X86Subtarget &Subtarget,
16943 SelectionDAG &DAG) {
16944 assert(VT.getScalarSizeInBits() == 64 &&
16945 "Unexpected element type size for 128bit shuffle.");
16946
16947 // Handling a 256-bit vector requires VLX, and most probably
16948 // lowerV2X128VectorShuffle() is a better solution for that case.
16949 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16950
16951 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16952 SmallVector<int, 4> Widened128Mask;
16953 if (!canWidenShuffleElements(Mask, Widened128Mask))
16954 return SDValue();
16955 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
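 // For example (illustrative only): a v8f64 mask <0,1,2,3,8,9,10,11> widens to
 // the 128-bit-lane mask <0,1,4,5>, i.e. the low 256 bits of V1 followed by
 // the low 256 bits of V2.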
16956
16957 // Try to use an insert into a zero vector.
16958 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16959 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16960 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16961 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16962 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16963 DAG.getVectorIdxConstant(0, DL));
16964 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16965 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16966 DAG.getVectorIdxConstant(0, DL));
16967 }
16968
16969 // Check for patterns which can be matched with a single insert of a 256-bit
16970 // subvector.
16971 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16972 if (OnlyUsesV1 ||
16973 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16974 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16975 SDValue SubVec =
16976 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16977 DAG.getVectorIdxConstant(0, DL));
16978 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16979 DAG.getVectorIdxConstant(4, DL));
16980 }
16981
16982 // See if this is an insertion of the lower 128-bits of V2 into V1.
16983 bool IsInsert = true;
16984 int V2Index = -1;
16985 for (int i = 0; i < 4; ++i) {
16986 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16987 if (Widened128Mask[i] < 0)
16988 continue;
16989
16990 // Make sure all V1 subvectors are in place.
16991 if (Widened128Mask[i] < 4) {
16992 if (Widened128Mask[i] != i) {
16993 IsInsert = false;
16994 break;
16995 }
16996 } else {
16997 // Make sure we only have a single V2 index and that it is the lowest 128 bits.
16998 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16999 IsInsert = false;
17000 break;
17001 }
17002 V2Index = i;
17003 }
17004 }
17005 if (IsInsert && V2Index >= 0) {
17006 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17007 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17008 DAG.getVectorIdxConstant(0, DL));
17009 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17010 }
17011
17012 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
17013 // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17014 // possible we at least ensure the lanes stay sequential to help later
17015 // combines.
17016 SmallVector<int, 2> Widened256Mask;
17017 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17018 Widened128Mask.clear();
17019 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17020 }
17021
17022 // Try to lower to vshuf64x2/vshuf32x4.
17023 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17024 int PermMask[4] = {-1, -1, -1, -1};
17025 // Ensure elements came from the same Op.
17026 for (int i = 0; i < 4; ++i) {
17027 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17028 if (Widened128Mask[i] < 0)
17029 continue;
17030
17031 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17032 unsigned OpIndex = i / 2;
17033 if (Ops[OpIndex].isUndef())
17034 Ops[OpIndex] = Op;
17035 else if (Ops[OpIndex] != Op)
17036 return SDValue();
17037
17038 PermMask[i] = Widened128Mask[i] % 4;
17039 }
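 // For example (illustrative only): a widened 128-bit-lane mask of <1,0,7,6>
 // takes lanes 1,0 from V1 and lanes 3,2 from V2, so Ops becomes {V1, V2} and
 // PermMask becomes <1,0,3,2> for the SHUF128 node below.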
17040
17041 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17042 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17043}
17044
17045/// Handle lowering of 8-lane 64-bit floating point shuffles.
17046static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17047 const APInt &Zeroable, SDValue V1, SDValue V2,
17048 const X86Subtarget &Subtarget,
17049 SelectionDAG &DAG) {
17050 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17051 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17052 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17053
17054 if (V2.isUndef()) {
17055 // Use low duplicate instructions for masks that match their pattern.
17056 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17057 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17058
17059 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17060 // Non-half-crossing single input shuffles can be lowered with an
17061 // interleaved permutation.
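 // For example (illustrative only): the v8f64 mask <1,0,3,2,5,4,7,6> swaps the
 // elements within every 128-bit lane and yields a VPERMILPD immediate of 0x55.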
17062 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17063 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17064 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17065 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17066 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17067 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17068 }
17069
17070 SmallVector<int, 4> RepeatedMask;
17071 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17072 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17073 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17074 }
17075
17076 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17077 V2, Subtarget, DAG))
17078 return Shuf128;
17079
17080 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17081 return Unpck;
17082
17083 // Check if the blend happens to exactly fit that of SHUFPD.
17084 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17085 Zeroable, Subtarget, DAG))
17086 return Op;
17087
17088 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17089 Subtarget, DAG))
17090 return V;
17091
17092 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17093 Zeroable, Subtarget, DAG))
17094 return Blend;
17095
17096 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17097}
17098
17099/// Handle lowering of 16-lane 32-bit floating point shuffles.
17100static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17101 const APInt &Zeroable, SDValue V1, SDValue V2,
17102 const X86Subtarget &Subtarget,
17103 SelectionDAG &DAG) {
17104 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17105 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17106 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17107
17108 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17109 // options to efficiently lower the shuffle.
17110 SmallVector<int, 4> RepeatedMask;
17111 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17112 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17113
17114 // Use even/odd duplicate instructions for masks that match their pattern.
17115 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17116 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17117 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17118 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17119
17120 if (V2.isUndef())
17121 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17122 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17123
17124 // Use dedicated unpack instructions for masks that match their pattern.
17125 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17126 return V;
17127
17128 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17129 Zeroable, Subtarget, DAG))
17130 return Blend;
17131
17132 // Otherwise, fall back to a SHUFPS sequence.
17133 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17134 }
17135
17136 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17137 Zeroable, Subtarget, DAG))
17138 return Blend;
17139
17140 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17141 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17142 return DAG.getBitcast(MVT::v16f32, ZExt);
17143
17144 // Try to create an in-lane repeating shuffle mask and then shuffle the
17145 // results into the target lanes.
17146 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17147 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17148 return V;
17149
17150 // If we have a single input shuffle with different shuffle patterns in the
17151 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17152 if (V2.isUndef() &&
17153 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17154 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17155 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17156 }
17157
17158 // If we have AVX512F support, we can use VEXPAND.
17159 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17160 Zeroable, Subtarget, DAG))
17161 return V;
17162
17163 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17164}
17165
17166/// Handle lowering of 8-lane 64-bit integer shuffles.
17167static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17168 const APInt &Zeroable, SDValue V1, SDValue V2,
17169 const X86Subtarget &Subtarget,
17170 SelectionDAG &DAG) {
17171 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17172 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17173 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17174
17175 // Try to use shift instructions if fast.
17176 if (Subtarget.preferLowerShuffleAsShift())
17177 if (SDValue Shift =
17178 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17179 Subtarget, DAG, /*BitwiseOnly*/ true))
17180 return Shift;
17181
17182 if (V2.isUndef()) {
17183 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17184 // can use lower latency instructions that will operate on all four
17185 // 128-bit lanes.
17186 SmallVector<int, 2> Repeated128Mask;
17187 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17188 SmallVector<int, 4> PSHUFDMask;
17189 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17190 return DAG.getBitcast(
17191 MVT::v8i64,
17192 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17193 DAG.getBitcast(MVT::v16i32, V1),
17194 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17195 }
17196
17197 SmallVector<int, 4> Repeated256Mask;
17198 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17199 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17200 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17201 }
17202
17203 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17204 V2, Subtarget, DAG))
17205 return Shuf128;
17206
17207 // Try to use shift instructions.
17208 if (SDValue Shift =
17209 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17210 DAG, /*BitwiseOnly*/ false))
17211 return Shift;
17212
17213 // Try to use VALIGN.
17214 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17215 Zeroable, Subtarget, DAG))
17216 return Rotate;
17217
17218 // Try to use PALIGNR.
17219 if (Subtarget.hasBWI())
17220 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17221 Subtarget, DAG))
17222 return Rotate;
17223
17224 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17225 return Unpck;
17226
17227 // If we have AVX512F support, we can use VEXPAND.
17228 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17229 Subtarget, DAG))
17230 return V;
17231
17232 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17233 Zeroable, Subtarget, DAG))
17234 return Blend;
17235
17236 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17237}
17238
17239/// Handle lowering of 16-lane 32-bit integer shuffles.
17240static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17241 const APInt &Zeroable, SDValue V1, SDValue V2,
17242 const X86Subtarget &Subtarget,
17243 SelectionDAG &DAG) {
17244 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17245 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17246 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17247
17248 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17249
17250 // Whenever we can lower this as a zext, that instruction is strictly faster
17251 // than any alternative. It also allows us to fold memory operands into the
17252 // shuffle in many cases.
17253 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17254 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17255 return ZExt;
17256
17257 // Try to use shift instructions if fast.
17258 if (Subtarget.preferLowerShuffleAsShift()) {
17259 if (SDValue Shift =
17260 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17261 Subtarget, DAG, /*BitwiseOnly*/ true))
17262 return Shift;
17263 if (NumV2Elements == 0)
17264 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17265 Subtarget, DAG))
17266 return Rotate;
17267 }
17268
17269 // If the shuffle mask is repeated in each 128-bit lane we can use more
17270 // efficient instructions that mirror the shuffles across the four 128-bit
17271 // lanes.
17272 SmallVector<int, 4> RepeatedMask;
17273 bool Is128BitLaneRepeatedShuffle =
17274 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17275 if (Is128BitLaneRepeatedShuffle) {
17276 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17277 if (V2.isUndef())
17278 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17279 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17280
17281 // Use dedicated unpack instructions for masks that match their pattern.
17282 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17283 return V;
17284 }
17285
17286 // Try to use shift instructions.
17287 if (SDValue Shift =
17288 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17289 Subtarget, DAG, /*BitwiseOnly*/ false))
17290 return Shift;
17291
17292 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17293 if (SDValue Rotate =
17294 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17295 return Rotate;
17296
17297 // Try to use VALIGN.
17298 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17299 Zeroable, Subtarget, DAG))
17300 return Rotate;
17301
17302 // Try to use byte rotation instructions.
17303 if (Subtarget.hasBWI())
17304 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17305 Subtarget, DAG))
17306 return Rotate;
17307
17308 // Assume that a single SHUFPS is faster than using a permv shuffle.
17309 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17310 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17311 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17312 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17313 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17314 CastV1, CastV2, DAG);
17315 return DAG.getBitcast(MVT::v16i32, ShufPS);
17316 }
17317
17318 // Try to create an in-lane repeating shuffle mask and then shuffle the
17319 // results into the target lanes.
17320 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17321 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17322 return V;
17323
17324 // If we have AVX512F support, we can use VEXPAND.
17325 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17326 Zeroable, Subtarget, DAG))
17327 return V;
17328
17329 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17330 Zeroable, Subtarget, DAG))
17331 return Blend;
17332
17333 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17334}
17335
17336/// Handle lowering of 32-lane 16-bit integer shuffles.
17337static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17338 const APInt &Zeroable, SDValue V1, SDValue V2,
17339 const X86Subtarget &Subtarget,
17340 SelectionDAG &DAG) {
17341 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17342 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17343 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17344 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17345
17346 // Whenever we can lower this as a zext, that instruction is strictly faster
17347 // than any alternative. It also allows us to fold memory operands into the
17348 // shuffle in many cases.
17349 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17350 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17351 return ZExt;
17352
17353 // Use dedicated unpack instructions for masks that match their pattern.
17354 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17355 return V;
17356
17357 // Use dedicated pack instructions for masks that match their pattern.
17358 if (SDValue V =
17359 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17360 return V;
17361
17362 // Try to use shift instructions.
17363 if (SDValue Shift =
17364 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17365 Subtarget, DAG, /*BitwiseOnly*/ false))
17366 return Shift;
17367
17368 // Try to use byte rotation instructions.
17369 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17370 Subtarget, DAG))
17371 return Rotate;
17372
17373 if (V2.isUndef()) {
17374 // Try to use bit rotation instructions.
17375 if (SDValue Rotate =
17376 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17377 return Rotate;
17378
17379 SmallVector<int, 8> RepeatedMask;
17380 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17381 // As this is a single-input shuffle, the repeated mask should be
17382 // a strictly valid v8i16 mask that we can pass through to the v8i16
17383 // lowering to handle even the v32 case.
17384 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17385 RepeatedMask, Subtarget, DAG);
17386 }
17387 }
17388
17389 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17390 Zeroable, Subtarget, DAG))
17391 return Blend;
17392
17393 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17394 Zeroable, Subtarget, DAG))
17395 return PSHUFB;
17396
17397 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17398 // shuffle.
17399 if (!V2.isUndef())
17400 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17401 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17402 return Result;
17403
17404 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17405}
17406
17407/// Handle lowering of 64-lane 8-bit integer shuffles.
17408static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17409 const APInt &Zeroable, SDValue V1, SDValue V2,
17410 const X86Subtarget &Subtarget,
17411 SelectionDAG &DAG) {
17412 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17413 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17414 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17415 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17416
17417 // Whenever we can lower this as a zext, that instruction is strictly faster
17418 // than any alternative. It also allows us to fold memory operands into the
17419 // shuffle in many cases.
17420 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17421 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17422 return ZExt;
17423
17424 // Use dedicated unpack instructions for masks that match their pattern.
17425 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17426 return V;
17427
17428 // Use dedicated pack instructions for masks that match their pattern.
17429 if (SDValue V =
17430 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17431 return V;
17432
17433 // Try to use shift instructions.
17434 if (SDValue Shift =
17435 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17436 DAG, /*BitwiseOnly*/ false))
17437 return Shift;
17438
17439 // Try to use byte rotation instructions.
17440 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17441 Subtarget, DAG))
17442 return Rotate;
17443
17444 // Try to use bit rotation instructions.
17445 if (V2.isUndef())
17446 if (SDValue Rotate =
17447 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17448 return Rotate;
17449
17450 // Lower as AND if possible.
17451 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17452 Zeroable, Subtarget, DAG))
17453 return Masked;
17454
17455 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17456 Zeroable, Subtarget, DAG))
17457 return PSHUFB;
17458
17459 // Try to create an in-lane repeating shuffle mask and then shuffle the
17460 // results into the target lanes.
17461 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17462 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17463 return V;
17464
17465 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17466 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17467 return Result;
17468
17469 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17470 Zeroable, Subtarget, DAG))
17471 return Blend;
17472
17473 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17474 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17475 // PALIGNR will be cheaper than the second PSHUFB+OR.
17476 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17477 Mask, Subtarget, DAG))
17478 return V;
17479
17480 // If we can't directly blend but can use PSHUFB, that will be better as it
17481 // can both shuffle and set up the inefficient blend.
17482 bool V1InUse, V2InUse;
17483 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17484 DAG, V1InUse, V2InUse);
17485 }
17486
17487 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17488 // shuffle.
17489 if (!V2.isUndef())
17490 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17491 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17492 return Result;
17493
17494 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17495 if (Subtarget.hasVBMI())
17496 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17497
17498 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17499}
17500
17501/// High-level routine to lower various 512-bit x86 vector shuffles.
17502///
17503/// This routine either breaks down the specific type of a 512-bit x86 vector
17504/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17505/// together based on the available instructions.
17506static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17507 MVT VT, SDValue V1, SDValue V2,
17508 const APInt &Zeroable,
17509 const X86Subtarget &Subtarget,
17510 SelectionDAG &DAG) {
17511 assert(Subtarget.hasAVX512() &&
17512 "Cannot lower 512-bit vectors w/ basic ISA!");
17513
17514 // If we have a single input to the zero element, insert that into V1 if we
17515 // can do so cheaply.
17516 int NumElts = Mask.size();
17517 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17518
17519 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17520 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17521 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17522 return Insertion;
17523
17524 // Handle special cases where the lower or upper half is UNDEF.
17525 if (SDValue V =
17526 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17527 return V;
17528
17529 // Check for being able to broadcast a single element.
17530 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17531 Subtarget, DAG))
17532 return Broadcast;
17533
17534 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17535 // Try using bit ops for masking and blending before falling back to
17536 // splitting.
17537 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17538 Subtarget, DAG))
17539 return V;
17540 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17541 return V;
17542
17543 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17544 }
17545
17546 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17547 if (!Subtarget.hasBWI())
17548 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17549 /*SimpleOnly*/ false);
17550
17551 V1 = DAG.getBitcast(MVT::v32i16, V1);
17552 V2 = DAG.getBitcast(MVT::v32i16, V2);
17553 return DAG.getBitcast(VT,
17554 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17555 }
17556
17557 // Dispatch to each element type for lowering. If we don't have support for
17558 // specific element type shuffles at 512 bits, immediately split them and
17559 // lower them. Each lowering routine of a given type is allowed to assume that
17560 // the requisite ISA extensions for that element type are available.
17561 switch (VT.SimpleTy) {
17562 case MVT::v8f64:
17563 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17564 case MVT::v16f32:
17565 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17566 case MVT::v8i64:
17567 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17568 case MVT::v16i32:
17569 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17570 case MVT::v32i16:
17571 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17572 case MVT::v64i8:
17573 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17574
17575 default:
17576 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17577 }
17578}
17579
17580static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17581 MVT VT, SDValue V1, SDValue V2,
17582 const X86Subtarget &Subtarget,
17583 SelectionDAG &DAG) {
17584 // Shuffle should be unary.
17585 if (!V2.isUndef())
17586 return SDValue();
17587
17588 int ShiftAmt = -1;
17589 int NumElts = Mask.size();
17590 for (int i = 0; i != NumElts; ++i) {
17591 int M = Mask[i];
17592 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17593 "Unexpected mask index.");
17594 if (M < 0)
17595 continue;
17596
17597 // The first non-undef element determines our shift amount.
17598 if (ShiftAmt < 0) {
17599 ShiftAmt = M - i;
17600 // Need to be shifting right.
17601 if (ShiftAmt <= 0)
17602 return SDValue();
17603 }
17604 // All non-undef elements must shift by the same amount.
17605 if (ShiftAmt != M - i)
17606 return SDValue();
17607 }
17608 assert(ShiftAmt >= 0 && "All undef?");
17609
17610 // Great we found a shift right.
17611 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17612 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17613 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17614 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17615 DAG.getVectorIdxConstant(0, DL));
17616}
17617
17618// Determine if this shuffle can be implemented with a KSHIFT instruction.
17619// Returns the shift amount if possible or -1 if not. This is a simplified
17620// version of matchShuffleAsShift.
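// For example, a v8i1 mask <2,3,4,5,6,7,U,U> whose top two elements are
// zeroable matches a right shift by 2 (KSHIFTR), while <U,U,0,1,2,3,4,5> with
// the low two elements zeroable matches a left shift by 2 (KSHIFTL).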
17621static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17622 int MaskOffset, const APInt &Zeroable) {
17623 int Size = Mask.size();
17624
17625 auto CheckZeros = [&](int Shift, bool Left) {
17626 for (int j = 0; j < Shift; ++j)
17627 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17628 return false;
17629
17630 return true;
17631 };
17632
17633 auto MatchShift = [&](int Shift, bool Left) {
17634 unsigned Pos = Left ? Shift : 0;
17635 unsigned Low = Left ? 0 : Shift;
17636 unsigned Len = Size - Shift;
17637 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17638 };
17639
17640 for (int Shift = 1; Shift != Size; ++Shift)
17641 for (bool Left : {true, false})
17642 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17643 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17644 return Shift;
17645 }
17646
17647 return -1;
17648}
17649
17650
17651// Lower vXi1 vector shuffles.
17652// There is no dedicated instruction on AVX-512 that shuffles the masks.
17653// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17654// vector, shuffle, and then truncate it back.
17655static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17656 MVT VT, SDValue V1, SDValue V2,
17657 const APInt &Zeroable,
17658 const X86Subtarget &Subtarget,
17659 SelectionDAG &DAG) {
17660 assert(Subtarget.hasAVX512() &&
17661 "Cannot lower 512-bit vectors w/o basic ISA!");
17662
17663 int NumElts = Mask.size();
17664 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17665
17666 // Try to recognize shuffles that are just padding a subvector with zeros.
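 // E.g. a v16i1 mask whose low four elements are <16,17,18,19> (the start of
 // V2) and whose remaining elements are zeroable becomes an extract of the
 // v4i1 subvector from V2 inserted into a zero vector.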
17667 int SubvecElts = 0;
17668 int Src = -1;
17669 for (int i = 0; i != NumElts; ++i) {
17670 if (Mask[i] >= 0) {
17671 // Grab the source from the first valid mask. All subsequent elements need
17672 // to use this same source.
17673 if (Src < 0)
17674 Src = Mask[i] / NumElts;
17675 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17676 break;
17677 }
17678
17679 ++SubvecElts;
17680 }
17681 assert(SubvecElts != NumElts && "Identity shuffle?");
17682
17683 // Clip to a power of 2.
17684 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17685
17686 // Make sure the number of zeroable bits in the top at least covers the bits
17687 // not covered by the subvector.
17688 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17689 assert(Src >= 0 && "Expected a source!");
17690 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17691 SDValue Extract =
17692 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
17693 DAG.getVectorIdxConstant(0, DL));
17694 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17695 DAG.getConstant(0, DL, VT), Extract,
17696 DAG.getVectorIdxConstant(0, DL));
17697 }
17698
17699 // Try a simple shift right with undef elements. Later we'll try with zeros.
17700 if (SDValue Shift =
17701 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
17702 return Shift;
17703
17704 // Try to match KSHIFTs.
17705 unsigned Offset = 0;
17706 for (SDValue V : {V1, V2}) {
17707 unsigned Opcode;
17708 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17709 if (ShiftAmt >= 0) {
17710 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17711 MVT WideVT = Res.getSimpleValueType();
17712 // Widened right shifts need two shifts to ensure we shift in zeroes.
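 // E.g. a v8i1 shuffle widened to v16i1: to shift right by 2 we first
 // KSHIFTL by 16 - 8 = 8 to move the original bits into the MSBs, then
 // KSHIFTR by 8 + 2 = 10 so that only zero bits are shifted in.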
17713 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17714 int WideElts = WideVT.getVectorNumElements();
17715 // Shift left to put the original vector in the MSBs of the new size.
17716 Res =
17717 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17718 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17719 // Increase the shift amount to account for the left shift.
17720 ShiftAmt += WideElts - NumElts;
17721 }
17722
17723 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17724 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17725 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17726 DAG.getVectorIdxConstant(0, DL));
17727 }
17728 Offset += NumElts; // Increment for next iteration.
17729 }
17730
17731 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17732 // ops instead.
17733 // TODO: What other unary shuffles would benefit from this?
17734 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17735 SDValue Op0 = V1.getOperand(0);
17736 SDValue Op1 = V1.getOperand(1);
17737 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17738 EVT OpVT = Op0.getValueType();
17739 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17740 return DAG.getSetCC(
17741 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17742 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17743 }
17744
17745 MVT ExtVT;
17746 switch (VT.SimpleTy) {
17747 default:
17748 llvm_unreachable("Expected a vector of i1 elements");
17749 case MVT::v2i1:
17750 ExtVT = MVT::v2i64;
17751 break;
17752 case MVT::v4i1:
17753 ExtVT = MVT::v4i32;
17754 break;
17755 case MVT::v8i1:
17756 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17757 // shuffle.
17758 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17759 break;
17760 case MVT::v16i1:
17761 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17762 // 256-bit operation available.
17763 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17764 break;
17765 case MVT::v32i1:
17766 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17767 // 256-bit operation available.
17768 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17769 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17770 break;
17771 case MVT::v64i1:
17772 // Fall back to scalarization. FIXME: We can do better if the shuffle
17773 // can be partitioned cleanly.
17774 if (!Subtarget.useBWIRegs())
17775 return SDValue();
17776 ExtVT = MVT::v64i8;
17777 break;
17778 }
17779
17780 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17781 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17782
17783 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17784 // i1 was sign-extended, so a signed compare against zero recovers the mask.
17785 int NumElems = VT.getVectorNumElements();
17786 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17787 (Subtarget.hasDQI() && (NumElems < 32)))
17788 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17789 Shuffle, ISD::SETGT);
17790
17791 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17792}
17793
17794/// Helper function that returns true if the shuffle mask should be
17795/// commuted to improve canonicalization.
17796static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17797 int NumElements = Mask.size();
17798
17799 int NumV1Elements = 0, NumV2Elements = 0;
17800 for (int M : Mask)
17801 if (M < 0)
17802 continue;
17803 else if (M < NumElements)
17804 ++NumV1Elements;
17805 else
17806 ++NumV2Elements;
17807
17808 // Commute the shuffle as needed such that more elements come from V1 than
17809 // V2. This allows us to match the shuffle pattern strictly on how many
17810 // elements come from V1 without handling the symmetric cases.
17811 if (NumV2Elements > NumV1Elements)
17812 return true;
17813
17814 assert(NumV1Elements > 0 && "No V1 indices");
17815
17816 if (NumV2Elements == 0)
17817 return false;
17818
17819 // When the number of V1 and V2 elements are the same, try to minimize the
17820 // number of uses of V2 in the low half of the vector. When that is tied,
17821 // ensure that the sum of indices for V1 is equal to or lower than the sum
17822 // of indices for V2. When those are equal, try to ensure that the number of odd
17823 // indices for V1 is lower than the number of odd indices for V2.
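 // For example, the v4i32 mask <4,1,6,3> uses two elements from each source
 // and ties on the low half, but the V2 elements sit at positions 0 and 2
 // (index sum 2) versus V1's positions 1 and 3 (sum 4), so it is commuted.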
17824 if (NumV1Elements == NumV2Elements) {
17825 int LowV1Elements = 0, LowV2Elements = 0;
17826 for (int M : Mask.slice(0, NumElements / 2))
17827 if (M >= NumElements)
17828 ++LowV2Elements;
17829 else if (M >= 0)
17830 ++LowV1Elements;
17831 if (LowV2Elements > LowV1Elements)
17832 return true;
17833 if (LowV2Elements == LowV1Elements) {
17834 int SumV1Indices = 0, SumV2Indices = 0;
17835 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17836 if (Mask[i] >= NumElements)
17837 SumV2Indices += i;
17838 else if (Mask[i] >= 0)
17839 SumV1Indices += i;
17840 if (SumV2Indices < SumV1Indices)
17841 return true;
17842 if (SumV2Indices == SumV1Indices) {
17843 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17844 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17845 if (Mask[i] >= NumElements)
17846 NumV2OddIndices += i % 2;
17847 else if (Mask[i] >= 0)
17848 NumV1OddIndices += i % 2;
17849 if (NumV2OddIndices < NumV1OddIndices)
17850 return true;
17851 }
17852 }
17853 }
17854
17855 return false;
17856}
17857
17858static bool canCombineAsMaskOperation(SDValue V,
17859 const X86Subtarget &Subtarget) {
17860 if (!Subtarget.hasAVX512())
17861 return false;
17862
17863 if (!V.getValueType().isSimple())
17864 return false;
17865
17866 MVT VT = V.getSimpleValueType().getScalarType();
17867 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17868 return false;
17869
17870 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17871 // are preferable to blendw/blendvb/masked-mov.
17872 if ((VT == MVT::i16 || VT == MVT::i8) &&
17873 V.getSimpleValueType().getSizeInBits() < 512)
17874 return false;
17875
17876 auto HasMaskOperation = [&](SDValue V) {
17877 // TODO: Currently we only check a limited set of opcodes. We could probably
17878 // extend this to all binary operations by checking TLI.isBinOp().
17879 switch (V->getOpcode()) {
17880 default:
17881 return false;
17882 case ISD::ADD:
17883 case ISD::SUB:
17884 case ISD::AND:
17885 case ISD::XOR:
17886 case ISD::OR:
17887 case ISD::SMAX:
17888 case ISD::SMIN:
17889 case ISD::UMAX:
17890 case ISD::UMIN:
17891 case ISD::ABS:
17892 case ISD::SHL:
17893 case ISD::SRL:
17894 case ISD::SRA:
17895 case ISD::MUL:
17896 break;
17897 }
17898 if (!V->hasOneUse())
17899 return false;
17900
17901 return true;
17902 };
17903
17904 if (HasMaskOperation(V))
17905 return true;
17906
17907 return false;
17908}
17909
17910// Forward declaration.
17911static SDValue canonicalizeShuffleMaskWithHorizOp(
17912 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17913 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17914 const X86Subtarget &Subtarget);
17915
17916 /// Top-level lowering for x86 vector shuffles.
17917///
17918/// This handles decomposition, canonicalization, and lowering of all x86
17919/// vector shuffles. Most of the specific lowering strategies are encapsulated
17920/// above in helper routines. The canonicalization attempts to widen shuffles
17921/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17922/// s.t. only one of the two inputs needs to be tested, etc.
17923static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17924 SelectionDAG &DAG) {
17925 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17926 ArrayRef<int> OrigMask = SVOp->getMask();
17927 SDValue V1 = Op.getOperand(0);
17928 SDValue V2 = Op.getOperand(1);
17929 MVT VT = Op.getSimpleValueType();
17930 int NumElements = VT.getVectorNumElements();
17931 SDLoc DL(Op);
17932 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17933
17934 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17935 "Can't lower MMX shuffles");
17936
17937 bool V1IsUndef = V1.isUndef();
17938 bool V2IsUndef = V2.isUndef();
17939 if (V1IsUndef && V2IsUndef)
17940 return DAG.getUNDEF(VT);
17941
17942 // When we create a shuffle node we put the UNDEF node as the second operand,
17943 // but in some cases the first operand may be transformed to UNDEF.
17944 // In that case we should just commute the node.
17945 if (V1IsUndef)
17946 return DAG.getCommutedVectorShuffle(*SVOp);
17947
17948 // Check for non-undef masks pointing at an undef vector and make the masks
17949 // undef as well. This makes it easier to match the shuffle based solely on
17950 // the mask.
17951 if (V2IsUndef &&
17952 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17953 SmallVector<int, 8> NewMask(OrigMask);
17954 for (int &M : NewMask)
17955 if (M >= NumElements)
17956 M = -1;
17957 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17958 }
17959
17960 // Check for illegal shuffle mask element index values.
17961 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17962 (void)MaskUpperLimit;
17963 assert(llvm::all_of(OrigMask,
17964 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17965 "Out of bounds shuffle index");
17966
17967 // We actually see shuffles that are entirely re-arrangements of a set of
17968 // zero inputs. This mostly happens while decomposing complex shuffles into
17969 // simple ones. Directly lower these as a buildvector of zeros.
17970 APInt KnownUndef, KnownZero;
17971 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17972
17973 APInt Zeroable = KnownUndef | KnownZero;
17974 if (Zeroable.isAllOnes())
17975 return getZeroVector(VT, Subtarget, DAG, DL);
17976
17977 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17978
17979 // Try to collapse shuffles into using a vector type with fewer elements but
17980 // wider element types. We cap this to not form integers or floating point
17981 // elements wider than 64 bits. It does not seem beneficial to form i128
17982 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
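 // For example, the v8i16 mask <0,1,4,5,2,3,6,7> pairs adjacent elements and
 // can be widened to the v4i32 mask <0,2,1,3>.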
17983 SmallVector<int, 16> WidenedMask;
17984 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17985 !canCombineAsMaskOperation(V1, Subtarget) &&
17986 !canCombineAsMaskOperation(V2, Subtarget) &&
17987 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17988 // Shuffle mask widening should not interfere with a broadcast opportunity
17989 // by obfuscating the operands with bitcasts.
17990 // TODO: Avoid lowering directly from this top-level function: make this
17991 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17992 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17993 Subtarget, DAG))
17994 return Broadcast;
17995
17996 MVT NewEltVT = VT.isFloatingPoint()
17997 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17998 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17999 int NewNumElts = NumElements / 2;
18000 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18001 // Make sure that the new vector type is legal. For example, v2f64 isn't
18002 // legal on SSE1.
18003 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18004 if (V2IsZero) {
18005 // Modify the new Mask to take all zeros from the all-zero vector.
18006 // Choose indices that are blend-friendly.
18007 bool UsedZeroVector = false;
18008 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18009 "V2's non-undef elements are used?!");
18010 for (int i = 0; i != NewNumElts; ++i)
18011 if (WidenedMask[i] == SM_SentinelZero) {
18012 WidenedMask[i] = i + NewNumElts;
18013 UsedZeroVector = true;
18014 }
18015 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18016 // some elements to be undef.
18017 if (UsedZeroVector)
18018 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18019 }
18020 V1 = DAG.getBitcast(NewVT, V1);
18021 V2 = DAG.getBitcast(NewVT, V2);
18022 return DAG.getBitcast(
18023 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18024 }
18025 }
18026
18027 SmallVector<SDValue> Ops = {V1, V2};
18028 SmallVector<int> Mask(OrigMask);
18029
18030 // Canonicalize the shuffle with any horizontal ops inputs.
18031 // NOTE: This may update Ops and Mask.
18032 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18033 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18034 return DAG.getBitcast(VT, HOp);
18035
18036 V1 = DAG.getBitcast(VT, Ops[0]);
18037 V2 = DAG.getBitcast(VT, Ops[1]);
18038 assert(NumElements == (int)Mask.size() &&
18039 "canonicalizeShuffleMaskWithHorizOp "
18040 "shouldn't alter the shuffle mask size");
18041
18042 // Commute the shuffle if it will improve canonicalization.
18043 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18044 ShuffleVectorSDNode::commuteMask(Mask);
18045 std::swap(V1, V2);
18046 }
18047
18048 // For each vector width, delegate to a specialized lowering routine.
18049 if (VT.is128BitVector())
18050 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18051
18052 if (VT.is256BitVector())
18053 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18054
18055 if (VT.is512BitVector())
18056 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18057
18058 if (Is1BitVector)
18059 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18060
18061 llvm_unreachable("Unimplemented!");
18062}
18063
18064// As legal vpcompress instructions depend on various AVX512 extensions, try to
18065// convert illegal vector sizes to legal ones to avoid expansion.
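// For example, a v4f32 VECTOR_COMPRESS can be widened to v16f32 with a
// zero-extended v16i1 mask, compressed using the AVX512F pattern, and the
// low 128 bits extracted afterwards.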
18066static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18067 SelectionDAG &DAG) {
18068 assert(Subtarget.hasAVX512() &&
18069 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18070
18071 SDLoc DL(Op);
18072 SDValue Vec = Op.getOperand(0);
18073 SDValue Mask = Op.getOperand(1);
18074 SDValue Passthru = Op.getOperand(2);
18075
18076 EVT VecVT = Vec.getValueType();
18077 EVT ElementVT = VecVT.getVectorElementType();
18078 unsigned NumElements = VecVT.getVectorNumElements();
18079 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18080 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18081
18082 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18083 // compressed as 512-bit vectors in AVX512F.
18084 if (NumVecBits != 128 && NumVecBits != 256)
18085 return SDValue();
18086
18087 if (NumElementBits == 32 || NumElementBits == 64) {
18088 unsigned NumLargeElements = 512 / NumElementBits;
18089 MVT LargeVecVT =
18090 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18091 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18092
18093 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18094 DAG, DL);
18095 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18096 Subtarget, DAG, DL);
18097 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18098 : widenSubVector(LargeVecVT, Passthru,
18099 /*ZeroNewElements=*/false,
18100 Subtarget, DAG, DL);
18101
18102 SDValue Compressed =
18103 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18104 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18105 DAG.getConstant(0, DL, MVT::i64));
18106 }
18107
18108 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18109 VecVT == MVT::v16i16) {
18110 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18111 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18112
18113 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18114 Passthru = Passthru.isUndef()
18115 ? DAG.getUNDEF(LargeVecVT)
18116 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18117
18118 SDValue Compressed =
18119 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18120 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18121 }
18122
18123 return SDValue();
18124}
18125
18126/// Try to lower a VSELECT instruction to a vector shuffle.
18128 const X86Subtarget &Subtarget,
18129 SelectionDAG &DAG) {
18130 SDValue Cond = Op.getOperand(0);
18131 SDValue LHS = Op.getOperand(1);
18132 SDValue RHS = Op.getOperand(2);
18133 MVT VT = Op.getSimpleValueType();
18134
18135 // Only non-legal VSELECTs reach this lowering, convert those into generic
18136 // shuffles and re-use the shuffle lowering path for blends.
18137 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18138 SmallVector<int, 32> Mask;
18139 if (createShuffleMaskFromVSELECT(Mask, Cond))
18140 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18141 }
18142
18143 return SDValue();
18144}
18145
18146SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18147 SDValue Cond = Op.getOperand(0);
18148 SDValue LHS = Op.getOperand(1);
18149 SDValue RHS = Op.getOperand(2);
18150
18151 SDLoc dl(Op);
18152 MVT VT = Op.getSimpleValueType();
18153 if (isSoftF16(VT, Subtarget)) {
18154 MVT NVT = VT.changeVectorElementTypeToInteger();
18155 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18156 DAG.getBitcast(NVT, LHS),
18157 DAG.getBitcast(NVT, RHS)));
18158 }
18159
18160 // A vselect where all conditions and data are constants can be optimized into
18161 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18162 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18163 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18164 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18165 return SDValue();
18166
18167 // Try to lower this to a blend-style vector shuffle. This can handle all
18168 // constant condition cases.
18169 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18170 return BlendOp;
18171
18172 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18173 // with patterns on the mask registers on AVX-512.
18174 MVT CondVT = Cond.getSimpleValueType();
18175 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18176 if (CondEltSize == 1)
18177 return Op;
18178
18179 // Variable blends are only legal from SSE4.1 onward.
18180 if (!Subtarget.hasSSE41())
18181 return SDValue();
18182
18183 unsigned EltSize = VT.getScalarSizeInBits();
18184 unsigned NumElts = VT.getVectorNumElements();
18185
18186 // Expand v32i16/v64i8 without BWI.
18187 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18188 return SDValue();
18189
18190 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18191 // into an i1 condition so that we can use the mask-based 512-bit blend
18192 // instructions.
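 // E.g. a v16i32 select with a v16i32 condition becomes a SETNE compare of
 // the condition against zero, producing a v16i1 mask for the select.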
18193 if (VT.getSizeInBits() == 512) {
18194 // Build a mask by testing the condition against zero.
18195 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18196 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18197 DAG.getConstant(0, dl, CondVT),
18198 ISD::SETNE);
18199 // Now return a new VSELECT using the mask.
18200 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18201 }
18202
18203 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18204 if (CondEltSize != EltSize) {
18205 // If we don't have a sign splat, rely on the expansion.
18206 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18207 return SDValue();
18208
18209 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18210 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18211 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18212 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18213 }
18214
18215 // v16i16/v32i8 selects without AVX2, if the condition and another operand
18216 // are free to split, then better to split before expanding the
18217 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18218 // TODO: This is very similar to narrowVectorSelect.
18219 // TODO: Add Load splitting to isFreeToSplitVector ?
18220 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18221 !Subtarget.hasXOP()) {
18222 bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
18223 bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
18224 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18225 bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
18226 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18227 if (FreeCond && (FreeLHS || FreeRHS))
18228 return splitVectorOp(Op, DAG, dl);
18229 }
18230
18231 // Only some types will be legal on some subtargets. If we can emit a legal
18232 // VSELECT-matching blend, return Op; if we need to expand instead, return
18233 // a null value.
18234 switch (VT.SimpleTy) {
18235 default:
18236 // Most of the vector types have blends past SSE4.1.
18237 return Op;
18238
18239 case MVT::v32i8:
18240 // The byte blends for AVX vectors were introduced only in AVX2.
18241 if (Subtarget.hasAVX2())
18242 return Op;
18243
18244 return SDValue();
18245
18246 case MVT::v8i16:
18247 case MVT::v16i16: {
18248 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18249 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18250 Cond = DAG.getBitcast(CastVT, Cond);
18251 LHS = DAG.getBitcast(CastVT, LHS);
18252 RHS = DAG.getBitcast(CastVT, RHS);
18253 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18254 return DAG.getBitcast(VT, Select);
18255 }
18256 }
18257}
18258
18259static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18260 MVT VT = Op.getSimpleValueType();
18261 SDValue Vec = Op.getOperand(0);
18262 SDValue Idx = Op.getOperand(1);
18263 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18264 SDLoc dl(Op);
18265
18266 if (!isa<ConstantSDNode>(Idx))
18267 return SDValue();
18268
18269 if (VT.getSizeInBits() == 8) {
18270 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18271 // we're going to zero extend the register or fold the store.
18272 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18273 !X86::mayFoldIntoStore(Op))
18274 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18275 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18276 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18277
18278 unsigned IdxVal = Idx->getAsZExtVal();
18279 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18280 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18281 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18282 }
18283
18284 if (VT == MVT::f32) {
18285 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18286 // the result back to FR32 register. It's only worth matching if the
18287 // result has a single use which is a store or a bitcast to i32. And in
18288 // the case of a store, it's not worth it if the index is a constant 0,
18289 // because a MOVSSmr can be used instead, which is smaller and faster.
18290 if (!Op.hasOneUse())
18291 return SDValue();
18292 SDNode *User = *Op.getNode()->user_begin();
18293 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18294 (User->getOpcode() != ISD::BITCAST ||
18295 User->getValueType(0) != MVT::i32))
18296 return SDValue();
18297 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18298 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18299 return DAG.getBitcast(MVT::f32, Extract);
18300 }
18301
18302 if (VT == MVT::i32 || VT == MVT::i64)
18303 return Op;
18304
18305 return SDValue();
18306}
18307
18308/// Extract one bit from mask vector, like v16i1 or v8i1.
18309/// AVX-512 feature.
18310static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18311 const X86Subtarget &Subtarget) {
18312 SDValue Vec = Op.getOperand(0);
18313 SDLoc dl(Vec);
18314 MVT VecVT = Vec.getSimpleValueType();
18315 SDValue Idx = Op.getOperand(1);
18316 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18317 MVT EltVT = Op.getSimpleValueType();
18318
18319 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18320 "Unexpected vector type in ExtractBitFromMaskVector");
18321
18322 // A variable index can't be handled in mask registers;
18323 // extend the vector to VR512/VR128.
18324 if (!IdxC) {
18325 unsigned NumElts = VecVT.getVectorNumElements();
18326 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18327 // than extending to 128/256-bit.
18328 if (NumElts == 1) {
18329 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18330 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18331 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18332 }
18333 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18334 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18335 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18336 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18337 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18338 }
18339
18340 unsigned IdxVal = IdxC->getZExtValue();
18341 if (IdxVal == 0) // the operation is legal
18342 return Op;
18343
18344 // Extend to natively supported kshift.
18345 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18346
18347 // Use kshiftr instruction to move to the lower element.
18348 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18349 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18350
18351 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18352 DAG.getVectorIdxConstant(0, dl));
18353}
18354
18355// Helper to find all the extracted elements from a vector.
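// E.g. if a v16i8 value is only read via (PEXTRB V, 3) and via a bitcast to
// v4i32 whose element 0 is extracted, the demanded bytes are 0..3 plus byte 3.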
18356static APInt getExtractedDemandedElts(SDNode *N) {
18357 MVT VT = N->getSimpleValueType(0);
18358 unsigned NumElts = VT.getVectorNumElements();
18359 APInt DemandedElts = APInt::getZero(NumElts);
18360 for (SDNode *User : N->users()) {
18361 switch (User->getOpcode()) {
18362 case X86ISD::PEXTRB:
18363 case X86ISD::PEXTRW:
18364 case ISD::EXTRACT_VECTOR_ELT:
18365 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18366 DemandedElts.setAllBits();
18367 return DemandedElts;
18368 }
18369 DemandedElts.setBit(User->getConstantOperandVal(1));
18370 break;
18371 case ISD::BITCAST: {
18372 if (!User->getValueType(0).isSimple() ||
18373 !User->getValueType(0).isVector()) {
18374 DemandedElts.setAllBits();
18375 return DemandedElts;
18376 }
18377 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18378 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18379 break;
18380 }
18381 default:
18382 DemandedElts.setAllBits();
18383 return DemandedElts;
18384 }
18385 }
18386 return DemandedElts;
18387}
18388
18389SDValue
18390X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18391 SelectionDAG &DAG) const {
18392 SDLoc dl(Op);
18393 SDValue Vec = Op.getOperand(0);
18394 MVT VecVT = Vec.getSimpleValueType();
18395 SDValue Idx = Op.getOperand(1);
18396 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18397
18398 if (VecVT.getVectorElementType() == MVT::i1)
18399 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18400
18401 if (!IdxC) {
18402 // It's more profitable to go through memory (1 cycle throughput)
18403 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18404 // The IACA tool was used to get the performance estimate
18405 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18406 //
18407 // example : extractelement <16 x i8> %a, i32 %i
18408 //
18409 // Block Throughput: 3.00 Cycles
18410 // Throughput Bottleneck: Port5
18411 //
18412 // | Num Of | Ports pressure in cycles | |
18413 // | Uops | 0 - DV | 5 | 6 | 7 | |
18414 // ---------------------------------------------
18415 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18416 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18417 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18418 // Total Num Of Uops: 4
18419 //
18420 //
18421 // Block Throughput: 1.00 Cycles
18422 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18423 //
18424 // | | Ports pressure in cycles | |
18425 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18426 // ---------------------------------------------------------
18427 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18428 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18429 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18430 // Total Num Of Uops: 4
18431
18432 return SDValue();
18433 }
18434
18435 unsigned IdxVal = IdxC->getZExtValue();
18436
18437 // If this is a 256-bit vector result, first extract the 128-bit vector and
18438 // then extract the element from the 128-bit vector.
18439 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18440 // Get the 128-bit vector.
18441 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18442 MVT EltVT = VecVT.getVectorElementType();
18443
18444 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18445 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18446
18447 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18448 // this can be done with a mask.
18449 IdxVal &= ElemsPerChunk - 1;
18450 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18451 DAG.getVectorIdxConstant(IdxVal, dl));
18452 }
18453
18454 assert(VecVT.is128BitVector() && "Unexpected vector length");
18455
18456 MVT VT = Op.getSimpleValueType();
18457
18458 if (VT == MVT::i16) {
18459 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18460 // we're going to zero extend the register or fold the store (SSE41 only).
18461 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18462 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18463 if (Subtarget.hasFP16())
18464 return Op;
18465
18466 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18467 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18468 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18469 }
18470
18471 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18472 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18473 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18474 }
18475
18476 if (Subtarget.hasSSE41())
18477 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18478 return Res;
18479
18480 // Only extract a single element from a v16i8 source - determine the common
18481 // DWORD/WORD that all extractions share, and extract the sub-byte.
18482 // TODO: Add QWORD MOVQ extraction?
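 // E.g. extracting byte 5 when only bytes 4 and 5 are demanded becomes an
 // extract of word 2 from a v8i16 bitcast, a logical shift right by 8 and a
 // truncate to i8.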
18483 if (VT == MVT::i8) {
18484 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18485 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18486
18487 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18488 int DWordIdx = IdxVal / 4;
18489 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18490 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18491 DAG.getBitcast(MVT::v4i32, Vec),
18492 DAG.getVectorIdxConstant(DWordIdx, dl));
18493 int ShiftVal = (IdxVal % 4) * 8;
18494 if (ShiftVal != 0)
18495 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18496 DAG.getConstant(ShiftVal, dl, MVT::i8));
18497 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18498 }
18499
18500 int WordIdx = IdxVal / 2;
18501 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18502 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18503 DAG.getBitcast(MVT::v8i16, Vec),
18504 DAG.getVectorIdxConstant(WordIdx, dl));
18505 int ShiftVal = (IdxVal % 2) * 8;
18506 if (ShiftVal != 0)
18507 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18508 DAG.getConstant(ShiftVal, dl, MVT::i8));
18509 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18510 }
18511 }
18512
18513 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18514 if (IdxVal == 0)
18515 return Op;
18516
18517 // Shuffle the element to the lowest element, then movss or movsh.
18518 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18519 Mask[0] = static_cast<int>(IdxVal);
18520 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18521 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18522 DAG.getVectorIdxConstant(0, dl));
18523 }
18524
18525 if (VT.getSizeInBits() == 64) {
18526 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18527 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18528 // to match extract_elt for f64.
18529 if (IdxVal == 0)
18530 return Op;
18531
18532 // UNPCKHPD the element to the lowest double word, then movsd.
18533 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18534 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18535 int Mask[2] = { 1, -1 };
18536 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18537 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18538 DAG.getVectorIdxConstant(0, dl));
18539 }
18540
18541 return SDValue();
18542}
18543
18544/// Insert one bit to mask vector, like v16i1 or v8i1.
18545/// AVX-512 feature.
18546static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18547 const X86Subtarget &Subtarget) {
18548 SDLoc dl(Op);
18549 SDValue Vec = Op.getOperand(0);
18550 SDValue Elt = Op.getOperand(1);
18551 SDValue Idx = Op.getOperand(2);
18552 MVT VecVT = Vec.getSimpleValueType();
18553
18554 if (!isa<ConstantSDNode>(Idx)) {
18555 // Non-constant index. Extend source and destination,
18556 // insert element and then truncate the result.
18557 unsigned NumElts = VecVT.getVectorNumElements();
18558 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18559 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18560 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18561 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18562 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18563 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18564 }
18565
18566 // Copy into a k-register, extract to v1i1 and insert_subvector.
18567 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18568 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18569}
18570
18571SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18572 SelectionDAG &DAG) const {
18573 MVT VT = Op.getSimpleValueType();
18574 MVT EltVT = VT.getVectorElementType();
18575 unsigned NumElts = VT.getVectorNumElements();
18576 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18577
18578 if (EltVT == MVT::i1)
18579 return InsertBitToMaskVector(Op, DAG, Subtarget);
18580
18581 SDLoc dl(Op);
18582 SDValue N0 = Op.getOperand(0);
18583 SDValue N1 = Op.getOperand(1);
18584 SDValue N2 = Op.getOperand(2);
18585 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18586
18587 if (EltVT == MVT::bf16) {
18588 MVT IVT = VT.changeVectorElementTypeToInteger();
18589 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18590 DAG.getBitcast(IVT, N0),
18591 DAG.getBitcast(MVT::i16, N1), N2);
18592 return DAG.getBitcast(VT, Res);
18593 }
18594
18595 if (!N2C) {
18596 // For variable insertion indices we're usually better off spilling to the stack,
18597 // but AVX512 can use a variable compare+select by comparing against all
18598 // possible vector indices, and FP insertion has less gpr->simd traffic.
18599 if (!(Subtarget.hasBWI() ||
18600 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18601 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18602 return SDValue();
18603
18604 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18605 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18606 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18607 return SDValue();
18608
18609 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18610 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18611 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18612
18613 SmallVector<SDValue, 16> RawIndices;
18614 for (unsigned I = 0; I != NumElts; ++I)
18615 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18616 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18617
18618 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18619 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18620 ISD::CondCode::SETEQ);
18621 }
18622
18623 if (N2C->getAPIntValue().uge(NumElts))
18624 return SDValue();
18625 uint64_t IdxVal = N2C->getZExtValue();
18626
18627 bool IsZeroElt = X86::isZeroNode(N1);
18628 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18629
18630 if (IsZeroElt || IsAllOnesElt) {
18631 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18632 // We don't deal with i8 0 since it appears to be handled elsewhere.
18633 if (IsAllOnesElt &&
18634 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18635 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18636 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18637 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18638 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18639 CstVectorElts[IdxVal] = OnesCst;
18640 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18641 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18642 }
18643 // See if we can do this more efficiently with a blend shuffle with a
18644 // rematerializable vector.
18645 if (Subtarget.hasSSE41() &&
18646 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18647 SmallVector<int, 8> BlendMask;
18648 for (unsigned i = 0; i != NumElts; ++i)
18649 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18650 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18651 : getOnesVector(VT, DAG, dl);
18652 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18653 }
18654 }
18655
18656 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18657 // into that, and then insert the subvector back into the result.
18658 if (VT.is256BitVector() || VT.is512BitVector()) {
18659 // With a 256-bit vector, we can insert into the zero element efficiently
18660 // using a blend if we have AVX or AVX2 and the right data type.
18661 if (VT.is256BitVector() && IdxVal == 0) {
18662 // TODO: It is worthwhile to cast integer to floating point and back
18663 // and incur a domain crossing penalty if that's what we'll end up
18664 // doing anyway after extracting to a 128-bit vector.
18665 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18666 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18667 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18668 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18669 DAG.getTargetConstant(1, dl, MVT::i8));
18670 }
18671 }
18672
18673 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18674 assert(isPowerOf2_32(NumEltsIn128) &&
18675 "Vectors will always have power-of-two number of elements.");
18676
18677 // If we are not inserting into the low 128-bit vector chunk,
18678 // then prefer the broadcast+blend sequence.
18679 // FIXME: relax the profitability check iff all N1 uses are insertions.
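 // E.g. inserting a scalar into element 5 of a v8f32 on AVX2: splat the
 // scalar and shuffle with mask <0,1,2,3,4,13,6,7>, which takes the new
 // element from the splat and lowers to a blend.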
18680 if (IdxVal >= NumEltsIn128 &&
18681 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18682 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18683 X86::mayFoldLoad(N1, Subtarget)))) {
18684 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18685 SmallVector<int, 8> BlendMask;
18686 for (unsigned i = 0; i != NumElts; ++i)
18687 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18688 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18689 }
18690
18691 // Get the desired 128-bit vector chunk.
18692 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18693
18694 // Insert the element into the desired chunk.
18695 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18696 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18697
18698 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18699 DAG.getVectorIdxConstant(IdxIn128, dl));
18700
18701 // Insert the changed part back into the bigger vector
18702 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18703 }
18704 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18705
18706 // This will be just movw/movd/movq/movsh/movss/movsd.
18707 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18708 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18709 EltVT == MVT::f16 || EltVT == MVT::i64) {
18710 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18711 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18712 }
18713
18714 // We can't directly insert an i8 or i16 into a vector, so zero extend
18715 // it to i32 first.
18716 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18717 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18718 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18719 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18720 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18721 return DAG.getBitcast(VT, N1);
18722 }
18723 }
18724
18725 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18726 // argument. SSE41 required for pinsrb.
18727 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18728 unsigned Opc;
18729 if (VT == MVT::v8i16) {
18730 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18731 Opc = X86ISD::PINSRW;
18732 } else {
18733 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18734 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18735 Opc = X86ISD::PINSRB;
18736 }
18737
18738 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18739 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18740 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18741 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18742 }
18743
18744 if (Subtarget.hasSSE41()) {
18745 if (EltVT == MVT::f32) {
18746 // Bits [7:6] of the constant are the source select. This will always be
18747 // zero here. The DAG Combiner may combine an extract_elt index into
18748 // these bits. For example (insert (extract, 3), 2) could be matched by
18749 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18750 // Bits [5:4] of the constant are the destination select. This is the
18751 // value of the incoming immediate.
18752 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18753 // combine either bitwise AND or insert of float 0.0 to set these bits.
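// For example, with IdxVal == 2 the immediate is (2 << 4) == 0x20: take element
// 0 of the scalar-to-vector N1, write it into element 2 of N0, and zero nothing.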
18754
18755 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18756 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18757 // If this is an insertion of 32-bits into the low 32-bits of
18758 // a vector, we prefer to generate a blend with immediate rather
18759 // than an insertps. Blends are simpler operations in hardware and so
18760 // will always have equal or better performance than insertps.
18761 // But if optimizing for size and there's a load folding opportunity,
18762 // generate insertps because blendps does not have a 32-bit memory
18763 // operand form.
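// A blend immediate of 1 takes element 0 from the second operand and elements
// 1-3 from N0, which is exactly the low-element insertion wanted here.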
18764 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18765 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18766 DAG.getTargetConstant(1, dl, MVT::i8));
18767 }
18768 // Create this as a scalar-to-vector operation.
18769 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18770 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18771 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18772 }
18773
18774 // PINSR* works with constant index.
18775 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18776 return Op;
18777 }
18778
18779 return SDValue();
18780}
18781
18783 SelectionDAG &DAG) {
18784 SDLoc dl(Op);
18785 MVT OpVT = Op.getSimpleValueType();
18786
18787 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
18788 // further combines.
18789 if (X86::isZeroNode(Op.getOperand(0)))
18790 return getZeroVector(OpVT, Subtarget, DAG, dl);
18791
18792 // If this is a wider-than-128-bit vector result, first insert into a 128-bit
18793 // vector and then insert that into the full-width vector.
18794 if (!OpVT.is128BitVector()) {
18795 // Insert into a 128-bit vector.
18796 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18798 OpVT.getVectorNumElements() / SizeFactor);
18799
18800 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18801
18802 // Insert the 128-bit vector.
18803 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18804 }
18805 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18806 "Expected an SSE type!");
18807
18808 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18809 // tblgen.
18810 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18811 return Op;
18812
18813 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18814 return DAG.getBitcast(
18815 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18816}
18817
18818// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18819// simple superregister reference or explicit instructions to insert
18820// the upper bits of a vector.
18822 SelectionDAG &DAG) {
18823 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18824
18825 return insert1BitVector(Op, DAG, Subtarget);
18826}
18827
18829 SelectionDAG &DAG) {
18830 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18831 "Only vXi1 extract_subvectors need custom lowering");
18832
18833 SDLoc dl(Op);
18834 SDValue Vec = Op.getOperand(0);
18835 uint64_t IdxVal = Op.getConstantOperandVal(1);
18836
18837 if (IdxVal == 0) // the operation is legal
18838 return Op;
18839
18840 // Extend to natively supported kshift.
18841 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18842
18843 // Shift to the LSB.
18844 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18845 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18846
18847 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18848 DAG.getVectorIdxConstant(0, dl));
18849}
18850
18851// Returns the appropriate wrapper opcode for a global reference.
18852unsigned X86TargetLowering::getGlobalWrapperKind(
18853 const GlobalValue *GV, const unsigned char OpFlags) const {
18854 // References to absolute symbols are never PC-relative.
18855 if (GV && GV->isAbsoluteSymbolRef())
18856 return X86ISD::Wrapper;
18857
18858 // The following OpFlags under RIP-rel PIC use RIP.
18859 if (Subtarget.isPICStyleRIPRel() &&
18860 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18861 OpFlags == X86II::MO_DLLIMPORT))
18862 return X86ISD::WrapperRIP;
18863
18864 // GOTPCREL references must always use RIP.
18865 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18866 return X86ISD::WrapperRIP;
18867
18868 return X86ISD::Wrapper;
18869}
18870
18871 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18872 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
18873 // one of the above-mentioned nodes. It has to be wrapped because otherwise
18874 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18875 // be used to form an addressing mode. These wrapped nodes will be selected
18876 // into MOV32ri.
18877SDValue
18878X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18879 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18880
18881 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18882 // global base reg.
18883 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18884
18885 auto PtrVT = getPointerTy(DAG.getDataLayout());
18887 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18888 SDLoc DL(CP);
18889 Result =
18890 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18891 // With PIC, the address is actually $g + Offset.
18892 if (OpFlag) {
18893 Result =
18894 DAG.getNode(ISD::ADD, DL, PtrVT,
18895 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18896 }
18897
18898 return Result;
18899}
18900
18901SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18902 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18903
18904 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18905 // global base reg.
18906 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18907
18908 auto PtrVT = getPointerTy(DAG.getDataLayout());
18909 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18910 SDLoc DL(JT);
18911 Result =
18912 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18913
18914 // With PIC, the address is actually $g + Offset.
18915 if (OpFlag)
18916 Result =
18917 DAG.getNode(ISD::ADD, DL, PtrVT,
18918 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18919
18920 return Result;
18921}
18922
18923SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18924 SelectionDAG &DAG) const {
18925 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18926}
18927
18928SDValue
18929X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18930 // Create the TargetBlockAddress node.
18931 unsigned char OpFlags =
18933 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18934 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18935 SDLoc dl(Op);
18936 auto PtrVT = getPointerTy(DAG.getDataLayout());
18937 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18938 Result =
18939 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18940
18941 // With PIC, the address is actually $g + Offset.
18942 if (isGlobalRelativeToPICBase(OpFlags)) {
18943 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18944 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18945 }
18946
18947 return Result;
18948}
18949
18950/// Creates target global address or external symbol nodes for calls or
18951/// other uses.
18952SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18953 bool ForCall) const {
18954 // Unpack the global address or external symbol.
18955 SDLoc dl(Op);
18956 const GlobalValue *GV = nullptr;
18957 int64_t Offset = 0;
18958 const char *ExternalSym = nullptr;
18959 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18960 GV = G->getGlobal();
18961 Offset = G->getOffset();
18962 } else {
18963 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18964 ExternalSym = ES->getSymbol();
18965 }
18966
18967 // Calculate some flags for address lowering.
18969 unsigned char OpFlags;
18970 if (ForCall)
18971 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18972 else
18973 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18974 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18975 bool NeedsLoad = isGlobalStubReference(OpFlags);
18976
18978 auto PtrVT = getPointerTy(DAG.getDataLayout());
18980
18981 if (GV) {
18982 // Create a target global address if this is a global. If possible, fold the
18983 // offset into the global address reference. Otherwise, ADD it on later.
18984 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18985 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18986 // relocation will compute to a negative value, which is invalid.
18987 int64_t GlobalOffset = 0;
18988 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18990 std::swap(GlobalOffset, Offset);
18991 }
18992 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18993 } else {
18994 // If this is not a global address, this must be an external symbol.
18995 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18996 }
18997
18998 // If this is a direct call, avoid the wrapper if we don't need to do any
18999 // loads or adds. This allows SDAG ISel to match direct calls.
19000 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19001 return Result;
19002
19003 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19004
19005 // With PIC, the address is actually $g + Offset.
19006 if (HasPICReg) {
19007 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19008 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19009 }
19010
19011 // For globals that require a load from a stub to get the address, emit the
19012 // load.
19013 if (NeedsLoad)
19014 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19016
19017 // If there was a non-zero offset that we didn't fold, create an explicit
19018 // addition for it.
19019 if (Offset != 0)
19020 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19021 DAG.getSignedConstant(Offset, dl, PtrVT));
19022
19023 return Result;
19024}
19025
19026SDValue
19027X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19028 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19029}
19030
19032 const EVT PtrVT, unsigned ReturnReg,
19033 unsigned char OperandFlags,
19034 bool LoadGlobalBaseReg = false,
19035 bool LocalDynamic = false) {
19037 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19038 SDLoc dl(GA);
19039 SDValue TGA;
19040 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19041 SDValue Chain = DAG.getEntryNode();
19042 SDValue Ret;
19043 if (LocalDynamic && UseTLSDESC) {
19044 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19045 // Reuse existing GetTLSADDR node if we can find it.
19046 if (TGA->hasOneUse()) {
19047 // TLSDESC uses TGA.
19048 SDNode *TLSDescOp = *TGA->user_begin();
19049 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19050 "Unexpected TLSDESC DAG");
19051 // CALLSEQ_END uses TGA via a chain and glue.
19052 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19053 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19054 "Unexpected TLSDESC DAG");
19055 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19056 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19057 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19058 "Unexpected TLSDESC DAG");
19059 Ret = SDValue(CopyFromRegOp, 0);
19060 }
19061 } else {
19062 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19063 GA->getOffset(), OperandFlags);
19064 }
19065
19066 if (!Ret) {
19067 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19068 : LocalDynamic ? X86ISD::TLSBASEADDR
19070
19071 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19072 if (LoadGlobalBaseReg) {
19073 SDValue InGlue;
19074 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19075 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19076 InGlue);
19077 InGlue = Chain.getValue(1);
19078 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19079 } else {
19080 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19081 }
19082 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19083
19084 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
19085 MFI.setHasCalls(true);
19086
19087 SDValue Glue = Chain.getValue(1);
19088 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19089 }
19090
19091 if (!UseTLSDESC)
19092 return Ret;
19093
19094 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19095 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19096
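// The TLSDESC call returns the symbol's offset from the thread-local storage
// block; add it to the thread pointer, which is loaded from offset 0 of the
// %fs (64-bit) or %gs (32-bit) segment.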
19098 SDValue Offset =
19099 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19101 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19102}
19103
19104// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19105static SDValue
19107 const EVT PtrVT) {
19108 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19109 /*LoadGlobalBaseReg=*/true);
19110}
19111
19112// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19113static SDValue
19115 const EVT PtrVT) {
19116 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19117}
19118
19119// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19120static SDValue
19122 const EVT PtrVT) {
19123 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19124}
19125
19127 SelectionDAG &DAG, const EVT PtrVT,
19128 bool Is64Bit, bool Is64BitLP64) {
19129 SDLoc dl(GA);
19130
19131 // Get the start address of the TLS block for this module.
19135
19136 SDValue Base;
19137 if (Is64Bit) {
19138 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19139 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19140 /*LoadGlobalBaseReg=*/false,
19141 /*LocalDynamic=*/true);
19142 } else {
19143 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19144 /*LoadGlobalBaseReg=*/true,
19145 /*LocalDynamic=*/true);
19146 }
19147
19148 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19149 // of Base.
19150
19151 // Build x@dtpoff.
19152 unsigned char OperandFlags = X86II::MO_DTPOFF;
19153 unsigned WrapperKind = X86ISD::Wrapper;
19154 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19155 GA->getValueType(0),
19156 GA->getOffset(), OperandFlags);
19157 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19158
19159 // Add x@dtpoff with the base.
19160 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19161}
19162
19163// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19165 const EVT PtrVT, TLSModel::Model model,
19166 bool is64Bit, bool isPIC) {
19167 SDLoc dl(GA);
19168
19169 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19172
19173 SDValue ThreadPointer =
19174 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19176
19177 unsigned char OperandFlags = 0;
19178 // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
19179 // initial exec.
19180 unsigned WrapperKind = X86ISD::Wrapper;
19181 if (model == TLSModel::LocalExec) {
19182 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19183 } else if (model == TLSModel::InitialExec) {
19184 if (is64Bit) {
19185 OperandFlags = X86II::MO_GOTTPOFF;
19186 WrapperKind = X86ISD::WrapperRIP;
19187 } else {
19188 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19189 }
19190 } else {
19191 llvm_unreachable("Unexpected model");
19192 }
19193
19194 // emit "addl x@ntpoff,%eax" (local exec)
19195 // or "addl x@indntpoff,%eax" (initial exec)
19196 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19197 SDValue TGA =
19198 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19199 GA->getOffset(), OperandFlags);
19200 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19201
19202 if (model == TLSModel::InitialExec) {
19203 if (isPIC && !is64Bit) {
19204 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19205 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19206 Offset);
19207 }
19208
19209 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19211 }
19212
19213 // The address of the thread-local variable is the sum of the thread
19214 // pointer and the offset of the variable.
19215 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19216}
19217
19218SDValue
19219X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19220
19221 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19222
19223 if (DAG.getTarget().useEmulatedTLS())
19224 return LowerToTLSEmulatedModel(GA, DAG);
19225
19226 const GlobalValue *GV = GA->getGlobal();
19227 auto PtrVT = getPointerTy(DAG.getDataLayout());
19228 bool PositionIndependent = isPositionIndependent();
19229
19230 if (Subtarget.isTargetELF()) {
19231 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19232 switch (model) {
19234 if (Subtarget.is64Bit()) {
19235 if (Subtarget.isTarget64BitLP64())
19236 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19237 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19238 }
19239 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19241 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19242 Subtarget.isTarget64BitLP64());
19245 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19246 PositionIndependent);
19247 }
19248 llvm_unreachable("Unknown TLS model.");
19249 }
19250
19251 if (Subtarget.isTargetDarwin()) {
19252 // Darwin only has one model of TLS. Lower to that.
19253 unsigned char OpFlag = 0;
19254 unsigned WrapperKind = 0;
19255
19256 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19257 // global base reg.
19258 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19259 if (PIC32) {
19260 OpFlag = X86II::MO_TLVP_PIC_BASE;
19261 WrapperKind = X86ISD::Wrapper;
19262 } else {
19263 OpFlag = X86II::MO_TLVP;
19264 WrapperKind = X86ISD::WrapperRIP;
19265 }
19266 SDLoc DL(Op);
19268 GA->getValueType(0),
19269 GA->getOffset(), OpFlag);
19270 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19271
19272 // With PIC32, the address is actually $g + Offset.
19273 if (PIC32)
19274 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19275 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19276 Offset);
19277
19278 // Lowering the machine ISD node will make sure everything is in the right
19279 // location.
19280 SDValue Chain = DAG.getEntryNode();
19281 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19282 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19283 SDValue Args[] = { Chain, Offset };
19284 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19285 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19286
19287 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
19289 MFI.setAdjustsStack(true);
19290
19291 // And our return value (tls address) is in the standard call return value
19292 // location.
19293 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19294 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19295 }
19296
19297 if (Subtarget.isOSWindows()) {
19298 // Just use the implicit TLS mechanism.
19299 // Need to generate something similar to:
19300 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19301 // ; from TEB
19302 // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
19303 // mov rcx, qword [rdx+rcx*8]
19304 // mov eax, .tls$:tlsvar
19305 // [rax+rcx] contains the address
19306 // Windows 64bit: gs:0x58
19307 // Windows 32bit: fs:__tls_array
19308
19309 SDLoc dl(GA);
19310 SDValue Chain = DAG.getEntryNode();
19311
19312 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19313 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19314 // use its literal value of 0x2C.
19316 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19318
19319 SDValue TlsArray = Subtarget.is64Bit()
19320 ? DAG.getIntPtrConstant(0x58, dl)
19321 : (Subtarget.isTargetWindowsGNU()
19322 ? DAG.getIntPtrConstant(0x2C, dl)
19323 : DAG.getExternalSymbol("_tls_array", PtrVT));
19324
19326 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19327
19328 SDValue res;
19330 res = ThreadPointer;
19331 } else {
19332 // Load the _tls_index variable
19333 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19334 if (Subtarget.is64Bit())
19335 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19336 MachinePointerInfo(), MVT::i32);
19337 else
19338 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19339
19340 const DataLayout &DL = DAG.getDataLayout();
19341 SDValue Scale =
19342 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19343 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19344
19345 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19346 }
19347
19348 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19349
19350 // Get the offset of the start of the .tls section.
19351 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19352 GA->getValueType(0),
19354 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19355
19356 // The address of the thread-local variable is the sum of the thread
19357 // pointer and the offset of the variable.
19358 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19359 }
19360
19361 llvm_unreachable("TLS not implemented for this target.");
19362}
19363
19365 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19366 const TargetMachine &TM = getTargetMachine();
19367 TLSModel::Model Model = TM.getTLSModel(&GV);
19368 switch (Model) {
19371 // We can include the %fs segment register in addressing modes.
19372 return true;
19375 // These models do not result in %fs-relative addresses unless
19376 // TLS descriptors are used.
19377 //
19378 // Even in the case of TLS descriptors we currently have no way to model
19379 // the difference between the %fs access and the computations needed for the
19380 // offset, and returning `true` for TLS descriptors currently duplicates both,
19381 // which is detrimental :-/
19382 return false;
19383 }
19384 }
19385 return false;
19386}
19387
19388/// Lower SRA_PARTS and friends, which return two i32 values
19389/// and take a 2 x i32 value to shift plus a shift amount.
19390/// TODO: Can this be moved to general expansion code?
19392 SDValue Lo, Hi;
19393 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19394 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19395}
19396
19397// Try to use a packed vector operation to handle i64 on 32-bit targets when
19398// AVX512DQ is enabled.
19400 SelectionDAG &DAG,
19401 const X86Subtarget &Subtarget) {
19402 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19403 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19404 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19405 Op.getOpcode() == ISD::UINT_TO_FP) &&
19406 "Unexpected opcode!");
19407 bool IsStrict = Op->isStrictFPOpcode();
19408 unsigned OpNo = IsStrict ? 1 : 0;
19409 SDValue Src = Op.getOperand(OpNo);
19410 MVT SrcVT = Src.getSimpleValueType();
19411 MVT VT = Op.getSimpleValueType();
19412
19413 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19414 (VT != MVT::f32 && VT != MVT::f64))
19415 return SDValue();
19416
19417 // Pack the i64 into a vector, do the operation and extract.
19418
19419 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
19420 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19421 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19422 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19423
19424 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19425 if (IsStrict) {
19426 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19427 {Op.getOperand(0), InVec});
19428 SDValue Chain = CvtVec.getValue(1);
19429 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19430 DAG.getVectorIdxConstant(0, dl));
19431 return DAG.getMergeValues({Value, Chain}, dl);
19432 }
19433
19434 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19435
19436 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19437 DAG.getVectorIdxConstant(0, dl));
19438}
19439
19440// Try to use a packed vector operation to handle i64 on 32-bit targets.
19442 const X86Subtarget &Subtarget) {
19443 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19444 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19445 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19446 Op.getOpcode() == ISD::UINT_TO_FP) &&
19447 "Unexpected opcode!");
19448 bool IsStrict = Op->isStrictFPOpcode();
19449 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19450 MVT SrcVT = Src.getSimpleValueType();
19451 MVT VT = Op.getSimpleValueType();
19452
19453 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19454 return SDValue();
19455
19456 // Pack the i64 into a vector, do the operation and extract.
19457
19458 assert(Subtarget.hasFP16() && "Expected FP16");
19459
19460 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19461 if (IsStrict) {
19462 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19463 {Op.getOperand(0), InVec});
19464 SDValue Chain = CvtVec.getValue(1);
19465 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19466 DAG.getVectorIdxConstant(0, dl));
19467 return DAG.getMergeValues({Value, Chain}, dl);
19468 }
19469
19470 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19471
19472 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19473 DAG.getVectorIdxConstant(0, dl));
19474}
19475
19476static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19477 const X86Subtarget &Subtarget) {
19478 switch (Opcode) {
19479 case ISD::SINT_TO_FP:
19480 // TODO: Handle wider types with AVX/AVX512.
19481 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19482 return false;
19483 // CVTDQ2PS or (V)CVTDQ2PD
19484 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19485
19486 case ISD::UINT_TO_FP:
19487 // TODO: Handle wider types and i64 elements.
19488 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19489 return false;
19490 // VCVTUDQ2PS or VCVTUDQ2PD
19491 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19492
19493 default:
19494 return false;
19495 }
19496}
19497
19498/// Given a scalar cast operation that is extracted from a vector, try to
19499/// vectorize the cast op followed by extraction. This will avoid an expensive
19500/// round-trip between XMM and GPR.
19502 SelectionDAG &DAG,
19503 const X86Subtarget &Subtarget) {
19504 // TODO: This could be enhanced to handle smaller integer types by peeking
19505 // through an extend.
19506 SDValue Extract = Cast.getOperand(0);
19507 MVT DestVT = Cast.getSimpleValueType();
19508 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19509 !isa<ConstantSDNode>(Extract.getOperand(1)))
19510 return SDValue();
19511
19512 // See if we have a 128-bit vector cast op for this type of cast.
19513 SDValue VecOp = Extract.getOperand(0);
19514 MVT FromVT = VecOp.getSimpleValueType();
19515 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19516 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19517 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19518 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19519 return SDValue();
19520
19521 // If we are extracting from a non-zero element, first shuffle the source
19522 // vector to allow extracting from element zero.
19523 if (!isNullConstant(Extract.getOperand(1))) {
19524 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19525 Mask[0] = Extract.getConstantOperandVal(1);
19526 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19527 }
19528 // If the source vector is wider than 128-bits, extract the low part. Do not
19529 // create an unnecessarily wide vector cast op.
19530 if (FromVT != Vec128VT)
19531 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19532
19533 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19534 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19535 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19536 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19537 DAG.getVectorIdxConstant(0, DL));
19538}
19539
19540/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19541/// try to vectorize the cast ops. This will avoid an expensive round-trip
19542/// between XMM and GPR.
19543static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19544 SelectionDAG &DAG,
19545 const X86Subtarget &Subtarget) {
19546 // TODO: Allow FP_TO_UINT.
19547 SDValue CastToInt = CastToFP.getOperand(0);
19548 MVT VT = CastToFP.getSimpleValueType();
19549 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19550 return SDValue();
19551
19552 MVT IntVT = CastToInt.getSimpleValueType();
19553 SDValue X = CastToInt.getOperand(0);
19554 MVT SrcVT = X.getSimpleValueType();
19555 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19556 return SDValue();
19557
19558 // See if we have 128-bit vector cast instructions for this type of cast.
19559 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19560 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19561 IntVT != MVT::i32)
19562 return SDValue();
19563
19564 unsigned SrcSize = SrcVT.getSizeInBits();
19565 unsigned IntSize = IntVT.getSizeInBits();
19566 unsigned VTSize = VT.getSizeInBits();
19567 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19568 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19569 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19570
19571 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19572 unsigned ToIntOpcode =
19573 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19574 unsigned ToFPOpcode =
19575 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19576
19577 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19578 //
19579 // We are not defining the high elements (for example, zeroing them) because
19580 // that could nullify any performance advantage that we hoped to gain from
19581 // this vector op hack. We do not expect any adverse effects (like denorm
19582 // penalties) with cast ops.
19583 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19584 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19585 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19586 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19587 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19588}
19589
19591 SelectionDAG &DAG,
19592 const X86Subtarget &Subtarget) {
19593 bool IsStrict = Op->isStrictFPOpcode();
19594 MVT VT = Op->getSimpleValueType(0);
19595 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19596
19597 if (Subtarget.hasDQI()) {
19598 assert(!Subtarget.hasVLX() && "Unexpected features");
19599
19600 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19601 Src.getSimpleValueType() == MVT::v4i64) &&
19602 "Unsupported custom type");
19603
19604 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19605 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19606 "Unexpected VT!");
19607 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19608
19609 // Need to concat with zero vector for strict fp to avoid spurious
19610 // exceptions.
19611 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19612 : DAG.getUNDEF(MVT::v8i64);
19613 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19614 DAG.getVectorIdxConstant(0, DL));
19615 SDValue Res, Chain;
19616 if (IsStrict) {
19617 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19618 {Op->getOperand(0), Src});
19619 Chain = Res.getValue(1);
19620 } else {
19621 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19622 }
19623
19624 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19625 DAG.getVectorIdxConstant(0, DL));
19626
19627 if (IsStrict)
19628 return DAG.getMergeValues({Res, Chain}, DL);
19629 return Res;
19630 }
19631
19632 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19633 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19634 if (VT != MVT::v4f32 || IsSigned)
19635 return SDValue();
19636
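// Convert unsigned v4i64 to v4f32 via signed scalar conversions: when the sign
// bit of an element is set, shift it right by one with the low bit ORed back in
// (so no rounding information is lost), convert that, and double the result
// with an fadd; otherwise convert the element directly.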
19637 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19638 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19639 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19640 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19641 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19642 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19643 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19644 SmallVector<SDValue, 4> SignCvts(4);
19645 SmallVector<SDValue, 4> Chains(4);
19646 for (int i = 0; i != 4; ++i) {
19647 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19648 DAG.getVectorIdxConstant(i, DL));
19649 if (IsStrict) {
19650 SignCvts[i] =
19651 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19652 {Op.getOperand(0), Elt});
19653 Chains[i] = SignCvts[i].getValue(1);
19654 } else {
19655 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19656 }
19657 }
19658 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19659
19660 SDValue Slow, Chain;
19661 if (IsStrict) {
19662 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19663 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19664 {Chain, SignCvt, SignCvt});
19665 Chain = Slow.getValue(1);
19666 } else {
19667 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19668 }
19669
19670 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19671 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19672
19673 if (IsStrict)
19674 return DAG.getMergeValues({Cvt, Chain}, DL);
19675
19676 return Cvt;
19677}
19678
19680 SelectionDAG &DAG) {
19681 bool IsStrict = Op->isStrictFPOpcode();
19682 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19683 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19684 MVT VT = Op.getSimpleValueType();
19685 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19686
19687 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
19688 if (IsStrict)
19689 return DAG.getNode(
19690 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19691 {Chain,
19692 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19693 Rnd});
19694 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19695 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19696}
19697
19698static bool isLegalConversion(MVT VT, bool IsSigned,
19699 const X86Subtarget &Subtarget) {
19700 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19701 return true;
19702 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19703 return true;
19704 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19705 return true;
19706 if (Subtarget.useAVX512Regs()) {
19707 if (VT == MVT::v16i32)
19708 return true;
19709 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19710 return true;
19711 }
19712 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19713 (VT == MVT::v2i64 || VT == MVT::v4i64))
19714 return true;
19715 return false;
19716}
19717
19718SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19719 SelectionDAG &DAG) const {
19720 bool IsStrict = Op->isStrictFPOpcode();
19721 unsigned OpNo = IsStrict ? 1 : 0;
19722 SDValue Src = Op.getOperand(OpNo);
19723 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19724 MVT SrcVT = Src.getSimpleValueType();
19725 MVT VT = Op.getSimpleValueType();
19726 SDLoc dl(Op);
19727
19728 if (isSoftF16(VT, Subtarget))
19729 return promoteXINT_TO_FP(Op, dl, DAG);
19730 else if (isLegalConversion(SrcVT, true, Subtarget))
19731 return Op;
19732
19733 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19734 return LowerWin64_INT128_TO_FP(Op, DAG);
19735
19736 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19737 return Extract;
19738
19739 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19740 return R;
19741
19742 if (SrcVT.isVector()) {
19743 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19744 // Note: since v2f64 is a legal type, we don't need to zero-extend the
19745 // source for strict FP.
19746 if (IsStrict)
19747 return DAG.getNode(
19748 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19749 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19750 DAG.getUNDEF(SrcVT))});
19751 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19752 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19753 DAG.getUNDEF(SrcVT)));
19754 }
19755 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19756 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19757
19758 return SDValue();
19759 }
19760
19761 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19762 "Unknown SINT_TO_FP to lower!");
19763
19764 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19765
19766 // These are really Legal; return the operand so the caller accepts it as
19767 // Legal.
19768 if (SrcVT == MVT::i32 && UseSSEReg)
19769 return Op;
19770 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19771 return Op;
19772
19773 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19774 return V;
19775 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19776 return V;
19777
19778 // SSE doesn't have an i16 conversion so we need to promote.
19779 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19780 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19781 if (IsStrict)
19782 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19783 {Chain, Ext});
19784
19785 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19786 }
19787
19788 if (VT == MVT::f128 || !Subtarget.hasX87())
19789 return SDValue();
19790
19791 SDValue ValueToStore = Src;
19792 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19793 // Bitcasting to f64 here allows us to do a single 64-bit store from
19794 // an SSE register, avoiding the store forwarding penalty that would come
19795 // with two 32-bit stores.
19796 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19797
19798 unsigned Size = SrcVT.getStoreSize();
19799 Align Alignment(Size);
19801 auto PtrVT = getPointerTy(MF.getDataLayout());
19802 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19803 MachinePointerInfo MPI =
19805 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19806 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19807 std::pair<SDValue, SDValue> Tmp =
19808 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19809
19810 if (IsStrict)
19811 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19812
19813 return Tmp.first;
19814}
19815
19816std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19817 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19818 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19819 // Build the FILD
19820 SDVTList Tys;
19821 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19822 if (useSSE)
19823 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19824 else
19825 Tys = DAG.getVTList(DstVT, MVT::Other);
19826
19827 SDValue FILDOps[] = {Chain, Pointer};
19828 SDValue Result =
19829 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19830 Alignment, MachineMemOperand::MOLoad);
19831 Chain = Result.getValue(1);
19832
19833 if (useSSE) {
19835 unsigned SSFISize = DstVT.getStoreSize();
19836 int SSFI =
19837 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19838 auto PtrVT = getPointerTy(MF.getDataLayout());
19839 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19840 Tys = DAG.getVTList(MVT::Other);
19841 SDValue FSTOps[] = {Chain, Result, StackSlot};
19844 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19845
19846 Chain =
19847 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19848 Result = DAG.getLoad(
19849 DstVT, DL, Chain, StackSlot,
19851 Chain = Result.getValue(1);
19852 }
19853
19854 return { Result, Chain };
19855}
19856
19857/// Horizontal vector math instructions may be slower than normal math with
19858/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19859/// implementation, and likely shuffle complexity of the alternate sequence.
19860static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19861 const X86Subtarget &Subtarget) {
19862 bool IsOptimizingSize = DAG.shouldOptForSize();
19863 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19864 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19865}
19866
19867/// 64-bit unsigned integer to double expansion.
19869 SelectionDAG &DAG,
19870 const X86Subtarget &Subtarget) {
19871 // We can't use this algorithm for strict FP. It produces -0.0 instead of +0.0
19872 // when converting 0 while rounding toward negative infinity. The caller will
19873 // fall back to Expand (when i64 is legal) or use FILD in 32-bit mode.
19874 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19875 // This algorithm is not obvious. Here is what we're trying to output:
19876 /*
19877 movq %rax, %xmm0
19878 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19879 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19880 #ifdef __SSE3__
19881 haddpd %xmm0, %xmm0
19882 #else
19883 pshufd $0x4e, %xmm0, %xmm1
19884 addpd %xmm1, %xmm0
19885 #endif
19886 */
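// In other words, for x == hi * 2^32 + lo the punpckldq builds the doubles
// (2^52 + lo) and (2^84 + hi * 2^32) exactly, the subpd removes the 2^52 and
// 2^84 biases, and the final horizontal add yields lo + hi * 2^32 == x with a
// single rounding to double.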
19887
19888 LLVMContext *Context = DAG.getContext();
19889
19890 // Build some magic constants.
19891 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19892 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19893 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19894 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19895
19897 CV1.push_back(
19898 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19899 APInt(64, 0x4330000000000000ULL))));
19900 CV1.push_back(
19901 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19902 APInt(64, 0x4530000000000000ULL))));
19903 Constant *C1 = ConstantVector::get(CV1);
19904 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19905
19906 // Load the 64-bit value into an XMM register.
19907 SDValue XR1 =
19908 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19909 SDValue CLod0 = DAG.getLoad(
19910 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19912 SDValue Unpck1 =
19913 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19914
19915 SDValue CLod1 = DAG.getLoad(
19916 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19918 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19919 // TODO: Are there any fast-math-flags to propagate here?
19920 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19921 SDValue Result;
19922
19923 if (Subtarget.hasSSE3() &&
19924 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19925 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19926 } else {
19927 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19928 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19929 }
19930 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19931 DAG.getVectorIdxConstant(0, dl));
19932 return Result;
19933}
19934
19935/// 32-bit unsigned integer to float expansion.
19937 SelectionDAG &DAG,
19938 const X86Subtarget &Subtarget) {
19939 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19940 // FP constant to bias-correct the final result.
19941 SDValue Bias = DAG.getConstantFP(
19942 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
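// 0x4330000000000000 is the double 2^52, whose mantissa bits are all zero, so
// ORing a 32-bit value into the low bits yields exactly 2^52 + value;
// subtracting the bias afterwards recovers the value as a double.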
19943
19944 // Load the 32-bit value into an XMM register.
19945 SDValue Load =
19946 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19947
19948 // Zero out the upper parts of the register.
19949 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19950
19951 // Or the load with the bias.
19952 SDValue Or = DAG.getNode(
19953 ISD::OR, dl, MVT::v2i64,
19954 DAG.getBitcast(MVT::v2i64, Load),
19955 DAG.getBitcast(MVT::v2i64,
19956 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19957 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19958 DAG.getBitcast(MVT::v2f64, Or),
19959 DAG.getVectorIdxConstant(0, dl));
19960
19961 if (Op.getNode()->isStrictFPOpcode()) {
19962 // Subtract the bias.
19963 // TODO: Are there any fast-math-flags to propagate here?
19964 SDValue Chain = Op.getOperand(0);
19965 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19966 {Chain, Or, Bias});
19967
19968 if (Op.getValueType() == Sub.getValueType())
19969 return Sub;
19970
19971 // Handle final rounding.
19972 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19973 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19974
19975 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19976 }
19977
19978 // Subtract the bias.
19979 // TODO: Are there any fast-math-flags to propagate here?
19980 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19981
19982 // Handle final rounding.
19983 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19984}
19985
19987 SelectionDAG &DAG,
19988 const X86Subtarget &Subtarget) {
19989 if (Op.getSimpleValueType() != MVT::v2f64)
19990 return SDValue();
19991
19992 bool IsStrict = Op->isStrictFPOpcode();
19993
19994 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19995 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19996
19997 if (Subtarget.hasAVX512()) {
19998 if (!Subtarget.hasVLX()) {
19999 // Let generic type legalization widen this.
20000 if (!IsStrict)
20001 return SDValue();
20002 // Otherwise pad the integer input with 0s and widen the operation.
20003 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20004 DAG.getConstant(0, DL, MVT::v2i32));
20005 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20006 {Op.getOperand(0), N0});
20007 SDValue Chain = Res.getValue(1);
20008 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20009 DAG.getVectorIdxConstant(0, DL));
20010 return DAG.getMergeValues({Res, Chain}, DL);
20011 }
20012
20013 // Legalize to v4i32 type.
20014 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20015 DAG.getUNDEF(MVT::v2i32));
20016 if (IsStrict)
20017 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20018 {Op.getOperand(0), N0});
20019 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20020 }
20021
20022 // Zero-extend to v2i64, then OR with the floating-point representation of 2^52.
20023 // This gives us the floating-point equivalent of 2^52 + the i32 integer,
20024 // since double has 52 bits of mantissa. Then subtract 2^52 in floating
20025 // point, leaving just our i32 integers in double format.
20026 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20027 SDValue VBias = DAG.getConstantFP(
20028 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20029 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20030 DAG.getBitcast(MVT::v2i64, VBias));
20031 Or = DAG.getBitcast(MVT::v2f64, Or);
20032
20033 if (IsStrict)
20034 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20035 {Op.getOperand(0), Or, VBias});
20036 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20037}
20038
20040 SelectionDAG &DAG,
20041 const X86Subtarget &Subtarget) {
20042 bool IsStrict = Op->isStrictFPOpcode();
20043 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20044 MVT VecIntVT = V.getSimpleValueType();
20045 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20046 "Unsupported custom type");
20047
20048 if (Subtarget.hasAVX512()) {
20049 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20050 assert(!Subtarget.hasVLX() && "Unexpected features");
20051 MVT VT = Op->getSimpleValueType(0);
20052
20053 // v8i32->v8f64 is legal with AVX512 so just return it.
20054 if (VT == MVT::v8f64)
20055 return Op;
20056
20057 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20058 "Unexpected VT!");
20059 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20060 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20061 // Need to concat with zero vector for strict fp to avoid spurious
20062 // exceptions.
20063 SDValue Tmp =
20064 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20065 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20066 DAG.getVectorIdxConstant(0, DL));
20067 SDValue Res, Chain;
20068 if (IsStrict) {
20069 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20070 {Op->getOperand(0), V});
20071 Chain = Res.getValue(1);
20072 } else {
20073 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20074 }
20075
20076 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20077 DAG.getVectorIdxConstant(0, DL));
20078
20079 if (IsStrict)
20080 return DAG.getMergeValues({Res, Chain}, DL);
20081 return Res;
20082 }
20083
20084 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20085 Op->getSimpleValueType(0) == MVT::v4f64) {
20086 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20087 Constant *Bias = ConstantFP::get(
20088 *DAG.getContext(),
20089 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20090 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20091 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20092 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20093 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20094 SDValue VBias = DAG.getMemIntrinsicNode(
20095 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20098
20099 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20100 DAG.getBitcast(MVT::v4i64, VBias));
20101 Or = DAG.getBitcast(MVT::v4f64, Or);
20102
20103 if (IsStrict)
20104 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20105 {Op.getOperand(0), Or, VBias});
20106 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20107 }
20108
20109 // The algorithm is the following:
20110 // #ifdef __SSE4_1__
20111 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20112 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20113 // (uint4) 0x53000000, 0xaa);
20114 // #else
20115 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20116 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20117 // #endif
20118 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20119 // return (float4) lo + fhi;
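// Why this works: 0x4b000000 is 2^23 and 0x53000000 is 2^39 as floats, so
// lo == 2^23 + (v & 0xffff) and hi == 2^39 + (v >> 16) * 2^16, both exact.
// Subtracting (2^39 + 2^23) from hi and then adding lo gives
// (v & 0xffff) + (v >> 16) * 2^16 == v, with a single rounding at the end.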
20120
20121 bool Is128 = VecIntVT == MVT::v4i32;
20122 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20123 // If we convert to something other than the supported type, e.g., to v4f64,
20124 // abort early.
20125 if (VecFloatVT != Op->getSimpleValueType(0))
20126 return SDValue();
20127
20128 // In the #ifdef/#else code, we have in common:
20129 // - The vector of constants:
20130 // -- 0x4b000000
20131 // -- 0x53000000
20132 // - A shift:
20133 // -- v >> 16
20134
20135 // Create the splat vector for 0x4b000000.
20136 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20137 // Create the splat vector for 0x53000000.
20138 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20139
20140 // Create the right shift.
20141 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20142 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20143
20144 SDValue Low, High;
20145 if (Subtarget.hasSSE41()) {
20146 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20147 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20148 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20149 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20150 // Low will be bitcasted right away, so do not bother bitcasting back to its
20151 // original type.
20152 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20153 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20154 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20155 // (uint4) 0x53000000, 0xaa);
20156 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20157 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20158 // High will be bitcasted right away, so do not bother bitcasting back to
20159 // its original type.
20160 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20161 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20162 } else {
20163 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20164 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20165 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20166 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20167
20168 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20169 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20170 }
20171
20172 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20173 SDValue VecCstFSub = DAG.getConstantFP(
20174 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20175
20176 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20177 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20178 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20179 // enabled. See PR24512.
20180 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20181 // TODO: Are there any fast-math-flags to propagate here?
20182 // (float4) lo;
20183 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20184 // return (float4) lo + fhi;
20185 if (IsStrict) {
20186 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20187 {Op.getOperand(0), HighBitcast, VecCstFSub});
20188 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20189 {FHigh.getValue(1), LowBitcast, FHigh});
20190 }
20191
20192 SDValue FHigh =
20193 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20194 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20195}
20196
20198 const X86Subtarget &Subtarget) {
20199 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20200 SDValue N0 = Op.getOperand(OpNo);
20201 MVT SrcVT = N0.getSimpleValueType();
20202
20203 switch (SrcVT.SimpleTy) {
20204 default:
20205 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20206 case MVT::v2i32:
20207 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20208 case MVT::v4i32:
20209 case MVT::v8i32:
20210 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20211 case MVT::v2i64:
20212 case MVT::v4i64:
20213 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20214 }
20215}
20216
20217SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20218 SelectionDAG &DAG) const {
20219 bool IsStrict = Op->isStrictFPOpcode();
20220 unsigned OpNo = IsStrict ? 1 : 0;
20221 SDValue Src = Op.getOperand(OpNo);
20222 SDLoc dl(Op);
20223 auto PtrVT = getPointerTy(DAG.getDataLayout());
20224 MVT SrcVT = Src.getSimpleValueType();
20225 MVT DstVT = Op->getSimpleValueType(0);
20226 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20227
20228 // Bail out when we don't have native conversion instructions.
20229 if (DstVT == MVT::f128)
20230 return SDValue();
20231
20232 if (isSoftF16(DstVT, Subtarget))
20233 return promoteXINT_TO_FP(Op, dl, DAG);
20234 else if (isLegalConversion(SrcVT, false, Subtarget))
20235 return Op;
20236
20237 if (DstVT.isVector())
20238 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20239
20240 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20241 return LowerWin64_INT128_TO_FP(Op, DAG);
20242
20243 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20244 return Extract;
20245
20246 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20247 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20248 // Conversions from unsigned i32 to f32/f64 are legal,
20249 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20250 return Op;
20251 }
20252
20253 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20254 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20255 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20256 if (IsStrict)
20257 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20258 {Chain, Src});
20259 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20260 }
20261
20262 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20263 return V;
20264 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20265 return V;
20266
20267 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20268 // infinity. It produces -0.0, so disable under strictfp.
20269 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20270 !IsStrict)
20271 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20272 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20273 // negative infinity. So disable under strictfp and use FILD instead.
20274 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20275 !IsStrict)
20276 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20277 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20278 (DstVT == MVT::f32 || DstVT == MVT::f64))
20279 return SDValue();
20280
20281 // Make a 64-bit buffer, and use it to build an FILD.
20282 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20283 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20284 Align SlotAlign(8);
20285 MachinePointerInfo MPI =
20286 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20287 if (SrcVT == MVT::i32) {
20288 SDValue OffsetSlot =
20289 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20290 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20291 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20292 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20293 std::pair<SDValue, SDValue> Tmp =
20294 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20295 if (IsStrict)
20296 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20297
20298 return Tmp.first;
20299 }
20300
20301 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20302 SDValue ValueToStore = Src;
20303 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20304 // Bitcasting to f64 here allows us to do a single 64-bit store from
20305 // an SSE register, avoiding the store forwarding penalty that would come
20306 // with two 32-bit stores.
20307 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20308 }
20309 SDValue Store =
20310 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20311 // For i64 source, we need to add the appropriate power of 2 if the input
20312 // was negative. We must be careful to do the computation in x87 extended
20313 // precision, not in SSE.
20314 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20315 SDValue Ops[] = {Store, StackSlot};
20316 SDValue Fild =
20317 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20318 SlotAlign, MachineMemOperand::MOLoad);
20319 Chain = Fild.getValue(1);
20320
20321 // Check whether the sign bit is set.
20322 SDValue SignSet = DAG.getSetCC(
20323 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20324 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20325
20326 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20327 APInt FF(64, 0x5F80000000000000ULL);
20328 SDValue FudgePtr =
20329 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20330 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20331
20332 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20333 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20334 SDValue Four = DAG.getIntPtrConstant(4, dl);
20335 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20336 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20337
20338 // Load the value out, extending it from f32 to f80.
20339 SDValue Fudge = DAG.getExtLoad(
20340 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20341 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20342 CPAlignment);
20343 Chain = Fudge.getValue(1);
20344 // Extend everything to 80 bits to force it to be done on x87.
20345 // TODO: Are there any fast-math-flags to propagate here?
20346 if (IsStrict) {
20347 unsigned Opc = ISD::STRICT_FADD;
20348 // Windows needs the precision control changed to 80bits around this add.
20349 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20350 Opc = X86ISD::STRICT_FP80_ADD;
20351
20352 SDValue Add =
20353 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20354 // STRICT_FP_ROUND can't handle equal types.
20355 if (DstVT == MVT::f80)
20356 return Add;
20357 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20358 {Add.getValue(1), Add,
20359 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20360 }
20361 unsigned Opc = ISD::FADD;
20362 // Windows needs the precision control changed to 80bits around this add.
20363 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20364 Opc = X86ISD::FP80_ADD;
20365
20366 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20367 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20368 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20369}
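// Editor's illustrative sketch (not part of the original lowering): the FILD
// plus fudge-factor path above in scalar form. FILD reads the 64-bit slot as
// signed, so when the unsigned source had its top bit set the loaded value is
// V - 2^64; adding the constant-pool fudge (0x5F800000 == 2^64 as f32,
// extended to f80) restores the unsigned value before the final rounding.
static long double Uint64ToFP80Sketch(uint64_t V) {
  long double Fild = (long double)(int64_t)V;              // what FILD yields
  long double Fudge = (V >> 63) ? 18446744073709551616.0L  // 2^64
                                : 0.0L;
  return Fild + Fudge;                                     // == (long double)V
}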
20370
20371// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20372// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20373// just return an SDValue().
20374// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20375// to i16, i32 or i64, and we lower it to a legal sequence and return the
20376// result.
20377SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20378 bool IsSigned,
20379 SDValue &Chain) const {
20380 bool IsStrict = Op->isStrictFPOpcode();
20381 SDLoc DL(Op);
20382
20383 EVT DstTy = Op.getValueType();
20384 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20385 EVT TheVT = Value.getValueType();
20386 auto PtrVT = getPointerTy(DAG.getDataLayout());
20387
20388 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20389 // f16 must be promoted before using the lowering in this routine.
20390 // fp128 does not use this lowering.
20391 return SDValue();
20392 }
20393
20394 // If using FIST to compute an unsigned i64, we'll need some fixup
20395 // to handle values above the maximum signed i64. A FIST is always
20396 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20397 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20398
20399 // FIXME: This does not generate an invalid exception if the input does not
20400 // fit in i32. PR44019
20401 if (!IsSigned && DstTy != MVT::i64) {
20402 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20403 // The low 32 bits of the fist result will have the correct uint32 result.
20404 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20405 DstTy = MVT::i64;
20406 }
20407
20408 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20409 DstTy.getSimpleVT() >= MVT::i16 &&
20410 "Unknown FP_TO_INT to lower!");
20411
20412 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20413 // stack slot.
20414 MachineFunction &MF = DAG.getMachineFunction();
20415 unsigned MemSize = DstTy.getStoreSize();
20416 int SSFI =
20417 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20418 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20419
20420 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20421
20422 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20423
20424 if (UnsignedFixup) {
20425 //
20426 // Conversion to unsigned i64 is implemented with a select,
20427 // depending on whether the source value fits in the range
20428 // of a signed i64. Let Thresh be the FP equivalent of
20429 // 0x8000000000000000ULL.
20430 //
20431 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20432 // FltOfs = (Value >= Thresh) ? Thresh : 0;
20433 // FistSrc = (Value - FltOfs);
20434 // Fist-to-mem64 FistSrc
20435 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20436 // to XOR'ing the high 32 bits with Adjust.
20437 //
20438 // Being a power of 2, Thresh is exactly representable in all FP formats.
20439 // For X87 we'd like to use the smallest FP type for this constant, but
20440 // for DAG type consistency we have to match the FP operand type.
20441
20442 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20443 APFloat::opStatus Status = APFloat::opOK;
20444 bool LosesInfo = false;
20445 if (TheVT == MVT::f64)
20446 // The rounding mode is irrelevant as the conversion should be exact.
20447 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20448 &LosesInfo);
20449 else if (TheVT == MVT::f80)
20450 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20451 APFloat::rmNearestTiesToEven, &LosesInfo);
20452
20453 assert(Status == APFloat::opOK && !LosesInfo &&
20454 "FP conversion should have been exact");
20455
20456 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20457
20458 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20459 *DAG.getContext(), TheVT);
20460 SDValue Cmp;
20461 if (IsStrict) {
20462 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20463 /*IsSignaling*/ true);
20464 Chain = Cmp.getValue(1);
20465 } else {
20466 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20467 }
20468
20469 // Our preferred lowering of
20470 //
20471 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20472 //
20473 // is
20474 //
20475 // (Value >= Thresh) << 63
20476 //
20477 // but since we can get here after LegalOperations, DAGCombine might do the
20478 // wrong thing if we create a select. So, directly create the preferred
20479 // version.
20480 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20481 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20482 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20483
20484 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20485 DAG.getConstantFP(0.0, DL, TheVT));
20486
20487 if (IsStrict) {
20488 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20489 { Chain, Value, FltOfs });
20490 Chain = Value.getValue(1);
20491 } else
20492 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20493 }
20494
20495 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20496
20497 // FIXME This causes a redundant load/store if the SSE-class value is already
20498 // in memory, such as if it is on the callstack.
20499 if (isScalarFPTypeInSSEReg(TheVT)) {
20500 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20501 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20502 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20503 SDValue Ops[] = { Chain, StackSlot };
20504
20505 unsigned FLDSize = TheVT.getStoreSize();
20506 assert(FLDSize <= MemSize && "Stack slot not big enough");
20507 MachineMemOperand *MMO = MF.getMachineMemOperand(
20508 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20509 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20510 Chain = Value.getValue(1);
20511 }
20512
20513 // Build the FP_TO_INT*_IN_MEM
20514 MachineMemOperand *MMO = MF.getMachineMemOperand(
20515 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20516 SDValue Ops[] = { Chain, Value, StackSlot };
20517 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20518 DAG.getVTList(MVT::Other),
20519 Ops, DstTy, MMO);
20520
20521 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20522 Chain = Res.getValue(1);
20523
20524 // If we need an unsigned fixup, XOR the result with adjust.
20525 if (UnsignedFixup)
20526 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20527
20528 return Res;
20529}
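// Editor's illustrative sketch (not part of the original lowering): the
// unsigned-i64 fixup above in scalar form. Values at or above 2^63 are shifted
// into signed range before the FIST store, and the sign bit of the result is
// then flipped back by XOR'ing with Adjust.
static uint64_t FPToUint64Sketch(double Value) {
  const double Thresh = 9223372036854775808.0;  // 2^63, exactly representable
  uint64_t Adjust = Value >= Thresh ? 0x8000000000000000ULL : 0;
  double FltOfs = Value >= Thresh ? Thresh : 0.0;
  int64_t Fist = (int64_t)(Value - FltOfs);     // stands in for FIST-to-mem64
  return (uint64_t)Fist ^ Adjust;
}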
20530
20531static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20532 const X86Subtarget &Subtarget) {
20533 MVT VT = Op.getSimpleValueType();
20534 SDValue In = Op.getOperand(0);
20535 MVT InVT = In.getSimpleValueType();
20536 unsigned Opc = Op.getOpcode();
20537
20538 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20539 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20540 "Unexpected extension opcode");
20541 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20542 "Expected same number of elements");
20543 assert((VT.getVectorElementType() == MVT::i16 ||
20544 VT.getVectorElementType() == MVT::i32 ||
20545 VT.getVectorElementType() == MVT::i64) &&
20546 "Unexpected element type");
20547 assert((InVT.getVectorElementType() == MVT::i8 ||
20548 InVT.getVectorElementType() == MVT::i16 ||
20549 InVT.getVectorElementType() == MVT::i32) &&
20550 "Unexpected element type");
20551
20552 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20553
20554 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20555 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20556 return splitVectorIntUnary(Op, DAG, dl);
20557 }
20558
20559 if (Subtarget.hasInt256())
20560 return Op;
20561
20562 // Optimize vectors in AVX mode:
20563 //
20564 // v8i16 -> v8i32
20565 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20566 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20567 // Concat upper and lower parts.
20568 //
20569 // v4i32 -> v4i64
20570 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20571 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20572 // Concat upper and lower parts.
20573 //
20574 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20575 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20576
20577 // Short-circuit if we can determine that each 128-bit half is the same value.
20578 // Otherwise, this is difficult to match and optimize.
20579 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20580 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20581 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20582
20583 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20584 SDValue Undef = DAG.getUNDEF(InVT);
20585 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20586 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20587 OpHi = DAG.getBitcast(HalfVT, OpHi);
20588
20589 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20590}
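// Editor's worked example for the AVX1 path above (illustration only), for a
// zero-extension of v8i16 <a..h> to v8i32:
//   OpLo = zero_extend_vector_inreg(<a..h>)  --> <a, b, c, d> as i32
//   OpHi = vpunpckhwd(<a..h>, <0,...,0>)     --> <e,0, f,0, g,0, h,0> as i16
//   bitcast OpHi to v4i32                    --> <e, f, g, h> as i32
//   concat(OpLo, OpHi)                       --> <a, b, c, d, e, f, g, h> as i32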
20591
20592// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20593static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20594 const SDLoc &dl, SelectionDAG &DAG) {
20595 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20596 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20597 DAG.getVectorIdxConstant(0, dl));
20598 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20599 DAG.getVectorIdxConstant(8, dl));
20600 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20601 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20602 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20603 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20604}
20605
20606static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20607 const X86Subtarget &Subtarget,
20608 SelectionDAG &DAG) {
20609 MVT VT = Op->getSimpleValueType(0);
20610 SDValue In = Op->getOperand(0);
20611 MVT InVT = In.getSimpleValueType();
20612 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20613 unsigned NumElts = VT.getVectorNumElements();
20614
20615 // For all vectors but vXi8 we can just emit a sign_extend and a shift. This
20616 // avoids a constant pool load.
20617 if (VT.getVectorElementType() != MVT::i8) {
20618 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20619 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20620 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20621 }
20622
20623 // Extend VT if BWI is not supported.
20624 MVT ExtVT = VT;
20625 if (!Subtarget.hasBWI()) {
20626 // If v16i32 is to be avoided, we'll need to split and concatenate.
20627 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20628 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20629
20630 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20631 }
20632
20633 // Widen to 512-bits if VLX is not supported.
20634 MVT WideVT = ExtVT;
20635 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20636 NumElts *= 512 / ExtVT.getSizeInBits();
20637 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20638 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
20639 DAG.getVectorIdxConstant(0, DL));
20640 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
20641 }
20642
20643 SDValue One = DAG.getConstant(1, DL, WideVT);
20644 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20645
20646 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20647
20648 // Truncate if we had to extend above.
20649 if (VT != ExtVT) {
20650 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20651 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20652 }
20653
20654 // Extract back to 128/256-bit if we widened.
20655 if (WideVT != VT)
20656 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20657 DAG.getVectorIdxConstant(0, DL));
20658
20659 return SelectedVal;
20660}
20661
20662static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20663 SelectionDAG &DAG) {
20664 SDValue In = Op.getOperand(0);
20665 MVT SVT = In.getSimpleValueType();
20666 SDLoc DL(Op);
20667
20668 if (SVT.getVectorElementType() == MVT::i1)
20669 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
20670
20671 assert(Subtarget.hasAVX() && "Expected AVX support");
20672 return LowerAVXExtend(Op, DL, DAG, Subtarget);
20673}
20674
20675/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20676/// It makes use of the fact that vectors with enough leading sign/zero bits
20677/// prevent the PACKSS/PACKUS from saturating the results.
20678/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20679/// within each 128-bit lane.
20680static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20681 const SDLoc &DL, SelectionDAG &DAG,
20682 const X86Subtarget &Subtarget) {
20683 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20684 "Unexpected PACK opcode");
20685 assert(DstVT.isVector() && "VT not a vector?");
20686
20687 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20688 if (!Subtarget.hasSSE2())
20689 return SDValue();
20690
20691 EVT SrcVT = In.getValueType();
20692
20693 // No truncation required, we might get here due to recursive calls.
20694 if (SrcVT == DstVT)
20695 return In;
20696
20697 unsigned NumElems = SrcVT.getVectorNumElements();
20698 if (NumElems < 2 || !isPowerOf2_32(NumElems))
20699 return SDValue();
20700
20701 unsigned DstSizeInBits = DstVT.getSizeInBits();
20702 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20703 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20704 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20705
20706 LLVMContext &Ctx = *DAG.getContext();
20707 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20708 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20709
20710 // Pack to the largest type possible:
20711 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20712 EVT InVT = MVT::i16, OutVT = MVT::i8;
20713 if (SrcVT.getScalarSizeInBits() > 16 &&
20714 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20715 InVT = MVT::i32;
20716 OutVT = MVT::i16;
20717 }
20718
20719 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20720 // On pre-AVX512, pack the src in both halves to help value tracking.
20721 if (SrcSizeInBits <= 128) {
20722 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20723 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20724 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20725 SDValue LHS = DAG.getBitcast(InVT, In);
20726 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20727 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20728 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20729 Res = DAG.getBitcast(PackedVT, Res);
20730 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20731 }
20732
20733 // Split lower/upper subvectors.
20734 SDValue Lo, Hi;
20735 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20736
20737 // If Hi is undef, then don't bother packing it and widen the result instead.
20738 if (Hi.isUndef()) {
20739 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20740 if (SDValue Res =
20741 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20742 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20743 }
20744
20745 unsigned SubSizeInBits = SrcSizeInBits / 2;
20746 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20747 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20748
20749 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20750 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20751 Lo = DAG.getBitcast(InVT, Lo);
20752 Hi = DAG.getBitcast(InVT, Hi);
20753 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20754 return DAG.getBitcast(DstVT, Res);
20755 }
20756
20757 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20758 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20759 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20760 Lo = DAG.getBitcast(InVT, Lo);
20761 Hi = DAG.getBitcast(InVT, Hi);
20762 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20763
20764 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20765 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20766 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20767 SmallVector<int, 64> Mask;
20768 int Scale = 64 / OutVT.getScalarSizeInBits();
20769 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20770 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20771
20772 if (DstVT.is256BitVector())
20773 return DAG.getBitcast(DstVT, Res);
20774
20775 // If 512bit -> 128bit truncate another stage.
20776 Res = DAG.getBitcast(PackedVT, Res);
20777 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20778 }
20779
20780 // Recursively pack lower/upper subvectors, concat result and pack again.
20781 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20782
20783 if (PackedVT.is128BitVector()) {
20784 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20785 // type legalization.
20786 SDValue Res =
20787 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20788 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20789 }
20790
20791 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20792 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20793 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20794 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20795 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20796}
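// Editor's worked example for the AVX2 512-bit to 256-bit step above
// (illustration only), truncating v16i32 <a0..a15> to v16i16:
//   Lo = <a0..a7>, Hi = <a8..a15>
//   vpackssdw(Lo, Hi) packs per 128-bit lane, leaving 64-bit chunks
//   [a0..a3 | a8..a11 | a4..a7 | a12..a15]
//   the {0, 2, 1, 3} shuffle (scaled to the element width) restores
//   [a0..a3 | a4..a7 | a8..a11 | a12..a15] = <a0..a15> as i16.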
20797
20798/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20799/// e.g. trunc <8 x i32> X to <8 x i16> -->
20800/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20801/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20802static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20803 const X86Subtarget &Subtarget,
20804 SelectionDAG &DAG) {
20805 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20806 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20807}
20808
20809/// Truncate using inreg sign extension and X86ISD::PACKSS.
20810static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20811 const X86Subtarget &Subtarget,
20812 SelectionDAG &DAG) {
20813 EVT SrcVT = In.getValueType();
20814 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20815 DAG.getValueType(DstVT));
20816 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20817}
20818
20819/// Helper to determine if \p In truncated to \p DstVT has the necessary
20820/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20821/// possibly by converting a SRL node to SRA for sign extension.
20822static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20823 SDValue In, const SDLoc &DL,
20824 SelectionDAG &DAG,
20825 const X86Subtarget &Subtarget,
20826 const SDNodeFlags Flags = SDNodeFlags()) {
20827 // Requires SSE2.
20828 if (!Subtarget.hasSSE2())
20829 return SDValue();
20830
20831 EVT SrcVT = In.getValueType();
20832 EVT DstSVT = DstVT.getVectorElementType();
20833 EVT SrcSVT = SrcVT.getVectorElementType();
20834 unsigned NumDstEltBits = DstSVT.getSizeInBits();
20835 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
20836
20837 // Check we have a truncation suited for PACKSS/PACKUS.
20838 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20839 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20840 return SDValue();
20841
20842 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20843 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20844
20845 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20846 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20847 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20848 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20849 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20850 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20851 return SDValue();
20852
20853 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20854 // split this for packing.
20855 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20856 !isFreeToSplitVector(In.getNode(), DAG) &&
20857 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20858 return SDValue();
20859
20860 // Don't truncate AVX512 targets as multiple PACK node stages.
20861 if (Subtarget.hasAVX512() && NumStages > 1)
20862 return SDValue();
20863
20864 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20865 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20866
20867 // Truncate with PACKUS if we are truncating a vector with leading zero
20868 // bits that extend all the way to the packed/truncated value.
20869 // e.g. Masks, zext_in_reg, etc.
20870 // Pre-SSE41 we can only use PACKUSWB.
20871 KnownBits Known = DAG.computeKnownBits(In);
20872 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
20873 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20874 PackOpcode = X86ISD::PACKUS;
20875 return In;
20876 }
20877
20878 // Truncate with PACKSS if we are truncating a vector with sign-bits
20879 // that extend all the way to the packed/truncated value.
20880 // e.g. Comparison result, sext_in_reg, etc.
20881 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20882
20883 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20884 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20885 // see through BITCASTs later on and combines/simplifications can't then use
20886 // it.
20887 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20888 !Subtarget.hasAVX512())
20889 return SDValue();
20890
20891 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20892 if (Flags.hasNoSignedWrap() || MinSignBits < NumSignBits) {
20893 PackOpcode = X86ISD::PACKSS;
20894 return In;
20895 }
20896
20897 // If we have a srl that only generates signbits that we will discard in
20898 // the truncation then we can use PACKSS by converting the srl to a sra.
20899 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
20900 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20901 if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
20902 if (*ShAmt == MinSignBits) {
20903 PackOpcode = X86ISD::PACKSS;
20904 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20905 }
20906 }
20907
20908 return SDValue();
20909}
20910
20911/// This function lowers a vector truncation of 'extended sign-bits' or
20912/// 'extended zero-bits' values.
20913/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20914static SDValue LowerTruncateVecPackWithSignBits(
20915 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
20916 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
20917 MVT SrcVT = In.getSimpleValueType();
20918 MVT DstSVT = DstVT.getVectorElementType();
20919 MVT SrcSVT = SrcVT.getVectorElementType();
20920 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20921 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20922 return SDValue();
20923
20924 // If the upper half of the source is undef, then attempt to split and
20925 // only truncate the lower half.
20926 if (DstVT.getSizeInBits() >= 128) {
20927 SmallVector<SDValue> LowerOps;
20928 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20929 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20930 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20931 Subtarget, DAG))
20932 return widenSubVector(Res, false, Subtarget, DAG, DL,
20933 DstVT.getSizeInBits());
20934 }
20935 }
20936
20937 unsigned PackOpcode;
20938 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
20939 Subtarget, Flags))
20940 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20941
20942 return SDValue();
20943}
20944
20945/// This function lowers a vector truncation from vXi16/vXi32/vXi64 to vXi8/vXi16 into
20946/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20947static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20948 const X86Subtarget &Subtarget,
20949 SelectionDAG &DAG) {
20950 MVT SrcVT = In.getSimpleValueType();
20951 MVT DstSVT = DstVT.getVectorElementType();
20952 MVT SrcSVT = SrcVT.getVectorElementType();
20953 unsigned NumElems = DstVT.getVectorNumElements();
20954 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20955 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20956 NumElems >= 8))
20957 return SDValue();
20958
20959 // SSSE3's pshufb results in less instructions in the cases below.
20960 if (Subtarget.hasSSSE3() && NumElems == 8) {
20961 if (SrcSVT == MVT::i16)
20962 return SDValue();
20963 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20964 return SDValue();
20965 }
20966
20967 // If the upper half of the source is undef, then attempt to split and
20968 // only truncate the lower half.
20969 if (DstVT.getSizeInBits() >= 128) {
20970 SmallVector<SDValue> LowerOps;
20971 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20972 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20973 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20974 return widenSubVector(Res, false, Subtarget, DAG, DL,
20975 DstVT.getSizeInBits());
20976 }
20977 }
20978
20979 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20980 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20981 // truncate 2 x v4i32 to v8i16.
20982 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20983 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20984
20985 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20986 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20987
20988 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20989 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20990 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20991 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20992 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20993 }
20994
20995 return SDValue();
20996}
20997
20998static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
20999 SelectionDAG &DAG,
21000 const X86Subtarget &Subtarget) {
21001 MVT VT = Op.getSimpleValueType();
21002 SDValue In = Op.getOperand(0);
21003 MVT InVT = In.getSimpleValueType();
21004 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21005
21006 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21007 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21008 if (InVT.getScalarSizeInBits() <= 16) {
21009 if (Subtarget.hasBWI()) {
21010 // legal, will go to VPMOVB2M, VPMOVW2M
21011 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21012 // We need to shift to get the lsb into sign position.
21013 // Shift packed bytes not supported natively, bitcast to word
21014 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21015 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21016 DAG.getBitcast(ExtVT, In),
21017 DAG.getConstant(ShiftInx, DL, ExtVT));
21018 In = DAG.getBitcast(InVT, In);
21019 }
21020 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21021 In, ISD::SETGT);
21022 }
21023 // Use TESTD/Q, extended vector to packed dword/qword.
21024 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21025 "Unexpected vector type.");
21026 unsigned NumElts = InVT.getVectorNumElements();
21027 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21028 // We need to change to a wider element type that we have support for.
21029 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21030 // For 16 element vectors we extend to v16i32 unless we are explicitly
21031 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21032 // we need to split into two 8 element vectors which we can extend to v8i32,
21033 // truncate and concat the results. There's an additional complication if
21034 // the original type is v16i8. In that case we can't split the v16i8
21035 // directly, so we need to shuffle high elements to low and use
21036 // sign_extend_vector_inreg.
21037 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21038 SDValue Lo, Hi;
21039 if (InVT == MVT::v16i8) {
21040 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21041 Hi = DAG.getVectorShuffle(
21042 InVT, DL, In, In,
21043 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21044 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21045 } else {
21046 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21047 Lo = extract128BitVector(In, 0, DAG, DL);
21048 Hi = extract128BitVector(In, 8, DAG, DL);
21049 }
21050 // We're split now, just emit two truncates and a concat. The two
21051 // truncates will trigger legalization to come back to this function.
21052 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21053 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21054 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21055 }
21056 // We either have 8 elements or we're allowed to use 512-bit vectors.
21057 // If we have VLX, we want to use the narrowest vector that can get the
21058 // job done so we use vXi32.
21059 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21060 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21061 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21062 InVT = ExtVT;
21063 ShiftInx = InVT.getScalarSizeInBits() - 1;
21064 }
21065
21066 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21067 // We need to shift to get the lsb into sign position.
21068 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21069 DAG.getConstant(ShiftInx, DL, InVT));
21070 }
21071 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21072 if (Subtarget.hasDQI())
21073 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21074 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21075}
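// Editor's illustrative sketch (not part of the original lowering): per
// element, the vXi1 truncation above only needs bit 0, so it is moved into the
// sign position and then tested with a signed compare (vpmovd2m/vptestmd
// style):
static bool TruncToI1Sketch(uint32_t X) {
  return (int32_t)(X << 31) < 0; // == (X & 1), matches the 0 > (X << 31) setcc
}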
21076
21077SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21078 SDLoc DL(Op);
21079 MVT VT = Op.getSimpleValueType();
21080 SDValue In = Op.getOperand(0);
21081 MVT InVT = In.getSimpleValueType();
21082 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21083 "Invalid TRUNCATE operation");
21084
21085 // If we're called by the type legalizer, handle a few cases.
21086 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21087 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21088 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21089 VT.is128BitVector() && Subtarget.hasAVX512()) {
21090 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21091 "Unexpected subtarget!");
21092 // The default behavior is to truncate one step, concatenate, and then
21093 // truncate the remainder. We'd rather produce two 64-bit results and
21094 // concatenate those.
21095 SDValue Lo, Hi;
21096 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21097
21098 EVT LoVT, HiVT;
21099 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21100
21101 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21102 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21103 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21104 }
21105
21106 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21107 if (!Subtarget.hasAVX512() ||
21108 (InVT.is512BitVector() && VT.is256BitVector()))
21109 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21110 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21111 return SignPack;
21112
21113 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21114 if (!Subtarget.hasAVX512())
21115 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21116
21117 // Otherwise let default legalization handle it.
21118 return SDValue();
21119 }
21120
21121 if (VT.getVectorElementType() == MVT::i1)
21122 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21123
21124 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21125 // concat from subvectors to use VPTRUNC etc.
21126 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
21127 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21128 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21129 return SignPack;
21130
21131 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21132 if (Subtarget.hasAVX512()) {
21133 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21134 assert(VT == MVT::v32i8 && "Unexpected VT!");
21135 return splitVectorIntUnary(Op, DAG, DL);
21136 }
21137
21138 // word to byte only under BWI. Otherwise we have to promote to v16i32
21139 // and then truncate that. But we should only do that if we haven't been
21140 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21141 // handled by isel patterns.
21142 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21143 Subtarget.canExtendTo512DQ())
21144 return Op;
21145 }
21146
21147 // Handle truncation of V256 to V128 using shuffles.
21148 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21149
21150 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21151 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21152 if (Subtarget.hasInt256()) {
21153 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21154 In = DAG.getBitcast(MVT::v8i32, In);
21155 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21156 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21157 DAG.getVectorIdxConstant(0, DL));
21158 }
21159
21160 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21161 DAG.getVectorIdxConstant(0, DL));
21162 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21163 DAG.getVectorIdxConstant(2, DL));
21164 static const int ShufMask[] = {0, 2, 4, 6};
21165 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21166 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21167 }
21168
21169 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21170 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21171 if (Subtarget.hasInt256()) {
21172 // The PSHUFB mask:
21173 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21174 -1, -1, -1, -1, -1, -1, -1, -1,
21175 16, 17, 20, 21, 24, 25, 28, 29,
21176 -1, -1, -1, -1, -1, -1, -1, -1 };
21177 In = DAG.getBitcast(MVT::v32i8, In);
21178 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21179 In = DAG.getBitcast(MVT::v4i64, In);
21180
21181 static const int ShufMask2[] = {0, 2, -1, -1};
21182 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21183 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21184 DAG.getVectorIdxConstant(0, DL));
21185 return DAG.getBitcast(MVT::v8i16, In);
21186 }
21187
21188 return Subtarget.hasSSE41()
21189 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21190 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21191 }
21192
21193 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21194 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21195
21196 llvm_unreachable("All 256->128 cases should have been handled above!");
21197}
21198
21199// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21200// behaves on out of range inputs to generate optimized conversions.
21201static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21202 SelectionDAG &DAG,
21203 const X86Subtarget &Subtarget) {
21204 MVT SrcVT = Src.getSimpleValueType();
21205 unsigned DstBits = VT.getScalarSizeInBits();
21206 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21207
21208 // Calculate the converted result for values in the range 0 to
21209 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21210 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21211 SDValue Big =
21212 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21213 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21214 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21215
21216 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21217 // and only if the value was out of range. So we can use that
21218 // as our indicator that we rather use "Big" instead of "Small".
21219 //
21220 // Use "Small" if "IsOverflown" has all bits cleared
21221 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21222
21223 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21224 // use the slightly slower blendv select instead.
21225 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21226 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21227 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21228 }
21229
21230 SDValue IsOverflown =
21231 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21232 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21233 return DAG.getNode(ISD::OR, dl, VT, Small,
21234 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21235}
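// Editor's illustrative sketch (not part of the original lowering): a scalar
// model of the cvttps2dq trick above, assuming the same out-of-range behaviour
// (cvttp2si yields 0x80000000 when the input does not fit in i32).
static uint32_t FPToUint32SSESketch(float Src) {
  int32_t Small = Src < 2147483648.0f ? (int32_t)Src
                                      : (int32_t)0x80000000; // models cvttp2si
  int32_t Big = (int32_t)(Src - 2147483648.0f);
  uint32_t Mask = Small < 0 ? 0xffffffffu : 0u;              // "IsOverflown"
  return (uint32_t)Small | ((uint32_t)Big & Mask);
}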
21236
21237SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21238 bool IsStrict = Op->isStrictFPOpcode();
21239 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21240 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21241 MVT VT = Op->getSimpleValueType(0);
21242 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21243 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21244 MVT SrcVT = Src.getSimpleValueType();
21245 SDLoc dl(Op);
21246
21247 SDValue Res;
21248 if (isSoftF16(SrcVT, Subtarget)) {
21249 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21250 if (IsStrict)
21251 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21252 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21253 {NVT, MVT::Other}, {Chain, Src})});
21254 return DAG.getNode(Op.getOpcode(), dl, VT,
21255 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21256 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
21257 return Op;
21258 }
21259
21260 if (VT.isVector()) {
21261 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21262 MVT ResVT = MVT::v4i32;
21263 MVT TruncVT = MVT::v4i1;
21264 unsigned Opc;
21265 if (IsStrict)
21266 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21267 else
21268 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21269
21270 if (!IsSigned && !Subtarget.hasVLX()) {
21271 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21272 // Widen to 512-bits.
21273 ResVT = MVT::v8i32;
21274 TruncVT = MVT::v8i1;
21275 Opc = Op.getOpcode();
21276 // Need to concat with zero vector for strict fp to avoid spurious
21277 // exceptions.
21278 // TODO: Should we just do this for non-strict as well?
21279 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21280 : DAG.getUNDEF(MVT::v8f64);
21281 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21282 DAG.getVectorIdxConstant(0, dl));
21283 }
21284 if (IsStrict) {
21285 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21286 Chain = Res.getValue(1);
21287 } else {
21288 Res = DAG.getNode(Opc, dl, ResVT, Src);
21289 }
21290
21291 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21292 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21293 DAG.getVectorIdxConstant(0, dl));
21294 if (IsStrict)
21295 return DAG.getMergeValues({Res, Chain}, dl);
21296 return Res;
21297 }
21298
21299 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21300 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
21301 return Op;
21302
21303 MVT ResVT = VT;
21304 MVT EleVT = VT.getVectorElementType();
21305 if (EleVT != MVT::i64)
21306 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21307
21308 if (SrcVT != MVT::v8f16) {
21309 SDValue Tmp =
21310 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21311 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21312 Ops[0] = Src;
21313 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21314 }
21315
21316 if (IsStrict) {
21317 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21318 : X86ISD::STRICT_CVTTP2UI,
21319 dl, {ResVT, MVT::Other}, {Chain, Src});
21320 Chain = Res.getValue(1);
21321 } else {
21322 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21323 ResVT, Src);
21324 }
21325
21326 // TODO: Need to add exception check code for strict FP.
21327 if (EleVT.getSizeInBits() < 16) {
21328 ResVT = MVT::getVectorVT(EleVT, 8);
21329 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21330 }
21331
21332 if (ResVT != VT)
21333 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21334 DAG.getVectorIdxConstant(0, dl));
21335
21336 if (IsStrict)
21337 return DAG.getMergeValues({Res, Chain}, dl);
21338 return Res;
21339 }
21340
21341 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21342 if (VT.getVectorElementType() == MVT::i16) {
21343 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21344 SrcVT.getVectorElementType() == MVT::f64) &&
21345 "Expected f32/f64 vector!");
21346 MVT NVT = VT.changeVectorElementType(MVT::i32);
21347 if (IsStrict) {
21348 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21349 : ISD::STRICT_FP_TO_UINT,
21350 dl, {NVT, MVT::Other}, {Chain, Src});
21351 Chain = Res.getValue(1);
21352 } else {
21353 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21354 NVT, Src);
21355 }
21356
21357 // TODO: Need to add exception check code for strict FP.
21358 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21359
21360 if (IsStrict)
21361 return DAG.getMergeValues({Res, Chain}, dl);
21362 return Res;
21363 }
21364
21365 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21366 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21367 assert(!IsSigned && "Expected unsigned conversion!");
21368 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21369 return Op;
21370 }
21371
21372 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21373 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21374 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21375 Subtarget.useAVX512Regs()) {
21376 assert(!IsSigned && "Expected unsigned conversion!");
21377 assert(!Subtarget.hasVLX() && "Unexpected features!");
21378 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21379 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21380 // Need to concat with zero vector for strict fp to avoid spurious
21381 // exceptions.
21382 // TODO: Should we just do this for non-strict as well?
21383 SDValue Tmp =
21384 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21385 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21386 DAG.getVectorIdxConstant(0, dl));
21387
21388 if (IsStrict) {
21389 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21390 {Chain, Src});
21391 Chain = Res.getValue(1);
21392 } else {
21393 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21394 }
21395
21396 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21397 DAG.getVectorIdxConstant(0, dl));
21398
21399 if (IsStrict)
21400 return DAG.getMergeValues({Res, Chain}, dl);
21401 return Res;
21402 }
21403
21404 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21405 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21406 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21407 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21408 assert(!Subtarget.hasVLX() && "Unexpected features!");
21409 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21410 // Need to concat with zero vector for strict fp to avoid spurious
21411 // exceptions.
21412 // TODO: Should we just do this for non-strict as well?
21413 SDValue Tmp =
21414 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21415 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21416 DAG.getVectorIdxConstant(0, dl));
21417
21418 if (IsStrict) {
21419 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21420 {Chain, Src});
21421 Chain = Res.getValue(1);
21422 } else {
21423 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21424 }
21425
21426 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21427 DAG.getVectorIdxConstant(0, dl));
21428
21429 if (IsStrict)
21430 return DAG.getMergeValues({Res, Chain}, dl);
21431 return Res;
21432 }
21433
21434 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21435 if (!Subtarget.hasVLX()) {
21436 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
21437 // legalizer and then widened again by vector op legalization.
21438 if (!IsStrict)
21439 return SDValue();
21440
21441 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21442 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21443 {Src, Zero, Zero, Zero});
21444 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21445 {Chain, Tmp});
21446 SDValue Chain = Tmp.getValue(1);
21447 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21448 DAG.getVectorIdxConstant(0, dl));
21449 return DAG.getMergeValues({Tmp, Chain}, dl);
21450 }
21451
21452 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21453 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21454 DAG.getUNDEF(MVT::v2f32));
21455 if (IsStrict) {
21456 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21457 : X86ISD::STRICT_CVTTP2UI;
21458 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21459 }
21460 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21461 return DAG.getNode(Opc, dl, VT, Tmp);
21462 }
21463
21464 // Generate optimized instructions for pre AVX512 unsigned conversions from
21465 // vXf32 to vXi32.
21466 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21467 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21468 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21469 assert(!IsSigned && "Expected unsigned conversion!");
21470 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21471 }
21472
21473 return SDValue();
21474 }
21475
21476 assert(!VT.isVector());
21477
21478 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21479
21480 if (!IsSigned && UseSSEReg) {
21481 // Conversions from f32/f64 with AVX512 should be legal.
21482 if (Subtarget.hasAVX512())
21483 return Op;
21484
21485 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21486 // behaves on out of range inputs to generate optimized conversions.
21487 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21488 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21489 unsigned DstBits = VT.getScalarSizeInBits();
21490 APInt UIntLimit = APInt::getSignMask(DstBits);
21491 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21492 DAG.getConstant(UIntLimit, dl, VT));
21493 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21494
21495 // Calculate the converted result for values in the range:
21496 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21497 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21498 SDValue Small =
21499 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21500 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21501 SDValue Big = DAG.getNode(
21502 X86ISD::CVTTS2SI, dl, VT,
21503 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21504 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21505
21506 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21507 // and only if the value was out of range. So we can use that
21508 // as our indicator that we rather use "Big" instead of "Small".
21509 //
21510 // Use "Small" if "IsOverflown" has all bits cleared
21511 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21512 SDValue IsOverflown = DAG.getNode(
21513 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21514 return DAG.getNode(ISD::OR, dl, VT, Small,
21515 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21516 }
21517
21518 // Use default expansion for i64.
21519 if (VT == MVT::i64)
21520 return SDValue();
21521
21522 assert(VT == MVT::i32 && "Unexpected VT!");
21523
21524 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21525 // FIXME: This does not generate an invalid exception if the input does not
21526 // fit in i32. PR44019
21527 if (Subtarget.is64Bit()) {
21528 if (IsStrict) {
21529 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21530 {Chain, Src});
21531 Chain = Res.getValue(1);
21532 } else
21533 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21534
21535 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21536 if (IsStrict)
21537 return DAG.getMergeValues({Res, Chain}, dl);
21538 return Res;
21539 }
21540
21541 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21542 // use fisttp which will be handled later.
21543 if (!Subtarget.hasSSE3())
21544 return SDValue();
21545 }
21546
21547 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21548 // FIXME: This does not generate an invalid exception if the input does not
21549 // fit in i16. PR44019
21550 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21551 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21552 if (IsStrict) {
21553 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21554 {Chain, Src});
21555 Chain = Res.getValue(1);
21556 } else
21557 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21558
21559 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21560 if (IsStrict)
21561 return DAG.getMergeValues({Res, Chain}, dl);
21562 return Res;
21563 }
21564
21565 // If this is a FP_TO_SINT using SSEReg we're done.
21566 if (UseSSEReg && IsSigned)
21567 return Op;
21568
21569 // fp128 needs to use a libcall.
21570 if (SrcVT == MVT::f128) {
21571 RTLIB::Libcall LC;
21572 if (IsSigned)
21573 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21574 else
21575 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21576
21577 MakeLibCallOptions CallOptions;
21578 std::pair<SDValue, SDValue> Tmp =
21579 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21580
21581 if (IsStrict)
21582 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21583
21584 return Tmp.first;
21585 }
21586
21587 // Fall back to X87.
21588 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21589 if (IsStrict)
21590 return DAG.getMergeValues({V, Chain}, dl);
21591 return V;
21592 }
21593
21594 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21595}
21596
21597SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21598 SelectionDAG &DAG) const {
21599 SDValue Src = Op.getOperand(0);
21600 EVT DstVT = Op.getSimpleValueType();
21601 MVT SrcVT = Src.getSimpleValueType();
21602
21603 if (SrcVT.isVector())
21604 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21605
21606 if (SrcVT == MVT::f16)
21607 return SDValue();
21608
21609 // If the source is in an SSE register, the node is Legal.
21610 if (isScalarFPTypeInSSEReg(SrcVT))
21611 return Op;
21612
21613 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21614}
21615
21616SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21617 SelectionDAG &DAG) const {
21618 EVT DstVT = N->getValueType(0);
21619 SDValue Src = N->getOperand(0);
21620 EVT SrcVT = Src.getValueType();
21621
21622 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21623 // f16 must be promoted before using the lowering in this routine.
21624 // fp128 does not use this lowering.
21625 return SDValue();
21626 }
21627
21628 SDLoc DL(N);
21629 SDValue Chain = DAG.getEntryNode();
21630
21631 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21632
21633 // If we're converting from SSE, the stack slot needs to hold both types.
21634 // Otherwise it only needs to hold the DstVT.
21635 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21636 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21637 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21638 MachinePointerInfo MPI =
21639 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21640
21641 if (UseSSE) {
21642 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21643 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21644 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21645 SDValue Ops[] = { Chain, StackPtr };
21646
21647 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21648 /*Align*/ std::nullopt,
21649 MachineMemOperand::MOLoad);
21650 Chain = Src.getValue(1);
21651 }
21652
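// At this point Src is an x87 value: either it already lived in an x87
// register, or it was spilled and reloaded via the FLD above. FIST then
// rounds it using the current rounding mode and stores the DstVT integer
// into the stack slot, which is reloaded below as the final result.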
21653 SDValue StoreOps[] = { Chain, Src, StackPtr };
21654 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21655 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21656 MachineMemOperand::MOStore);
21657
21658 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21659}
21660
21661SDValue
21662X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21663 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21664 // but making use of X86 specifics to produce better instruction sequences.
21665 SDNode *Node = Op.getNode();
21666 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21667 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21668 SDLoc dl(SDValue(Node, 0));
21669 SDValue Src = Node->getOperand(0);
21670
21671 // There are three types involved here: SrcVT is the source floating point
21672 // type, DstVT is the type of the result, and TmpVT is the result of the
21673 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21674 // DstVT).
21675 EVT SrcVT = Src.getValueType();
21676 EVT DstVT = Node->getValueType(0);
21677 EVT TmpVT = DstVT;
21678
21679 // This code is only for floats and doubles. Fall back to generic code for
21680 // anything else.
21681 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
21682 return SDValue();
21683
21684 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21685 unsigned SatWidth = SatVT.getScalarSizeInBits();
21686 unsigned DstWidth = DstVT.getScalarSizeInBits();
21687 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21688 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21689 "Expected saturation width smaller than result width");
21690
21691 // Promote result of FP_TO_*INT to at least 32 bits.
21692 if (TmpWidth < 32) {
21693 TmpVT = MVT::i32;
21694 TmpWidth = 32;
21695 }
21696
21697 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21698 // us to use a native signed conversion instead.
21699 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21700 TmpVT = MVT::i64;
21701 TmpWidth = 64;
21702 }
21703
21704 // If the saturation width is smaller than the size of the temporary result,
21705 // we can always use signed conversion, which is native.
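 // For example, an f32 -> i16 saturating conversion promoted to an i32
 // temporary can use the native signed conversion: every in-range i16 result
 // is representable in i32, and out-of-range inputs are fixed up by the
 // clamping/select logic below.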
21706 if (SatWidth < TmpWidth)
21707 FpToIntOpcode = ISD::FP_TO_SINT;
21708
21709 // Determine minimum and maximum integer values and their corresponding
21710 // floating-point values.
21711 APInt MinInt, MaxInt;
21712 if (IsSigned) {
21713 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21714 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21715 } else {
21716 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21717 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21718 }
21719
21720 const fltSemantics &Sem = SrcVT.getFltSemantics();
21721 APFloat MinFloat(Sem);
21722 APFloat MaxFloat(Sem);
21723
21724 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21725 MinInt, IsSigned, APFloat::rmTowardZero);
21726 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21727 MaxInt, IsSigned, APFloat::rmTowardZero);
21728 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21729 && !(MaxStatus & APFloat::opStatus::opInexact);
21730
21731 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21732 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21733
21734 // If the integer bounds are exactly representable as floats, emit a
21735 // min+max+fptoi sequence. Otherwise use comparisons and selects.
21736 if (AreExactFloatBounds) {
21737 if (DstVT != TmpVT) {
21738 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21739 SDValue MinClamped = DAG.getNode(
21740 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21741 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21742 SDValue BothClamped = DAG.getNode(
21743 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21744 // Convert clamped value to integer.
21745 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21746
21747 // NaN will become INDVAL, with the top bit set and the rest zero.
21748 // Truncation will discard the top bit, resulting in zero.
21749 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21750 }
21751
21752 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21753 SDValue MinClamped = DAG.getNode(
21754 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21755 // Clamp by MaxFloat from above. NaN cannot occur.
21756 SDValue BothClamped = DAG.getNode(
21757 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21758 // Convert clamped value to integer.
21759 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21760
21761 if (!IsSigned) {
21762 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21763 // which is zero.
21764 return FpToInt;
21765 }
21766
21767 // Otherwise, select zero if Src is NaN.
21768 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21769 return DAG.getSelectCC(
21770 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21771 }
21772
21773 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21774 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21775
21776 // Result of direct conversion, which may be selected away.
21777 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21778
21779 if (DstVT != TmpVT) {
21780 // NaN will become INDVAL, with the top bit set and the rest zero.
21781 // Truncation will discard the top bit, resulting in zero.
21782 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21783 }
21784
21785 SDValue Select = FpToInt;
21786 // For signed conversions where we saturate to the same size as the
21787 // result type of the fptoi instructions, INDVAL coincides with integer
21788 // minimum, so we don't need to explicitly check it.
21789 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21790 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21791 // MinInt if Src is NaN.
21792 Select = DAG.getSelectCC(
21793 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21794 }
21795
21796 // If Src OGT MaxFloat, select MaxInt.
21797 Select = DAG.getSelectCC(
21798 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21799
21800 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21801 // is already zero. The promoted case was already handled above.
21802 if (!IsSigned || DstVT != TmpVT) {
21803 return Select;
21804 }
21805
21806 // Otherwise, select 0 if Src is NaN.
21807 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21808 return DAG.getSelectCC(
21809 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21810}
21811
21812SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21813 bool IsStrict = Op->isStrictFPOpcode();
21814
21815 SDLoc DL(Op);
21816 MVT VT = Op.getSimpleValueType();
21817 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21818 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21819 MVT SVT = In.getSimpleValueType();
21820
21821 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21822 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
21823 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21824 !Subtarget.getTargetTriple().isOSDarwin()))
21825 return SDValue();
21826
21827 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21828 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21829 return Op;
21830
21831 if (SVT == MVT::f16) {
21832 if (Subtarget.hasFP16())
21833 return Op;
21834
21835 if (VT != MVT::f32) {
21836 if (IsStrict)
21837 return DAG.getNode(
21838 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21839 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21840 {MVT::f32, MVT::Other}, {Chain, In})});
21841
21842 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21843 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21844 }
21845
21846 if (!Subtarget.hasF16C()) {
21847 if (!Subtarget.getTargetTriple().isOSDarwin())
21848 return SDValue();
21849
21850 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21851
21852 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21853 TargetLowering::CallLoweringInfo CLI(DAG);
21854 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21855
21856 In = DAG.getBitcast(MVT::i16, In);
21857 TargetLowering::ArgListTy Args;
21858 TargetLowering::ArgListEntry Entry;
21859 Entry.Node = In;
21860 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21861 Entry.IsSExt = false;
21862 Entry.IsZExt = true;
21863 Args.push_back(Entry);
21864
21865 SDValue Callee = DAG.getExternalSymbol(
21866 getLibcallName(RTLIB::FPEXT_F16_F32),
21867 getPointerTy(DAG.getDataLayout()));
21868 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21869 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21870 std::move(Args));
21871
21872 SDValue Res;
21873 std::tie(Res,Chain) = LowerCallTo(CLI);
21874 if (IsStrict)
21875 Res = DAG.getMergeValues({Res, Chain}, DL);
21876
21877 return Res;
21878 }
21879
21880 In = DAG.getBitcast(MVT::i16, In);
21881 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21882 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21883 DAG.getVectorIdxConstant(0, DL));
21884 SDValue Res;
21885 if (IsStrict) {
21886 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21887 {Chain, In});
21888 Chain = Res.getValue(1);
21889 } else {
21890 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21891 DAG.getTargetConstant(4, DL, MVT::i32));
21892 }
21893 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21894 DAG.getVectorIdxConstant(0, DL));
21895 if (IsStrict)
21896 return DAG.getMergeValues({Res, Chain}, DL);
21897 return Res;
21898 }
21899
21900 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
21901 return Op;
21902
21903 if (SVT.getVectorElementType() == MVT::f16) {
21904 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21905 return Op;
21906 assert(Subtarget.hasF16C() && "Unexpected features!");
21907 if (SVT == MVT::v2f16)
21908 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21909 DAG.getUNDEF(MVT::v2f16));
21910 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21911 DAG.getUNDEF(MVT::v4f16));
21912 if (IsStrict)
21913 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21914 {Op->getOperand(0), Res});
21915 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21916 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21917 return Op;
21918 }
21919
21920 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21921
21922 SDValue Res =
21923 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21924 if (IsStrict)
21925 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21926 {Op->getOperand(0), Res});
21927 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21928}
21929
21930SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21931 bool IsStrict = Op->isStrictFPOpcode();
21932
21933 SDLoc DL(Op);
21934 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21935 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21936 MVT VT = Op.getSimpleValueType();
21937 MVT SVT = In.getSimpleValueType();
21938
21939 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21940 return SDValue();
21941
21942 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21943 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21944 if (!Subtarget.getTargetTriple().isOSDarwin())
21945 return SDValue();
21946
21947 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
21948 TargetLowering::CallLoweringInfo CLI(DAG);
21949 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21950
21951 TargetLowering::ArgListTy Args;
21952 TargetLowering::ArgListEntry Entry;
21953 Entry.Node = In;
21954 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21955 Entry.IsSExt = false;
21956 Entry.IsZExt = true;
21957 Args.push_back(Entry);
21958
21959 SDValue Callee = DAG.getExternalSymbol(
21960 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21961 : RTLIB::FPROUND_F32_F16),
21962 getPointerTy(DAG.getDataLayout()));
21963 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21964 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21965 std::move(Args));
21966
21967 SDValue Res;
21968 std::tie(Res, Chain) = LowerCallTo(CLI);
21969
21970 Res = DAG.getBitcast(MVT::f16, Res);
21971
21972 if (IsStrict)
21973 Res = DAG.getMergeValues({Res, Chain}, DL);
21974
21975 return Res;
21976 }
21977
21978 if (VT.getScalarType() == MVT::bf16) {
21979 if (SVT.getScalarType() == MVT::f32 &&
21980 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21981 Subtarget.hasAVXNECONVERT()))
21982 return Op;
21983 return SDValue();
21984 }
21985
21986 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21987 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21988 return SDValue();
21989
21990 if (VT.isVector())
21991 return Op;
21992
21993 SDValue Res;
21994 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21995 MVT::i32);
21996 if (IsStrict) {
21997 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21998 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21999 DAG.getVectorIdxConstant(0, DL));
22000 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22001 {Chain, Res, Rnd});
22002 Chain = Res.getValue(1);
22003 } else {
22004 // FIXME: Should we use zeros for upper elements for non-strict?
22005 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22006 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22007 }
22008
22009 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22010 DAG.getVectorIdxConstant(0, DL));
22011 Res = DAG.getBitcast(MVT::f16, Res);
22012
22013 if (IsStrict)
22014 return DAG.getMergeValues({Res, Chain}, DL);
22015
22016 return Res;
22017 }
22018
22019 return Op;
22020}
22021
22022static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22023 bool IsStrict = Op->isStrictFPOpcode();
22024 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22025 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22026 "Unexpected VT!");
22027
22028 SDLoc dl(Op);
22029 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22030 DAG.getConstant(0, dl, MVT::v8i16), Src,
22031 DAG.getVectorIdxConstant(0, dl));
22032
22033 SDValue Chain;
22034 if (IsStrict) {
22035 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22036 {Op.getOperand(0), Res});
22037 Chain = Res.getValue(1);
22038 } else {
22039 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22040 }
22041
22042 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22043 DAG.getVectorIdxConstant(0, dl));
22044
22045 if (IsStrict)
22046 return DAG.getMergeValues({Res, Chain}, dl);
22047
22048 return Res;
22049}
22050
22051static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22052 bool IsStrict = Op->isStrictFPOpcode();
22053 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22054 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22055 "Unexpected VT!");
22056
22057 SDLoc dl(Op);
22058 SDValue Res, Chain;
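 // Note: the immediate 4 passed to (STRICT_)CVTPS2PH below selects rounding
 // under the current MXCSR rounding mode rather than a fixed rounding mode.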
22059 if (IsStrict) {
22060 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22061 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22062 DAG.getVectorIdxConstant(0, dl));
22063 Res = DAG.getNode(
22064 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22065 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22066 Chain = Res.getValue(1);
22067 } else {
22068 // FIXME: Should we use zeros for upper elements for non-strict?
22069 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22070 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22071 DAG.getTargetConstant(4, dl, MVT::i32));
22072 }
22073
22074 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22075 DAG.getVectorIdxConstant(0, dl));
22076
22077 if (IsStrict)
22078 return DAG.getMergeValues({Res, Chain}, dl);
22079
22080 return Res;
22081}
22082
22083SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22084 SelectionDAG &DAG) const {
22085 SDLoc DL(Op);
22086
22087 MVT SVT = Op.getOperand(0).getSimpleValueType();
22088 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22089 Subtarget.hasAVXNECONVERT())) {
22090 SDValue Res;
22091 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22092 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22093 Res = DAG.getBitcast(MVT::v8i16, Res);
22094 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22095 DAG.getVectorIdxConstant(0, DL));
22096 }
22097
22098 MakeLibCallOptions CallOptions;
22099 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22100 SDValue Res =
22101 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22102 return DAG.getBitcast(MVT::i16, Res);
22103}
22104
22105/// Depending on uarch and/or optimizing for size, we might prefer to use a
22106/// vector operation in place of the typical scalar operation.
22107static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22108 SelectionDAG &DAG,
22109 const X86Subtarget &Subtarget) {
22110 // If both operands have other uses, this is probably not profitable.
22111 SDValue LHS = Op.getOperand(0);
22112 SDValue RHS = Op.getOperand(1);
22113 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22114 return Op;
22115
22116 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22117 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22118 if (IsFP && !Subtarget.hasSSE3())
22119 return Op;
22120 if (!IsFP && !Subtarget.hasSSSE3())
22121 return Op;
22122
22123 // Extract from a common vector.
22124 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22125 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22126 LHS.getOperand(0) != RHS.getOperand(0) ||
22127 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22128 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22129 !shouldUseHorizontalOp(true, DAG, Subtarget))
22130 return Op;
22131
22132 // Allow commuted 'hadd' ops.
22133 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22134 unsigned HOpcode;
22135 switch (Op.getOpcode()) {
22136 // clang-format off
22137 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22138 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22139 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22140 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22141 default:
22142 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22143 // clang-format on
22144 }
22145 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22146 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22147 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22148 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22149 std::swap(LExtIndex, RExtIndex);
22150
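 // A horizontal op combines adjacent element pairs, so this transform only
 // applies when the extracts reference elements 2*i and 2*i+1 of the same
 // source vector (commuted hadd operands were normalized above).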
22151 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22152 return Op;
22153
22154 SDValue X = LHS.getOperand(0);
22155 EVT VecVT = X.getValueType();
22156 unsigned BitWidth = VecVT.getSizeInBits();
22157 unsigned NumLanes = BitWidth / 128;
22158 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22159 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22160 "Not expecting illegal vector widths here");
22161
22162 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22163 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22164 if (BitWidth == 256 || BitWidth == 512) {
22165 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22166 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22167 LExtIndex %= NumEltsPerLane;
22168 }
22169
22170 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22171 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22172 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22173 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22174 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22175 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22176 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22177}
22178
22179/// Depending on uarch and/or optimizing for size, we might prefer to use a
22180/// vector operation in place of the typical scalar operation.
22181SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22182 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22183 "Only expecting float/double");
22184 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22185}
22186
22187/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22188/// This mode isn't supported in hardware on X86. But as long as we aren't
22189/// compiling with trapping math, we can emulate this with
22190/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22191static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22192 SDValue N0 = Op.getOperand(0);
22193 SDLoc dl(Op);
22194 MVT VT = Op.getSimpleValueType();
22195
22196 // N0 += copysign(nextafter(0.5, 0.0), N0)
22197 const fltSemantics &Sem = VT.getFltSemantics();
22198 bool Ignored;
22199 APFloat Point5Pred = APFloat(0.5f);
22200 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22201 Point5Pred.next(/*nextDown*/true);
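 // nextafter(0.5, 0.0) (the largest value strictly below 0.5) is used so
 // that inputs just under 0.5 are not bumped up to 1.0 by rounding in the
 // FADD, while an input of exactly 0.5 still reaches 1.0 and truncates to
 // the required away-from-zero result.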
22202
22203 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22204 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22205 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22206
22207 // Truncate the result to remove fraction.
22208 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22209}
22210
22211/// The only differences between FABS and FNEG are the mask and the logic op.
22212/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22213static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22214 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22215 "Wrong opcode for lowering FABS or FNEG.");
22216
22217 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22218
22219 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22220 // into an FNABS. We'll lower the FABS after that if it is still in use.
22221 if (IsFABS)
22222 for (SDNode *User : Op->users())
22223 if (User->getOpcode() == ISD::FNEG)
22224 return Op;
22225
22226 SDLoc dl(Op);
22227 MVT VT = Op.getSimpleValueType();
22228
22229 bool IsF128 = (VT == MVT::f128);
22230 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22231 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22232 "Unexpected type in LowerFABSorFNEG");
22233
22234 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22235 // decide if we should generate a 16-byte constant mask when we only need 4 or
22236 // 8 bytes for the scalar case.
22237
22238 // There are no scalar bitwise logical SSE/AVX instructions, so we
22239 // generate a 16-byte vector constant and logic op even for the scalar case.
22240 // Using a 16-byte mask allows folding the load of the mask with
22241 // the logic op, so it can save (~4 bytes) on code size.
22242 bool IsFakeVector = !VT.isVector() && !IsF128;
22243 MVT LogicVT = VT;
22244 if (IsFakeVector)
22245 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22246 : (VT == MVT::f32) ? MVT::v4f32
22247 : MVT::v8f16;
22248
22249 unsigned EltBits = VT.getScalarSizeInBits();
22250 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22251 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22252 APInt::getSignMask(EltBits);
22253 const fltSemantics &Sem = VT.getFltSemantics();
22254 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22255
22256 SDValue Op0 = Op.getOperand(0);
22257 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22258 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22259 IsFNABS ? X86ISD::FOR :
22260 X86ISD::FXOR;
22261 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22262
22263 if (VT.isVector() || IsF128)
22264 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22265
22266 // For the scalar case extend to a 128-bit vector, perform the logic op,
22267 // and extract the scalar result back out.
22268 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22269 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22270 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22271 DAG.getVectorIdxConstant(0, dl));
22272}
22273
22274static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22275 SDValue Mag = Op.getOperand(0);
22276 SDValue Sign = Op.getOperand(1);
22277 SDLoc dl(Op);
22278
22279 // If the sign operand is smaller, extend it first.
22280 MVT VT = Op.getSimpleValueType();
22281 if (Sign.getSimpleValueType().bitsLT(VT))
22282 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22283
22284 // And if it is bigger, shrink it first.
22285 if (Sign.getSimpleValueType().bitsGT(VT))
22286 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22287 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22288
22289 // At this point the operands and the result should have the same
22290 // type, and that won't be f80 since that is not custom lowered.
22291 bool IsF128 = (VT == MVT::f128);
22292 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22293 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22294 "Unexpected type in LowerFCOPYSIGN");
22295
22296 const fltSemantics &Sem = VT.getFltSemantics();
22297
22298 // Perform all scalar logic operations as 16-byte vectors because there are no
22299 // scalar FP logic instructions in SSE.
22300 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22301 // unnecessary splats, but we might miss load folding opportunities. Should
22302 // this decision be based on OptimizeForSize?
22303 bool IsFakeVector = !VT.isVector() && !IsF128;
22304 MVT LogicVT = VT;
22305 if (IsFakeVector)
22306 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22307 : (VT == MVT::f32) ? MVT::v4f32
22308 : MVT::v8f16;
22309
22310 // The mask constants are automatically splatted for vector types.
22311 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22312 SDValue SignMask = DAG.getConstantFP(
22313 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22314 SDValue MagMask = DAG.getConstantFP(
22315 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22316
22317 // First, clear all bits but the sign bit from the second operand (sign).
22318 if (IsFakeVector)
22319 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22320 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22321
22322 // Next, clear the sign bit from the first operand (magnitude).
22323 // TODO: If we had general constant folding for FP logic ops, this check
22324 // wouldn't be necessary.
22325 SDValue MagBits;
22326 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22327 APFloat APF = Op0CN->getValueAPF();
22328 APF.clearSign();
22329 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22330 } else {
22331 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22332 if (IsFakeVector)
22333 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22334 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22335 }
22336
22337 // OR the magnitude value with the sign bit.
22338 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22339 return !IsFakeVector ? Or
22340 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22341 DAG.getVectorIdxConstant(0, dl));
22342}
22343
22344static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22345 SDValue N0 = Op.getOperand(0);
22346 SDLoc dl(Op);
22347 MVT VT = Op.getSimpleValueType();
22348
22349 MVT OpVT = N0.getSimpleValueType();
22350 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22351 "Unexpected type for FGETSIGN");
22352
22353 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22354 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22355 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22356 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22357 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22358 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22359 return Res;
22360}
22361
22362/// Helper for attempting to create a X86ISD::BT node.
22363static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22364 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22365 // instruction. Since the shift amount is in-range-or-undefined, we know
22366 // that doing a bittest on the i32 value is ok. We extend to i32 because
22367 // the encoding for the i16 version is larger than the i32 version.
22368 // Also promote i16 to i32 for performance / code size reason.
22369 if (Src.getValueType().getScalarSizeInBits() < 32)
22370 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22371
22372 // No legal type found, give up.
22373 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22374 return SDValue();
22375
22376 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22377 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22378 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22379 // known to be zero.
22380 if (Src.getValueType() == MVT::i64 &&
22381 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22382 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22383
22384 // If the operand types disagree, extend the shift amount to match. Since
22385 // BT ignores high bits (like shifts) we can use anyextend.
22386 if (Src.getValueType() != BitNo.getValueType()) {
22387 // Peek through a mask/modulo operation.
22388 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22389 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22390 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22391 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22392 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22393 BitNo.getOperand(0)),
22394 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22395 BitNo.getOperand(1)));
22396 else
22397 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22398 }
22399
22400 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22401}
22402
22403/// Helper for creating a X86ISD::SETCC node.
22404static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22405 SelectionDAG &DAG) {
22406 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22407 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22408}
22409
22410/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22411/// recognizable memcmp expansion.
22412static bool isOrXorXorTree(SDValue X, bool Root = true) {
22413 if (X.getOpcode() == ISD::OR)
22414 return isOrXorXorTree(X.getOperand(0), false) &&
22415 isOrXorXorTree(X.getOperand(1), false);
22416 if (Root)
22417 return false;
22418 return X.getOpcode() == ISD::XOR;
22419}
22420
22421/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22422/// expansion.
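/// For example, (or (xor a, b), (xor c, d)) eq/ne 0 becomes per-pair vector
/// XORs or compares whose results are combined and then tested by the caller
/// with PTEST, KORTEST or MOVMSK.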
22423template <typename F>
22424static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22425 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22426 SDValue Op0 = X.getOperand(0);
22427 SDValue Op1 = X.getOperand(1);
22428 if (X.getOpcode() == ISD::OR) {
22429 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22430 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22431 if (VecVT != CmpVT)
22432 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22433 if (HasPT)
22434 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22435 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22436 }
22437 if (X.getOpcode() == ISD::XOR) {
22438 SDValue A = SToV(Op0);
22439 SDValue B = SToV(Op1);
22440 if (VecVT != CmpVT)
22441 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22442 if (HasPT)
22443 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22444 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22445 }
22446 llvm_unreachable("Impossible");
22447}
22448
22449/// Try to map a 128-bit or larger integer comparison to vector instructions
22450/// before type legalization splits it up into chunks.
22451static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22452 ISD::CondCode CC,
22453 const SDLoc &DL,
22454 SelectionDAG &DAG,
22455 const X86Subtarget &Subtarget) {
22456 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22457
22458 // We're looking for an oversized integer equality comparison.
22459 EVT OpVT = X.getValueType();
22460 unsigned OpSize = OpVT.getSizeInBits();
22461 if (!OpVT.isScalarInteger() || OpSize < 128)
22462 return SDValue();
22463
22464 // Ignore a comparison with zero because that gets special treatment in
22465 // EmitTest(). But make an exception for the special case of a pair of
22466 // logically-combined vector-sized operands compared to zero. This pattern may
22467 // be generated by the memcmp expansion pass with oversized integer compares
22468 // (see PR33325).
22469 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22470 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22471 return SDValue();
22472
22473 // Don't perform this combine if constructing the vector will be expensive.
22474 auto IsVectorBitCastCheap = [](SDValue X) {
22475 X = peekThroughBitcasts(X);
22476 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22477 X.getOpcode() == ISD::LOAD;
22478 };
22479 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22480 !IsOrXorXorTreeCCZero)
22481 return SDValue();
22482
22483 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22484 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22485 // Otherwise use PCMPEQ (plus AND) and mask testing.
22486 bool NoImplicitFloatOps =
22487 DAG.getMachineFunction().getFunction().hasFnAttribute(
22488 Attribute::NoImplicitFloat);
22489 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22490 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22491 (OpSize == 256 && Subtarget.hasAVX()) ||
22492 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22493 bool HasPT = Subtarget.hasSSE41();
22494
22495 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22496 // vector registers are essentially free. (Technically, widening registers
22497 // prevents load folding, but the tradeoff is worth it.)
22498 bool PreferKOT = Subtarget.preferMaskRegisters();
22499 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22500
22501 EVT VecVT = MVT::v16i8;
22502 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22503 if (OpSize == 256) {
22504 VecVT = MVT::v32i8;
22505 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22506 }
22507 EVT CastVT = VecVT;
22508 bool NeedsAVX512FCast = false;
22509 if (OpSize == 512 || NeedZExt) {
22510 if (Subtarget.hasBWI()) {
22511 VecVT = MVT::v64i8;
22512 CmpVT = MVT::v64i1;
22513 if (OpSize == 512)
22514 CastVT = VecVT;
22515 } else {
22516 VecVT = MVT::v16i32;
22517 CmpVT = MVT::v16i1;
22518 CastVT = OpSize == 512 ? VecVT
22519 : OpSize == 256 ? MVT::v8i32
22520 : MVT::v4i32;
22521 NeedsAVX512FCast = true;
22522 }
22523 }
22524
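 // ScalarToVector bitcasts a scalar operand into the vector type used for
 // the compare. When widening is needed (NeedZExt, or a peeked-through
 // zero-extension of a narrower 128/256-bit value), the narrow value is
 // bitcast instead and zero-padded up to VecVT with an INSERT_SUBVECTOR
 // into a zero vector.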
22525 auto ScalarToVector = [&](SDValue X) -> SDValue {
22526 bool TmpZext = false;
22527 EVT TmpCastVT = CastVT;
22528 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22529 SDValue OrigX = X.getOperand(0);
22530 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22531 if (OrigSize < OpSize) {
22532 if (OrigSize == 128) {
22533 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22534 X = OrigX;
22535 TmpZext = true;
22536 } else if (OrigSize == 256) {
22537 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22538 X = OrigX;
22539 TmpZext = true;
22540 }
22541 }
22542 }
22543 X = DAG.getBitcast(TmpCastVT, X);
22544 if (!NeedZExt && !TmpZext)
22545 return X;
22546 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22547 DAG.getConstant(0, DL, VecVT), X,
22548 DAG.getVectorIdxConstant(0, DL));
22549 };
22550
22551 SDValue Cmp;
22552 if (IsOrXorXorTreeCCZero) {
22553 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22554 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22555 // Use 2 vector equality compares and 'and' the results before doing a
22556 // MOVMSK.
22557 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22558 } else {
22559 SDValue VecX = ScalarToVector(X);
22560 SDValue VecY = ScalarToVector(Y);
22561 if (VecVT != CmpVT) {
22562 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22563 } else if (HasPT) {
22564 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22565 } else {
22566 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22567 }
22568 }
22569 // AVX512 should emit a setcc that will lower to kortest.
22570 if (VecVT != CmpVT) {
22571 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22572 : CmpVT == MVT::v32i1 ? MVT::i32
22573 : MVT::i16;
22574 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22575 DAG.getConstant(0, DL, KRegVT), CC);
22576 }
22577 if (HasPT) {
22578 SDValue BCCmp =
22579 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22580 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22581 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22582 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22583 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22584 }
22585 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22586 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22587 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22588 assert(Cmp.getValueType() == MVT::v16i8 &&
22589 "Non 128-bit vector on pre-SSE41 target");
22590 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22591 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22592 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22593 }
22594
22595 return SDValue();
22596}
22597
22598/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22599/// style scalarized (associative) reduction patterns. Partial reductions
22600/// are supported when the pointer SrcMask is non-null.
22601/// TODO - move this to SelectionDAG?
22602static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22603 SmallVectorImpl<SDValue> &SrcOps,
22604 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22605 SmallVector<SDValue, 8> Opnds;
22606 DenseMap<SDValue, APInt> SrcOpMap;
22607 EVT VT = MVT::Other;
22608
22609 // Recognize a special case where a vector is casted into wide integer to
22610 // test all 0s.
22611 assert(Op.getOpcode() == unsigned(BinOp) &&
22612 "Unexpected bit reduction opcode");
22613 Opnds.push_back(Op.getOperand(0));
22614 Opnds.push_back(Op.getOperand(1));
22615
22616 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22617 SDValue I = Opnds[Slot];
22618 // BFS traverse all BinOp operands.
22619 if (I->getOpcode() == unsigned(BinOp)) {
22620 Opnds.push_back(I->getOperand(0));
22621 Opnds.push_back(I->getOperand(1));
22622 // Re-evaluate the number of nodes to be traversed.
22623 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22624 continue;
22625 }
22626
22627 // Quit if a non-EXTRACT_VECTOR_ELT
22628 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22629 return false;
22630
22631 // Quit if without a constant index.
22632 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22633 if (!Idx)
22634 return false;
22635
22636 SDValue Src = I->getOperand(0);
22637 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22638 if (M == SrcOpMap.end()) {
22639 VT = Src.getValueType();
22640 // Quit if not the same type.
22641 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22642 return false;
22643 unsigned NumElts = VT.getVectorNumElements();
22644 APInt EltCount = APInt::getZero(NumElts);
22645 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22646 SrcOps.push_back(Src);
22647 }
22648
22649 // Quit if element already used.
22650 unsigned CIdx = Idx->getZExtValue();
22651 if (M->second[CIdx])
22652 return false;
22653 M->second.setBit(CIdx);
22654 }
22655
22656 if (SrcMask) {
22657 // Collect the source partial masks.
22658 for (SDValue &SrcOp : SrcOps)
22659 SrcMask->push_back(SrcOpMap[SrcOp]);
22660 } else {
22661 // Quit if not all elements are used.
22662 for (const auto &I : SrcOpMap)
22663 if (!I.second.isAllOnes())
22664 return false;
22665 }
22666
22667 return true;
22668}
22669
22670// Helper function for comparing all bits of two vectors.
22671static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
22672 ISD::CondCode CC, const APInt &OriginalMask,
22673 const X86Subtarget &Subtarget,
22674 SelectionDAG &DAG, X86::CondCode &X86CC) {
22675 EVT VT = LHS.getValueType();
22676 unsigned ScalarSize = VT.getScalarSizeInBits();
22677 if (OriginalMask.getBitWidth() != ScalarSize) {
22678 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22679 return SDValue();
22680 }
22681
22682 // Quit if not convertible to legal scalar or 128/256-bit vector.
22683 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22684 return SDValue();
22685
22686 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22687 if (VT.isFloatingPoint())
22688 return SDValue();
22689
22690 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22691 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22692
22693 APInt Mask = OriginalMask;
22694
22695 auto MaskBits = [&](SDValue Src) {
22696 if (Mask.isAllOnes())
22697 return Src;
22698 EVT SrcVT = Src.getValueType();
22699 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22700 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22701 };
22702
22703 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22704 if (VT.getSizeInBits() < 128) {
22705 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22706 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22707 if (IntVT != MVT::i64)
22708 return SDValue();
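 // i64 is not legal here (32-bit target), so compare via the halves: split
 // both sides into i32 pairs, XOR corresponding halves and OR the results;
 // the operands are equal iff the OR is zero.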
22709 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22710 MVT::i32, MVT::i32);
22711 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22712 MVT::i32, MVT::i32);
22713 SDValue Lo =
22714 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22715 SDValue Hi =
22716 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22717 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22718 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22719 DAG.getConstant(0, DL, MVT::i32));
22720 }
22721 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22722 DAG.getBitcast(IntVT, MaskBits(LHS)),
22723 DAG.getBitcast(IntVT, MaskBits(RHS)));
22724 }
22725
22726 // Without PTEST, a masked v2i64 or-reduction is not faster than
22727 // scalarization.
22728 bool UseKORTEST = Subtarget.useAVX512Regs();
22729 bool UsePTEST = Subtarget.hasSSE41();
22730 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22731 return SDValue();
22732
22733 // Split down to 128/256/512-bit vector.
22734 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22735
22736 // If the input vector has vector elements wider than the target test size,
22737 // then cast to <X x i64> so it will safely split.
22738 if (ScalarSize > TestSize) {
22739 if (!Mask.isAllOnes())
22740 return SDValue();
22741 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22742 LHS = DAG.getBitcast(VT, LHS);
22743 RHS = DAG.getBitcast(VT, RHS);
22744 Mask = APInt::getAllOnes(64);
22745 }
22746
22747 if (VT.getSizeInBits() > TestSize) {
22748 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22749 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22750 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22751 while (VT.getSizeInBits() > TestSize) {
22752 auto Split = DAG.SplitVector(LHS, DL);
22753 VT = Split.first.getValueType();
22754 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22755 }
22756 RHS = DAG.getAllOnesConstant(DL, VT);
22757 } else if (!UsePTEST && !KnownRHS.isZero()) {
22758 // MOVMSK Special Case:
22759 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22760 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22761 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22762 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22763 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22764 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22765 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22766 V = DAG.getSExtOrTrunc(V, DL, VT);
22767 while (VT.getSizeInBits() > TestSize) {
22768 auto Split = DAG.SplitVector(V, DL);
22769 VT = Split.first.getValueType();
22770 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22771 }
22772 V = DAG.getNOT(DL, V, VT);
22773 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22774 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22775 DAG.getConstant(0, DL, MVT::i32));
22776 } else {
22777 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22778 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22779 while (VT.getSizeInBits() > TestSize) {
22780 auto Split = DAG.SplitVector(V, DL);
22781 VT = Split.first.getValueType();
22782 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22783 }
22784 LHS = V;
22785 RHS = DAG.getConstant(0, DL, VT);
22786 }
22787 }
22788
22789 if (UseKORTEST && VT.is512BitVector()) {
22790 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22791 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22792 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22793 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22794 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22795 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22796 }
22797
22798 if (UsePTEST) {
22799 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22800 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22801 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22802 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22803 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22804 }
22805
22806 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22807 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22808 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22809 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22810 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22811 V = DAG.getNOT(DL, V, MaskVT);
22812 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22813 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22814 DAG.getConstant(0, DL, MVT::i32));
22815}
22816
22817// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall
22818// back to CMP(MOVMSK(PCMPEQB(X,Y))).
22819static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22820 ISD::CondCode CC, const SDLoc &DL,
22821 const X86Subtarget &Subtarget,
22822 SelectionDAG &DAG,
22823 X86::CondCode &X86CC) {
22824 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22825
22826 bool CmpNull = isNullConstant(RHS);
22827 bool CmpAllOnes = isAllOnesConstant(RHS);
22828 if (!CmpNull && !CmpAllOnes)
22829 return SDValue();
22830
22831 SDValue Op = LHS;
22832 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22833 return SDValue();
22834
22835 // Check whether we're masking/truncating an OR-reduction result, in which
22836 // case track the masked bits.
22837 // TODO: Add CmpAllOnes support.
22838 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22839 if (CmpNull) {
22840 switch (Op.getOpcode()) {
22841 case ISD::TRUNCATE: {
22842 SDValue Src = Op.getOperand(0);
22843 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22844 Op.getScalarValueSizeInBits());
22845 Op = Src;
22846 break;
22847 }
22848 case ISD::AND: {
22849 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22850 Mask = Cst->getAPIntValue();
22851 Op = Op.getOperand(0);
22852 }
22853 break;
22854 }
22855 }
22856 }
22857
22858 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22859
22860 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22861 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22862 SmallVector<SDValue, 8> VecIns;
22863 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22864 EVT VT = VecIns[0].getValueType();
22865 assert(llvm::all_of(VecIns,
22866 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22867 "Reduction source vector mismatch");
22868
22869 // Quit if not splittable to scalar/128/256/512-bit vector.
22870 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22871 return SDValue();
22872
22873 // If more than one full vector is evaluated, AND/OR them first before
22874 // PTEST.
22875 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22876 Slot += 2, e += 1) {
22877 // Each iteration will AND/OR 2 nodes and append the result until there is
22878 // only 1 node left, i.e. the final value of all vectors.
22879 SDValue LHS = VecIns[Slot];
22880 SDValue RHS = VecIns[Slot + 1];
22881 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22882 }
22883
22884 return LowerVectorAllEqual(DL, VecIns.back(),
22885 CmpNull ? DAG.getConstant(0, DL, VT)
22886 : DAG.getAllOnesConstant(DL, VT),
22887 CC, Mask, Subtarget, DAG, X86CC);
22888 }
22889
22890 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22891 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22892 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22893 ISD::NodeType BinOp;
22894 if (SDValue Match =
22895 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22896 EVT MatchVT = Match.getValueType();
22897 return LowerVectorAllEqual(DL, Match,
22898 CmpNull ? DAG.getConstant(0, DL, MatchVT)
22899 : DAG.getAllOnesConstant(DL, MatchVT),
22900 CC, Mask, Subtarget, DAG, X86CC);
22901 }
22902 }
22903
22904 if (Mask.isAllOnes()) {
22905 assert(!Op.getValueType().isVector() &&
22906 "Illegal vector type for reduction pattern");
22907 SDValue Src = peekThroughBitcasts(Op);
22908 if (Src.getValueType().isFixedLengthVector() &&
22909 Src.getValueType().getScalarType() == MVT::i1) {
22910 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22911 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22912 if (Src.getOpcode() == ISD::SETCC) {
22913 SDValue LHS = Src.getOperand(0);
22914 SDValue RHS = Src.getOperand(1);
22915 EVT LHSVT = LHS.getValueType();
22916 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22917 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22918 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22919 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22920 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22921 X86CC);
22922 }
22923 }
22924 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22925 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22926 // Peek through truncation, mask the LSB and compare against zero/LSB.
22927 if (Src.getOpcode() == ISD::TRUNCATE) {
22928 SDValue Inner = Src.getOperand(0);
22929 EVT InnerVT = Inner.getValueType();
22930 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22931 unsigned BW = InnerVT.getScalarSizeInBits();
22932 APInt SrcMask = APInt(BW, 1);
22933 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22934 return LowerVectorAllEqual(DL, Inner,
22935 DAG.getConstant(Cmp, DL, InnerVT), CC,
22936 SrcMask, Subtarget, DAG, X86CC);
22937 }
22938 }
22939 }
22940 }
22941
22942 return SDValue();
22943}
22944
22945/// return true if \c Op has a use that doesn't just read flags.
22946static bool hasNonFlagsUse(SDValue Op) {
22947 for (SDUse &Use : Op->uses()) {
22948 SDNode *User = Use.getUser();
22949 unsigned UOpNo = Use.getOperandNo();
22950 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22951 // Look past truncate.
22952 UOpNo = User->use_begin()->getOperandNo();
22953 User = User->use_begin()->getUser();
22954 }
22955
22956 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22957 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22958 return true;
22959 }
22960 return false;
22961}
22962
22963// Transform to an x86-specific ALU node with flags if there is a chance of
22964// using an RMW op or only the flags are used. Otherwise, leave
22965// the node alone and emit a 'cmp' or 'test' instruction.
22966static bool isProfitableToUseFlagOp(SDValue Op) {
22967 for (SDNode *U : Op->users())
22968 if (U->getOpcode() != ISD::CopyToReg &&
22969 U->getOpcode() != ISD::SETCC &&
22970 U->getOpcode() != ISD::STORE)
22971 return false;
22972
22973 return true;
22974}
22975
22976/// Emit nodes that will be selected as "test Op0,Op0", or something
22977/// equivalent.
22978static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22979 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22980 // CF and OF aren't always set the way we want. Determine which
22981 // of these we need.
22982 bool NeedCF = false;
22983 bool NeedOF = false;
22984 switch (X86CC) {
22985 default: break;
22986 case X86::COND_A: case X86::COND_AE:
22987 case X86::COND_B: case X86::COND_BE:
22988 NeedCF = true;
22989 break;
22990 case X86::COND_G: case X86::COND_GE:
22991 case X86::COND_L: case X86::COND_LE:
22992 case X86::COND_O: case X86::COND_NO: {
22993 // Check if we really need to set the
22994 // Overflow flag. If NoSignedWrap is present
22995 // that is not actually needed.
22996 switch (Op->getOpcode()) {
22997 case ISD::ADD:
22998 case ISD::SUB:
22999 case ISD::MUL:
23000 case ISD::SHL:
23001 if (Op.getNode()->getFlags().hasNoSignedWrap())
23002 break;
23003 [[fallthrough]];
23004 default:
23005 NeedOF = true;
23006 break;
23007 }
23008 break;
23009 }
23010 }
23011 // See if we can use the EFLAGS value from the operand instead of
23012 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23013 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23014 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23015 // Emit a CMP with 0, which is the TEST pattern.
23016 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23017 DAG.getConstant(0, dl, Op.getValueType()));
23018 }
23019 unsigned Opcode = 0;
23020 unsigned NumOperands = 0;
23021
23022 SDValue ArithOp = Op;
23023
23024 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23025 // which may be the result of a CAST. We use the variable 'Op', which is the
23026 // non-casted variable when we check for possible users.
23027 switch (ArithOp.getOpcode()) {
23028 case ISD::AND:
23029 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23030 // because a TEST instruction will be better.
23031 if (!hasNonFlagsUse(Op))
23032 break;
23033
23034 [[fallthrough]];
23035 case ISD::ADD:
23036 case ISD::SUB:
23037 case ISD::OR:
23038 case ISD::XOR:
23039 if (!isProfitableToUseFlagOp(Op))
23040 break;
23041
23042 // Otherwise use a regular EFLAGS-setting instruction.
23043 switch (ArithOp.getOpcode()) {
23044 // clang-format off
23045 default: llvm_unreachable("unexpected operator!");
23046 case ISD::ADD: Opcode = X86ISD::ADD; break;
23047 case ISD::SUB: Opcode = X86ISD::SUB; break;
23048 case ISD::XOR: Opcode = X86ISD::XOR; break;
23049 case ISD::AND: Opcode = X86ISD::AND; break;
23050 case ISD::OR: Opcode = X86ISD::OR; break;
23051 // clang-format on
23052 }
23053
23054 NumOperands = 2;
23055 break;
23056 case X86ISD::ADD:
23057 case X86ISD::SUB:
23058 case X86ISD::OR:
23059 case X86ISD::XOR:
23060 case X86ISD::AND:
23061 return SDValue(Op.getNode(), 1);
23062 case ISD::SSUBO:
23063 case ISD::USUBO: {
23064 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23065 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23066 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23067 Op->getOperand(1)).getValue(1);
23068 }
23069 default:
23070 break;
23071 }
23072
23073 if (Opcode == 0) {
23074 // Emit a CMP with 0, which is the TEST pattern.
23075 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23076 DAG.getConstant(0, dl, Op.getValueType()));
23077 }
23078 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23079 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23080
23081 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23082 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23083 return SDValue(New.getNode(), 1);
23084}
23085
23086/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23087/// equivalent.
23088static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23089 const SDLoc &dl, SelectionDAG &DAG,
23090 const X86Subtarget &Subtarget) {
23091 if (isNullConstant(Op1))
23092 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23093
23094 EVT CmpVT = Op0.getValueType();
23095
23096 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23097 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23098
23099 // Only promote the compare up to I32 if it is a 16 bit operation
23100 // with an immediate. 16 bit immediates are to be avoided unless the target
23101 // isn't slowed down by length changing prefixes, we're optimizing for
23102 // codesize or the comparison is with a folded load.
23103 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23104 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23105 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23106 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23107 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23108 // Don't do this if the immediate can fit in 8-bits.
23109 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23110 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23111 unsigned ExtendOp =
23112 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23113 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23114 // For equality comparisons try to use SIGN_EXTEND if the input was
23115 // truncated from something with enough sign bits.
23116 if (Op0.getOpcode() == ISD::TRUNCATE) {
23117 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23118 ExtendOp = ISD::SIGN_EXTEND;
23119 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23120 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23121 ExtendOp = ISD::SIGN_EXTEND;
23122 }
23123 }
23124
23125 CmpVT = MVT::i32;
23126 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23127 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23128 }
23129 }
23130
23131 // Try to shrink i64 compares if the input has enough zero bits.
23132 // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
23133 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23134 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23135 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23136 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23137 CmpVT = MVT::i32;
23138 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23139 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23140 }
23141
23142 // 0-x == y --> x+y == 0
23143 // 0-x != y --> x+y != 0
23144 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23145 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23146 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23147 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23148 return Add.getValue(1);
23149 }
23150
23151 // x == 0-y --> x+y == 0
23152 // x != 0-y --> x+y != 0
23153 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23154 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23155 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23156 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23157 return Add.getValue(1);
23158 }
23159
23160 // Use SUB instead of CMP to enable CSE between SUB and CMP.
23161 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23162 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
23163 return Sub.getValue(1);
23164}
23165
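/// Return whether an (X & Y) == 0 style compare is preferable to the
/// (X & Y) == Y form; X86 keeps the latter form only for vector equality compares.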
23166 bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
23167 EVT VT) const {
23168 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
23169}
23170
23171bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23172 SDNode *N, SDValue, SDValue IntPow2) const {
23173 if (N->getOpcode() == ISD::FDIV)
23174 return true;
23175
23176 EVT FPVT = N->getValueType(0);
23177 EVT IntVT = IntPow2.getValueType();
23178
23179 // This indicates a non-free bitcast.
23180 // TODO: This is probably overly conservative as we will need to scale the
23181 // integer vector anyways for the int->fp cast.
23182 if (FPVT.isVector() &&
23183 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23184 return false;
23185
23186 return true;
23187}
23188
23189/// Check if replacement of SQRT with RSQRT should be disabled.
23190bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23191 EVT VT = Op.getValueType();
23192
23193 // We don't need to replace SQRT with RSQRT for half type.
23194 if (VT.getScalarType() == MVT::f16)
23195 return true;
23196
23197 // We never want to use both SQRT and RSQRT instructions for the same input.
23198 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23199 return false;
23200
23201 if (VT.isVector())
23202 return Subtarget.hasFastVectorFSQRT();
23203 return Subtarget.hasFastScalarFSQRT();
23204}
23205
23206/// The minimum architected relative accuracy is 2^-12. We need one
23207/// Newton-Raphson step to have a good float result (24 bits of precision).
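/// (For reference, one Newton-Raphson step refines an rsqrt estimate E of
/// 1/sqrt(X) as E' = E * (1.5 - 0.5 * X * E * E).)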
23208SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23209 SelectionDAG &DAG, int Enabled,
23210 int &RefinementSteps,
23211 bool &UseOneConstNR,
23212 bool Reciprocal) const {
23213 SDLoc DL(Op);
23214 EVT VT = Op.getValueType();
23215
23216 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23217 // It is likely not profitable to do this for f64 because a double-precision
23218 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23219 // instructions: convert to single, rsqrtss, convert back to double, refine
23220 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23221 // along with FMA, this could be a throughput win.
23222 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23223 // after legalize types.
23224 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23225 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23226 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23227 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23228 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23229 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23230 RefinementSteps = 1;
23231
23232 UseOneConstNR = false;
23233 // There is no FSQRT for 512-bits, but there is RSQRT14.
23234 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23235 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23236 if (RefinementSteps == 0 && !Reciprocal)
23237 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23238 return Estimate;
23239 }
23240
23241 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23242 Subtarget.hasFP16()) {
23243 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23244 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23245 RefinementSteps = 0;
23246
23247 if (VT == MVT::f16) {
23248 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23249 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23250 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23251 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23252 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23253 }
23254
23255 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23256 }
23257 return SDValue();
23258}
23259
23260/// The minimum architected relative accuracy is 2^-12. We need one
23261/// Newton-Raphson step to have a good float result (24 bits of precision).
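/// (For reference, one Newton-Raphson step refines a reciprocal estimate E of
/// 1/X as E' = E * (2 - X * E).)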
23262SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23263 int Enabled,
23264 int &RefinementSteps) const {
23265 SDLoc DL(Op);
23266 EVT VT = Op.getValueType();
23267
23268 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23269 // It is likely not profitable to do this for f64 because a double-precision
23270 // reciprocal estimate with refinement on x86 prior to FMA requires
23271 // 15 instructions: convert to single, rcpss, convert back to double, refine
23272 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23273 // along with FMA, this could be a throughput win.
23274
23275 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23276 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23277 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23278 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23279 // Enable estimate codegen with 1 refinement step for vector division.
23280 // Scalar division estimates are disabled because they break too much
23281 // real-world code. These defaults are intended to match GCC behavior.
23282 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23283 return SDValue();
23284
23285 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23286 RefinementSteps = 1;
23287
23288 // There is no FRCP for 512-bits, but there is RCP14.
23289 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23290 return DAG.getNode(Opcode, DL, VT, Op);
23291 }
23292
23293 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23294 Subtarget.hasFP16()) {
23295 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23296 RefinementSteps = 0;
23297
23298 if (VT == MVT::f16) {
23299 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23300 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23301 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23302 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23303 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23304 }
23305
23306 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23307 }
23308 return SDValue();
23309}
23310
23311/// If we have at least two divisions that use the same divisor, convert to
23312/// multiplication by a reciprocal. This may need to be adjusted for a given
23313/// CPU if a division's cost is not at least twice the cost of a multiplication.
23314/// This is because we still need one division to calculate the reciprocal and
23315/// then we need two multiplies by that reciprocal as replacements for the
23316/// original divisions.
23317unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23318 return 2;
23319}
23320
23321SDValue
23322X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23323 SelectionDAG &DAG,
23324 SmallVectorImpl<SDNode *> &Created) const {
23325 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23326 if (isIntDivCheap(N->getValueType(0), Attr))
23327 return SDValue(N,0); // Lower SDIV as SDIV
23328
23329 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23330 "Unexpected divisor!");
23331
23332 // Only perform this transform if CMOV is supported otherwise the select
23333 // below will become a branch.
23334 if (!Subtarget.canUseCMOV())
23335 return SDValue();
23336
23337 // fold (sdiv X, pow2)
23338 EVT VT = N->getValueType(0);
23339 // FIXME: Support i8.
23340 if (VT != MVT::i16 && VT != MVT::i32 &&
23341 !(Subtarget.is64Bit() && VT == MVT::i64))
23342 return SDValue();
23343
23344 // If the divisor is 2 or -2, the default expansion is better.
23345 if (Divisor == 2 ||
23346 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23347 return SDValue();
23348
23349 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23350}
23351
23352/// Result of 'and' is compared against zero. Change to a BT node if possible.
23353/// Returns the BT node and the condition code needed to use it.
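/// For example, (and X, (shl 1, N)) == 0 becomes BT X, N with condition
/// COND_AE (bit clear); the != 0 form uses COND_B (bit set).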
23354 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23355 SelectionDAG &DAG, X86::CondCode &X86CC) {
23356 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23357 SDValue Op0 = And.getOperand(0);
23358 SDValue Op1 = And.getOperand(1);
23359 if (Op0.getOpcode() == ISD::TRUNCATE)
23360 Op0 = Op0.getOperand(0);
23361 if (Op1.getOpcode() == ISD::TRUNCATE)
23362 Op1 = Op1.getOperand(0);
23363
23364 SDValue Src, BitNo;
23365 if (Op1.getOpcode() == ISD::SHL)
23366 std::swap(Op0, Op1);
23367 if (Op0.getOpcode() == ISD::SHL) {
23368 if (isOneConstant(Op0.getOperand(0))) {
23369 // If we looked past a truncate, check that it's only truncating away
23370 // known zeros.
23371 unsigned BitWidth = Op0.getValueSizeInBits();
23372 unsigned AndBitWidth = And.getValueSizeInBits();
23373 if (BitWidth > AndBitWidth) {
23374 KnownBits Known = DAG.computeKnownBits(Op0);
23375 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23376 return SDValue();
23377 }
23378 Src = Op1;
23379 BitNo = Op0.getOperand(1);
23380 }
23381 } else if (Op1.getOpcode() == ISD::Constant) {
23382 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23383 uint64_t AndRHSVal = AndRHS->getZExtValue();
23384 SDValue AndLHS = Op0;
23385
23386 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23387 Src = AndLHS.getOperand(0);
23388 BitNo = AndLHS.getOperand(1);
23389 } else {
23390 // Use BT if the immediate can't be encoded in a TEST instruction or we
23391 // are optimizing for size and the immediate won't fit in a byte.
23392 bool OptForSize = DAG.shouldOptForSize();
23393 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23394 isPowerOf2_64(AndRHSVal)) {
23395 Src = AndLHS;
23396 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23397 Src.getValueType());
23398 }
23399 }
23400 }
23401
23402 // No patterns found, give up.
23403 if (!Src.getNode())
23404 return SDValue();
23405
23406 // Remove any bit flip.
23407 if (isBitwiseNot(Src)) {
23408 Src = Src.getOperand(0);
23409 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23410 }
23411
23412 // Attempt to create the X86ISD::BT node.
23413 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23414 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23415 return BT;
23416 }
23417
23418 return SDValue();
23419}
23420
23421// Check if pre-AVX condcode can be performed by a single FCMP op.
23422static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23423 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23424}
23425
23426/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23427/// CMPs.
23428static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23429 SDValue &Op1, bool &IsAlwaysSignaling) {
23430 unsigned SSECC;
23431 bool Swap = false;
23432
23433 // SSE Condition code mapping:
23434 // 0 - EQ
23435 // 1 - LT
23436 // 2 - LE
23437 // 3 - UNORD
23438 // 4 - NEQ
23439 // 5 - NLT
23440 // 6 - NLE
23441 // 7 - ORD
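// AVX adds extended predicates; the two used below are:
// 8 - EQ_UQ (equal or unordered)
// 12 - NEQ_OQ (not equal and ordered)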
23442 switch (SetCCOpcode) {
23443 // clang-format off
23444 default: llvm_unreachable("Unexpected SETCC condition");
23445 case ISD::SETOEQ:
23446 case ISD::SETEQ: SSECC = 0; break;
23447 case ISD::SETOGT:
23448 case ISD::SETGT: Swap = true; [[fallthrough]];
23449 case ISD::SETLT:
23450 case ISD::SETOLT: SSECC = 1; break;
23451 case ISD::SETOGE:
23452 case ISD::SETGE: Swap = true; [[fallthrough]];
23453 case ISD::SETLE:
23454 case ISD::SETOLE: SSECC = 2; break;
23455 case ISD::SETUO: SSECC = 3; break;
23456 case ISD::SETUNE:
23457 case ISD::SETNE: SSECC = 4; break;
23458 case ISD::SETULE: Swap = true; [[fallthrough]];
23459 case ISD::SETUGE: SSECC = 5; break;
23460 case ISD::SETULT: Swap = true; [[fallthrough]];
23461 case ISD::SETUGT: SSECC = 6; break;
23462 case ISD::SETO: SSECC = 7; break;
23463 case ISD::SETUEQ: SSECC = 8; break;
23464 case ISD::SETONE: SSECC = 12; break;
23465 // clang-format on
23466 }
23467 if (Swap)
23468 std::swap(Op0, Op1);
23469
23470 switch (SetCCOpcode) {
23471 default:
23472 IsAlwaysSignaling = true;
23473 break;
23474 case ISD::SETEQ:
23475 case ISD::SETOEQ:
23476 case ISD::SETUEQ:
23477 case ISD::SETNE:
23478 case ISD::SETONE:
23479 case ISD::SETUNE:
23480 case ISD::SETO:
23481 case ISD::SETUO:
23482 IsAlwaysSignaling = false;
23483 break;
23484 }
23485
23486 return SSECC;
23487}
23488
23489/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23490/// concatenate the result back.
23491 static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23492 SelectionDAG &DAG, const SDLoc &dl) {
23493 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23494 "Unsupported VTs!");
23495 SDValue CC = DAG.getCondCode(Cond);
23496
23497 // Extract the LHS Lo/Hi vectors
23498 SDValue LHS1, LHS2;
23499 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23500
23501 // Extract the RHS Lo/Hi vectors
23502 SDValue RHS1, RHS2;
23503 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23504
23505 // Issue the operation on the smaller types and concatenate the result back
23506 EVT LoVT, HiVT;
23507 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23508 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23509 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23510 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23511}
23512
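/// Lower an integer VSETCC that produces a vXi1 mask under AVX-512,
/// canonicalizing SETLT to SETGT by swapping the operands.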
23513 static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23514 SelectionDAG &DAG) {
23515 SDValue Op0 = Op.getOperand(0);
23516 SDValue Op1 = Op.getOperand(1);
23517 SDValue CC = Op.getOperand(2);
23518 MVT VT = Op.getSimpleValueType();
23519 assert(VT.getVectorElementType() == MVT::i1 &&
23520 "Cannot set masked compare for this operation");
23521
23522 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23523
23524 // Prefer SETGT over SETLT.
23525 if (SetCCOpcode == ISD::SETLT) {
23526 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23527 std::swap(Op0, Op1);
23528 }
23529
23530 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23531}
23532
23533/// Given a buildvector constant, return a new vector constant with each element
23534/// incremented or decremented. If incrementing or decrementing would result in
23535/// unsigned overflow or underflow or this is not a simple vector constant,
23536/// return an empty value.
23537 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23538 bool NSW) {
23539 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23540 if (!BV || !V.getValueType().isSimple())
23541 return SDValue();
23542
23543 MVT VT = V.getSimpleValueType();
23544 MVT EltVT = VT.getVectorElementType();
23545 unsigned NumElts = VT.getVectorNumElements();
23546 SmallVector<SDValue, 8> NewVecC;
23547 SDLoc DL(V);
23548 for (unsigned i = 0; i < NumElts; ++i) {
23549 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23550 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23551 return SDValue();
23552
23553 // Avoid overflow/underflow.
23554 const APInt &EltC = Elt->getAPIntValue();
23555 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23556 return SDValue();
23557 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23558 (!IsInc && EltC.isMinSignedValue())))
23559 return SDValue();
23560
23561 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23562 }
23563
23564 return DAG.getBuildVector(VT, DL, NewVecC);
23565}
23566
23567/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23568/// Op0 u<= Op1:
23569/// t = psubus Op0, Op1
23570/// pcmpeq t, <0..0>
23571 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23572 ISD::CondCode Cond, const SDLoc &dl,
23573 const X86Subtarget &Subtarget,
23574 SelectionDAG &DAG) {
23575 if (!Subtarget.hasSSE2())
23576 return SDValue();
23577
23578 MVT VET = VT.getVectorElementType();
23579 if (VET != MVT::i8 && VET != MVT::i16)
23580 return SDValue();
23581
23582 switch (Cond) {
23583 default:
23584 return SDValue();
23585 case ISD::SETULT: {
23586 // If the comparison is against a constant we can turn this into a
23587 // setule. With psubus, setule does not require a swap. This is
23588 // beneficial because the constant in the register is no longer
23589 // clobbered as the destination, so it can be hoisted out of a loop.
23590 // Only do this pre-AVX since vpcmp* is no longer destructive.
23591 if (Subtarget.hasAVX())
23592 return SDValue();
23593 SDValue ULEOp1 =
23594 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23595 if (!ULEOp1)
23596 return SDValue();
23597 Op1 = ULEOp1;
23598 break;
23599 }
23600 case ISD::SETUGT: {
23601 // If the comparison is against a constant, we can turn this into a setuge.
23602 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23603 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23604 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23605 SDValue UGEOp1 =
23606 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23607 if (!UGEOp1)
23608 return SDValue();
23609 Op1 = Op0;
23610 Op0 = UGEOp1;
23611 break;
23612 }
23613 // Psubus is better than flip-sign because it requires no inversion.
23614 case ISD::SETUGE:
23615 std::swap(Op0, Op1);
23616 break;
23617 case ISD::SETULE:
23618 break;
23619 }
23620
23621 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23622 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23623 DAG.getConstant(0, dl, VT));
23624}
23625
23626static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23627 SelectionDAG &DAG) {
23628 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23629 Op.getOpcode() == ISD::STRICT_FSETCCS;
23630 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23631 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23632 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23633 MVT VT = Op->getSimpleValueType(0);
23634 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23635 MVT OpVT = Op0.getSimpleValueType();
23636 SDLoc dl(Op);
23637
23638 if (OpVT.isFloatingPoint()) {
23639 MVT EltVT = OpVT.getVectorElementType();
23640 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
23641 EltVT == MVT::f64);
23642
23643 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23644 if (isSoftF16(EltVT, Subtarget)) {
23645 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
23646 return SDValue();
23647
23648 // Break 256-bit FP vector compare into smaller ones.
23649 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
23650 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23651
23652 // Break 512-bit FP vector compare into smaller ones.
23653 if (OpVT.is512BitVector())
23654 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23655
23656 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
23657 if (IsStrict) {
23658 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
23659 {Chain, Op0});
23660 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
23661 {Chain, Op1});
23662 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
23663 {Chain, Op0, Op1, CC});
23664 }
23665 MVT DVT = VT.getVectorElementType() == MVT::i16
23666 ? VT.changeVectorElementType(MVT::i32)
23667 : VT;
23668 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
23669 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
23670 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
23671 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
23672 }
23673
23674 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23675
23676 // If we have a strict compare with a vXi1 result and the input is 128/256
23677 // bits we can't use a masked compare unless we have VLX. If we use a wider
23678 // compare like we do for non-strict, we might trigger spurious exceptions
23679 // from the upper elements. Instead emit an AVX compare and convert to a mask.
23680 unsigned Opc;
23681 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23682 (!IsStrict || Subtarget.hasVLX() ||
23683 Op0.getSimpleValueType().is512BitVector())) {
23684#ifndef NDEBUG
23685 unsigned Num = VT.getVectorNumElements();
23686 assert(Num <= 16 ||
23687 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
23688#endif
23689 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23690 } else {
23691 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23692 // The SSE/AVX packed FP comparison nodes are defined with a
23693 // floating-point vector result that matches the operand type. This allows
23694 // them to work with an SSE1 target (integer vector types are not legal).
23695 VT = Op0.getSimpleValueType();
23696 }
23697
23698 SDValue Cmp;
23699 bool IsAlwaysSignaling;
23700 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23701 if (!Subtarget.hasAVX()) {
23702 // TODO: We could use the following steps to handle a quiet compare with
23703 // signaling encodings.
23704 // 1. Get ordered masks from a quiet ISD::SETO
23705 // 2. Use the masks to mask potential unordered elements in operand A, B
23706 // 3. Get the compare results of masked A, B
23707 // 4. Calculate the final result using the mask and the result from step 3
23708 // But currently, we just fall back to scalar operations.
23709 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23710 return SDValue();
23711
23712 // Insert an extra signaling instruction to raise exception.
23713 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23714 SDValue SignalCmp = DAG.getNode(
23715 Opc, dl, {VT, MVT::Other},
23716 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23717 // FIXME: It seems we need to update the flags of all new strict nodes.
23718 // Otherwise, mayRaiseFPException in MI will return false due to
23719 // NoFPExcept = false by default. However, I didn't find it in other
23720 // patches.
23721 SignalCmp->setFlags(Op->getFlags());
23722 Chain = SignalCmp.getValue(1);
23723 }
23724
23725 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23726 // emit two comparisons and a logic op to tie them together.
23727 if (!cheapX86FSETCC_SSE(Cond)) {
23728 // LLVM predicate is SETUEQ or SETONE.
23729 unsigned CC0, CC1;
23730 unsigned CombineOpc;
23731 if (Cond == ISD::SETUEQ) {
23732 CC0 = 3; // UNORD
23733 CC1 = 0; // EQ
23734 CombineOpc = X86ISD::FOR;
23735 } else {
23736 assert(Cond == ISD::SETONE);
23737 CC0 = 7; // ORD
23738 CC1 = 4; // NEQ
23739 CombineOpc = X86ISD::FAND;
23740 }
23741
23742 SDValue Cmp0, Cmp1;
23743 if (IsStrict) {
23744 Cmp0 = DAG.getNode(
23745 Opc, dl, {VT, MVT::Other},
23746 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23747 Cmp1 = DAG.getNode(
23748 Opc, dl, {VT, MVT::Other},
23749 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23750 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23751 Cmp1.getValue(1));
23752 } else {
23753 Cmp0 = DAG.getNode(
23754 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23755 Cmp1 = DAG.getNode(
23756 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23757 }
23758 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23759 } else {
23760 if (IsStrict) {
23761 Cmp = DAG.getNode(
23762 Opc, dl, {VT, MVT::Other},
23763 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23764 Chain = Cmp.getValue(1);
23765 } else
23766 Cmp = DAG.getNode(
23767 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23768 }
23769 } else {
23770 // Handle all other FP comparisons here.
23771 if (IsStrict) {
23772 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23773 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23774 Cmp = DAG.getNode(
23775 Opc, dl, {VT, MVT::Other},
23776 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23777 Chain = Cmp.getValue(1);
23778 } else
23779 Cmp = DAG.getNode(
23780 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23781 }
23782
23783 if (VT.getFixedSizeInBits() >
23784 Op.getSimpleValueType().getFixedSizeInBits()) {
23785 // We emitted a compare with an XMM/YMM result. Finish converting to a
23786 // mask register using a vptestm.
23787 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23788 Cmp = DAG.getBitcast(CastVT, Cmp);
23789 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23790 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23791 } else {
23792 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23793 // the result type of SETCC. The bitcast is expected to be optimized
23794 // away during combining/isel.
23795 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23796 }
23797
23798 if (IsStrict)
23799 return DAG.getMergeValues({Cmp, Chain}, dl);
23800
23801 return Cmp;
23802 }
23803
23804 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23805
23806 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
23807 assert(VTOp0 == Op1.getSimpleValueType() &&
23808 "Expected operands with same type!");
23809 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23810 "Invalid number of packed elements for source and destination!");
23811
23812 // The non-AVX512 code below works under the assumption that source and
23813 // destination types are the same.
23814 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23815 "Value types for source and destination must be the same!");
23816
23817 // The result is boolean, but operands are int/float
23818 if (VT.getVectorElementType() == MVT::i1) {
23819 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23820 // but there is no compare instruction for i8 and i16 elements in KNL.
23821 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23822 "Unexpected operand type");
23823 return LowerIntVSETCC_AVX512(Op, dl, DAG);
23824 }
23825
23826 // Lower using XOP integer comparisons.
23827 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23828 // Translate compare code to XOP PCOM compare mode.
23829 unsigned CmpMode = 0;
23830 switch (Cond) {
23831 // clang-format off
23832 default: llvm_unreachable("Unexpected SETCC condition");
23833 case ISD::SETULT:
23834 case ISD::SETLT: CmpMode = 0x00; break;
23835 case ISD::SETULE:
23836 case ISD::SETLE: CmpMode = 0x01; break;
23837 case ISD::SETUGT:
23838 case ISD::SETGT: CmpMode = 0x02; break;
23839 case ISD::SETUGE:
23840 case ISD::SETGE: CmpMode = 0x03; break;
23841 case ISD::SETEQ: CmpMode = 0x04; break;
23842 case ISD::SETNE: CmpMode = 0x05; break;
23843 // clang-format on
23844 }
23845
23846 // Are we comparing unsigned or signed integers?
23847 unsigned Opc =
23848 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23849 
23850 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23851 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23852 }
23853
23854 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23855 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23856 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23857 SDValue BC0 = peekThroughBitcasts(Op0);
23858 if (BC0.getOpcode() == ISD::AND &&
23859 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
23860 /*AllowUndefs=*/false)) {
23861 Cond = ISD::SETEQ;
23862 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23863 }
23864 }
23865
23866 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23867 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23868 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23869 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23870 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23871 unsigned BitWidth = VT.getScalarSizeInBits();
23872 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23873
23874 SDValue Result = Op0.getOperand(0);
23875 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23876 DAG.getConstant(ShiftAmt, dl, VT));
23877 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23878 DAG.getConstant(BitWidth - 1, dl, VT));
23879 return Result;
23880 }
23881 }
23882
23883 // Break 256-bit integer vector compare into smaller ones.
23884 if (VT.is256BitVector() && !Subtarget.hasInt256())
23885 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23886
23887 // Break 512-bit integer vector compare into smaller ones.
23888 // TODO: Try harder to use VPCMPx + VPMOV2x?
23889 if (VT.is512BitVector())
23890 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23891
23892 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23893 // not-of-PCMPEQ:
23894 // X != INT_MIN --> X >s INT_MIN
23895 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23896 // +X != 0 --> +X >s 0
23897 APInt ConstValue;
23898 if (Cond == ISD::SETNE &&
23899 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23900 if (ConstValue.isMinSignedValue())
23901 Cond = ISD::SETGT;
23902 else if (ConstValue.isMaxSignedValue())
23903 Cond = ISD::SETLT;
23904 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23905 Cond = ISD::SETGT;
23906 }
23907
23908 // If both operands are known non-negative, then an unsigned compare is the
23909 // same as a signed compare and there's no need to flip signbits.
23910 // TODO: We could check for more general simplifications here since we're
23911 // computing known bits.
23912 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23913 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23914
23915 // Special case: Use min/max operations for unsigned compares.
23916 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23917 if (ISD::isUnsignedIntSetCC(Cond) &&
23918 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23919 TLI.isOperationLegal(ISD::UMIN, VT)) {
23920 // If we have a constant operand, increment/decrement it and change the
23921 // condition to avoid an invert.
23922 if (Cond == ISD::SETUGT) {
23923 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23924 if (SDValue UGTOp1 =
23925 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23926 Op1 = UGTOp1;
23927 Cond = ISD::SETUGE;
23928 }
23929 }
23930 if (Cond == ISD::SETULT) {
23931 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23932 if (SDValue ULTOp1 =
23933 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23934 Op1 = ULTOp1;
23935 Cond = ISD::SETULE;
23936 }
23937 }
23938 bool Invert = false;
23939 unsigned Opc;
23940 switch (Cond) {
23941 // clang-format off
23942 default: llvm_unreachable("Unexpected condition code");
23943 case ISD::SETUGT: Invert = true; [[fallthrough]];
23944 case ISD::SETULE: Opc = ISD::UMIN; break;
23945 case ISD::SETULT: Invert = true; [[fallthrough]];
23946 case ISD::SETUGE: Opc = ISD::UMAX; break;
23947 // clang-format on
23948 }
23949
23950 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23951 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23952
23953 // If the logical-not of the result is required, perform that now.
23954 if (Invert)
23955 Result = DAG.getNOT(dl, Result, VT);
23956
23957 return Result;
23958 }
23959
23960 // Try to use SUBUS and PCMPEQ.
23961 if (FlipSigns)
23962 if (SDValue V =
23963 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23964 return V;
23965
23966 // We are handling one of the integer comparisons here. Since SSE only has
23967 // GT and EQ comparisons for integer, swapping operands and multiple
23968 // operations may be required for some comparisons.
23969 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23970 : X86ISD::PCMPGT;
23971 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23972 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23973 bool Invert = Cond == ISD::SETNE ||
23974 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23975 
23976 if (Swap)
23977 std::swap(Op0, Op1);
23978
23979 // Check that the operation in question is available (most are plain SSE2,
23980 // but PCMPGTQ and PCMPEQQ have different requirements).
23981 if (VT == MVT::v2i64) {
23982 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23983 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23984
23985 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23986 // the odd elements over the even elements.
23987 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23988 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23989 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23990
23991 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23992 static const int MaskHi[] = { 1, 1, 3, 3 };
23993 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23994
23995 return DAG.getBitcast(VT, Result);
23996 }
23997
23998 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23999 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24000 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24001
24002 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24003 static const int MaskHi[] = { 1, 1, 3, 3 };
24004 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24005
24006 return DAG.getBitcast(VT, Result);
24007 }
24008
24009 // If the i64 elements are sign-extended enough to be representable as i32
24010 // then we can compare the lower i32 bits and splat.
24011 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24012 DAG.ComputeNumSignBits(Op1) > 32) {
24013 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24014 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24015
24016 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24017 static const int MaskLo[] = {0, 0, 2, 2};
24018 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24019
24020 return DAG.getBitcast(VT, Result);
24021 }
24022
24023 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24024 // bits of the inputs before performing those operations. The lower
24025 // compare is always unsigned.
24026 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24027 : 0x0000000080000000ULL,
24028 dl, MVT::v2i64);
24029
24030 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24031 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24032
24033 // Cast everything to the right type.
24034 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24035 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24036
24037 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24038 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24039 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24040
24041 // Create masks for only the low parts/high parts of the 64 bit integers.
24042 static const int MaskHi[] = { 1, 1, 3, 3 };
24043 static const int MaskLo[] = { 0, 0, 2, 2 };
24044 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24045 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24046 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24047
24048 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24049 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24050
24051 if (Invert)
24052 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24053
24054 return DAG.getBitcast(VT, Result);
24055 }
24056
24057 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24058 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24059 // pcmpeqd + pshufd + pand.
24060 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24061
24062 // First cast everything to the right type.
24063 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24064 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24065
24066 // Do the compare.
24067 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24068
24069 // Make sure the lower and upper halves are both all-ones.
24070 static const int Mask[] = { 1, 0, 3, 2 };
24071 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24072 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24073
24074 if (Invert)
24075 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24076
24077 return DAG.getBitcast(VT, Result);
24078 }
24079 }
24080
24081 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24082 // bits of the inputs before performing those operations.
24083 if (FlipSigns) {
24084 MVT EltVT = VT.getVectorElementType();
24085 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24086 VT);
24087 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24088 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24089 }
24090
24091 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24092
24093 // If the logical-not of the result is required, perform that now.
24094 if (Invert)
24095 Result = DAG.getNOT(dl, Result, VT);
24096
24097 return Result;
24098}
24099
24100// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24101 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24102 const SDLoc &dl, SelectionDAG &DAG,
24103 const X86Subtarget &Subtarget,
24104 SDValue &X86CC) {
24105 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24106
24107 // Must be a bitcast from vXi1.
24108 if (Op0.getOpcode() != ISD::BITCAST)
24109 return SDValue();
24110
24111 Op0 = Op0.getOperand(0);
24112 MVT VT = Op0.getSimpleValueType();
24113 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24114 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24115 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24116 return SDValue();
24117
24118 X86::CondCode X86Cond;
24119 if (isNullConstant(Op1)) {
24120 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24121 } else if (isAllOnesConstant(Op1)) {
24122 // C flag is set for all ones.
24123 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24124 } else
24125 return SDValue();
24126
24127 // If the input is an AND, we can combine its operands into the KTEST.
24128 bool KTestable = false;
24129 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24130 KTestable = true;
24131 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24132 KTestable = true;
24133 if (!isNullConstant(Op1))
24134 KTestable = false;
24135 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24136 SDValue LHS = Op0.getOperand(0);
24137 SDValue RHS = Op0.getOperand(1);
24138 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24139 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24140 }
24141
24142 // If the input is an OR, we can combine its operands into the KORTEST.
24143 SDValue LHS = Op0;
24144 SDValue RHS = Op0;
24145 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24146 LHS = Op0.getOperand(0);
24147 RHS = Op0.getOperand(1);
24148 }
24149
24150 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24151 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24152}
24153
24154/// Emit flags for the given setcc condition and operands. Also returns the
24155/// corresponding X86 condition code constant in X86CC.
24156SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24157 ISD::CondCode CC, const SDLoc &dl,
24158 SelectionDAG &DAG,
24159 SDValue &X86CC) const {
24160 // Equality Combines.
24161 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24162 X86::CondCode X86CondCode;
24163
24164 // Optimize to BT if possible.
24165 // Lower (X & (1 << N)) == 0 to BT(X, N).
24166 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24167 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24168 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24169 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24170 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24171 return BT;
24172 }
24173 }
24174
24175 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24176 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24177 X86CondCode)) {
24178 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24179 return CmpZ;
24180 }
24181
24182 // Try to lower using KORTEST or KTEST.
24183 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24184 return Test;
24185
24186 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24187 // of these.
24188 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24189 // If the input is a setcc, then reuse the input setcc or use a new one
24190 // with the inverted condition.
24191 if (Op0.getOpcode() == X86ISD::SETCC) {
24192 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24193
24194 X86CC = Op0.getOperand(0);
24195 if (Invert) {
24196 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24197 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24198 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24199 }
24200
24201 return Op0.getOperand(1);
24202 }
24203 }
24204
24205 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24206 // overflow.
24207 if (isMinSignedConstant(Op1)) {
24208 EVT VT = Op0.getValueType();
24209 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24210 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24211 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24212 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24213 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24214 DAG.getConstant(0, dl, VT), Op0);
24215 return SDValue(Neg.getNode(), 1);
24216 }
24217 }
24218
24219 // Try to use the carry flag from the add in place of a separate CMP for:
24220 // (seteq (add X, -1), -1). Similar for setne.
24221 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24222 Op0.getOperand(1) == Op1) {
24223 if (isProfitableToUseFlagOp(Op0)) {
24224 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24225
24226 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24227 Op0.getOperand(1));
24228 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24229 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24230 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24231 return SDValue(New.getNode(), 1);
24232 }
24233 }
24234 }
24235
24236 X86::CondCode CondCode =
24237 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24238 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24239
24240 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24241 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24242 return EFLAGS;
24243}
24244
24245SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24246
24247 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24248 Op.getOpcode() == ISD::STRICT_FSETCCS;
24249 MVT VT = Op->getSimpleValueType(0);
24250
24251 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24252
24253 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24254 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24255 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24256 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24257 SDLoc dl(Op);
24258 ISD::CondCode CC =
24259 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24260
24261 if (isSoftF16(Op0.getValueType(), Subtarget))
24262 return SDValue();
24263
24264 // Handle f128 first, since one possible outcome is a normal integer
24265 // comparison which gets handled by emitFlagsForSetcc.
24266 if (Op0.getValueType() == MVT::f128) {
24267 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24268 Op.getOpcode() == ISD::STRICT_FSETCCS);
24269
24270 // If softenSetCCOperands returned a scalar, use it.
24271 if (!Op1.getNode()) {
24272 assert(Op0.getValueType() == Op.getValueType() &&
24273 "Unexpected setcc expansion!");
24274 if (IsStrict)
24275 return DAG.getMergeValues({Op0, Chain}, dl);
24276 return Op0;
24277 }
24278 }
24279
24280 if (Op0.getSimpleValueType().isInteger()) {
24281 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24282 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
24283 // this may translate to fewer uops depending on the uarch implementation. The
24284 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24285 // canonicalize to that CondCode.
24286 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24287 // encoding size - so it must either already be a i8 or i32 immediate, or it
24288 // shrinks down to that. We don't do this for any i64's to avoid additional
24289 // constant materializations.
24290 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
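// e.g. (setugt X, 7) --> (setuge X, 8), as long as the new immediate is no
// wider to encode than the old one.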
24291 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24292 const APInt &Op1Val = Op1C->getAPIntValue();
24293 if (!Op1Val.isZero()) {
24294 // Ensure the constant+1 doesn't overflow.
24295 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24296 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24297 APInt Op1ValPlusOne = Op1Val + 1;
24298 if (Op1ValPlusOne.isSignedIntN(32) &&
24299 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24300 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24301 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24302 : ISD::CondCode::SETUGE;
24303 }
24304 }
24305 }
24306 }
24307
24308 SDValue X86CC;
24309 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24310 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24311 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24312 }
24313
24314 if (Subtarget.hasAVX10_2()) {
24315 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24316 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24317 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24318 if (Op0.getSimpleValueType() != MVT::f80)
24319 return getSETCC(
24320 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24321 }
24322 }
24323 // Handle floating point.
24324 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24325 if (CondCode == X86::COND_INVALID)
24326 return SDValue();
24327
24328 SDValue EFLAGS;
24329 if (IsStrict) {
24330 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24331 EFLAGS =
24332 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24333 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24334 Chain = EFLAGS.getValue(1);
24335 } else {
24336 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24337 }
24338
24339 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24340 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24341 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24342}
24343
24344SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24345 SDValue LHS = Op.getOperand(0);
24346 SDValue RHS = Op.getOperand(1);
24347 SDValue Carry = Op.getOperand(2);
24348 SDValue Cond = Op.getOperand(3);
24349 SDLoc DL(Op);
24350
24351 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24352 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24353
24354 // Recreate the carry if needed.
24355 EVT CarryVT = Carry.getValueType();
24356 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24357 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24358
24359 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24360 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24361 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24362}
24363
24364// This function returns three things: the arithmetic computation itself
24365// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24366// flag and the condition code define the case in which the arithmetic
24367// computation overflows.
24368static std::pair<SDValue, SDValue>
24369 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24370 assert(Op.getResNo() == 0 && "Unexpected result number!");
24371 SDValue Value, Overflow;
24372 SDValue LHS = Op.getOperand(0);
24373 SDValue RHS = Op.getOperand(1);
24374 unsigned BaseOp = 0;
24375 SDLoc DL(Op);
24376 switch (Op.getOpcode()) {
24377 default: llvm_unreachable("Unknown ovf instruction!");
24378 case ISD::SADDO:
24379 BaseOp = X86ISD::ADD;
24380 Cond = X86::COND_O;
24381 break;
24382 case ISD::UADDO:
24383 BaseOp = X86ISD::ADD;
24384 Cond = X86::COND_B;
24385 break;
24386 case ISD::SSUBO:
24387 BaseOp = X86ISD::SUB;
24388 Cond = X86::COND_O;
24389 break;
24390 case ISD::USUBO:
24391 BaseOp = X86ISD::SUB;
24392 Cond = X86::COND_B;
24393 break;
24394 case ISD::SMULO:
24395 BaseOp = X86ISD::SMUL;
24396 Cond = X86::COND_O;
24397 break;
24398 case ISD::UMULO:
24399 BaseOp = X86ISD::UMUL;
24400 Cond = X86::COND_O;
24401 break;
24402 }
24403
24404 if (BaseOp) {
24405 // Also sets EFLAGS.
24406 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24407 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24408 Overflow = Value.getValue(1);
24409 }
24410
24411 return std::make_pair(Value, Overflow);
24412}
24413
24414 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24415 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24416 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24417 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24418 // has only one use.
24419 SDLoc DL(Op);
24420 X86::CondCode Cond;
24421 SDValue Value, Overflow;
24422 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24423
24424 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24425 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24426 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24427}
24428
24429/// Return true if opcode is a X86 logical comparison.
24430 static bool isX86LogicalCmp(SDValue Op) {
24431 unsigned Opc = Op.getOpcode();
24432 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24433 Opc == X86ISD::FCMP)
24434 return true;
24435 if (Op.getResNo() == 1 &&
24436 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24437 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24438 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24439 return true;
24440
24441 return false;
24442}
24443
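/// Return true if V is a truncate whose discarded high bits are known to be
/// zero, i.e. the truncation loses no information.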
24444 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24445 if (V.getOpcode() != ISD::TRUNCATE)
24446 return false;
24447
24448 SDValue VOp0 = V.getOperand(0);
24449 unsigned InBits = VOp0.getValueSizeInBits();
24450 unsigned Bits = V.getValueSizeInBits();
24451 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24452}
24453
24454// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24455 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24456 unsigned X86CC, const SDLoc &DL,
24457 SelectionDAG &DAG,
24458 const X86Subtarget &Subtarget) {
24459 EVT CmpVT = CmpVal.getValueType();
24460 EVT VT = LHS.getValueType();
24461 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24462 return SDValue();
24463
24464 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24465 isOneConstant(CmpVal.getOperand(1))) {
24466 auto SplatLSB = [&](EVT SplatVT) {
24467 // We need a mask of all zeros or all ones with the same size as the
24468 // other operands.
24469 SDValue Neg = CmpVal;
24470 if (CmpVT.bitsGT(SplatVT))
24471 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24472 else if (CmpVT.bitsLT(SplatVT))
24473 Neg = DAG.getNode(
24474 ISD::AND, DL, SplatVT,
24475 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24476 DAG.getConstant(1, DL, SplatVT));
24477 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24478 };
24479
24480 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24481 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24482 return SplatLSB(VT);
24483
24484 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24485 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24486 isa<ConstantSDNode>(RHS)) {
24487 SDValue Mask = SplatLSB(VT);
24488 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24489 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24490 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24491 }
24492
24493 SDValue Src1, Src2;
24494 auto isIdentityPatternZero = [&]() {
24495 switch (RHS.getOpcode()) {
24496 default:
24497 break;
24498 case ISD::OR:
24499 case ISD::XOR:
24500 case ISD::ADD:
24501 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24502 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24503 Src2 = LHS;
24504 return true;
24505 }
24506 break;
24507 case ISD::SHL:
24508 case ISD::SRA:
24509 case ISD::SRL:
24510 case ISD::SUB:
24511 if (RHS.getOperand(0) == LHS) {
24512 Src1 = RHS.getOperand(1);
24513 Src2 = LHS;
24514 return true;
24515 }
24516 break;
24517 }
24518 return false;
24519 };
24520
24521 auto isIdentityPatternOnes = [&]() {
24522 switch (LHS.getOpcode()) {
24523 default:
24524 break;
24525 case ISD::AND:
24526 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24527 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24528 Src2 = RHS;
24529 return true;
24530 }
24531 break;
24532 }
24533 return false;
24534 };
24535
24536 // Convert 'identity' patterns (iff X is 0 or 1):
24537 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24538 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24539 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24540 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24541 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24542 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24543 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24544 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24545 SDValue Mask = SplatLSB(Src1.getValueType());
24546 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24547 Src1); // Mask & z
24548 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24549 }
24550 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24551 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24552 SDValue Mask = SplatLSB(VT);
24553 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24554 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24555 }
24556 }
24557
24558 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24559 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24560 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24561 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24562
24563 // 'X - 1' sets the carry flag if X == 0.
24564 // '0 - X' sets the carry flag if X != 0.
24565 // Convert the carry flag to a -1/0 mask with sbb:
24566 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24567 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24568 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24569 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24570 SDValue Sub;
24571 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24572 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24573 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24574 } else {
24575 SDValue One = DAG.getConstant(1, DL, CmpVT);
24576 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24577 }
24578 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24579 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24580 Sub.getValue(1));
24581 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24582 }
24583
24584 return SDValue();
24585}
24586
24587SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24588 bool AddTest = true;
24589 SDValue Cond = Op.getOperand(0);
24590 SDValue Op1 = Op.getOperand(1);
24591 SDValue Op2 = Op.getOperand(2);
24592 SDLoc DL(Op);
24593 MVT VT = Op1.getSimpleValueType();
24594 SDValue CC;
24595
24596 if (isSoftF16(VT, Subtarget)) {
24597 MVT NVT = VT.changeTypeToInteger();
24598 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24599 DAG.getBitcast(NVT, Op1),
24600 DAG.getBitcast(NVT, Op2)));
24601 }
24602
24603 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24604 // are available or VBLENDV if AVX is available.
24605 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24606 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24607 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24608 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24609 bool IsAlwaysSignaling;
24610 unsigned SSECC =
24611 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24612 CondOp0, CondOp1, IsAlwaysSignaling);
24613
24614 if (Subtarget.hasAVX512()) {
24615 SDValue Cmp =
24616 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24617 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24618 assert(!VT.isVector() && "Not a scalar type?");
24619 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24620 }
24621
24622 if (SSECC < 8 || Subtarget.hasAVX()) {
24623 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24624 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24625
24626 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24627 // of 3 logic instructions for size savings and potentially speed.
24628 // Unfortunately, there is no scalar form of VBLENDV.
24629
24630 // If either operand is a +0.0 constant, don't try this. We can expect to
24631 // optimize away at least one of the logic instructions later in that
24632 // case, so that sequence would be faster than a variable blend.
24633
24634 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24635 // uses XMM0 as the selection register. That may need just as many
24636 // instructions as the AND/ANDN/OR sequence due to register moves, so
24637 // don't bother.
24638 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24639 !isNullFPConstant(Op2)) {
24640 // Convert to vectors, do a VSELECT, and convert back to scalar.
24641 // All of the conversions should be optimized away.
24642 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24643 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24644 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24645 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24646
24647 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24648 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24649
24650 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24651
24652 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
24653 DAG.getVectorIdxConstant(0, DL));
24654 }
24655 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24656 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24657 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24658 }
24659 }
24660
24661 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24662 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24663 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24664 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24665 }
24666
24667 if (Cond.getOpcode() == ISD::SETCC &&
24668 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
24669 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24670 Cond = NewCond;
24671 // If the condition was updated, it's possible that the operands of the
24672 // select were also updated (for example, EmitTest has a RAUW). Refresh
24673 // the local references to the select operands in case they got stale.
24674 Op1 = Op.getOperand(1);
24675 Op2 = Op.getOperand(2);
24676 }
24677 }
24678
24679 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24680 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24681 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24682 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24683 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24684 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24685 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24686 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
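// e.g. for the smin(x, 0) form on i32: x >> 31 (arithmetic) is all-ones when
// x is negative and zero otherwise, so AND-ing it with x keeps x only when
// x < 0 and produces 0 otherwise; the smax form just inverts that mask.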
24687 if (Cond.getOpcode() == X86ISD::SETCC &&
24688 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24689 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24690 SDValue Cmp = Cond.getOperand(1);
24691 SDValue CmpOp0 = Cmp.getOperand(0);
24692 unsigned CondCode = Cond.getConstantOperandVal(0);
24693
24694 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24695 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24696 // handling to keep the CMP with 0. This should be removed by
24697 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24698 // cttz_zero_undef.
24699 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24700 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24701 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24702 };
24703 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24704 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24705 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24706 // Keep Cmp.
24707 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
24708 DL, DAG, Subtarget)) {
24709 return R;
24710 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24711 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24712 ((CondCode == X86::COND_S) || // smin(x, 0)
24713 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24714 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24715 //
24716 // If the comparison is testing for a positive value, we have to invert
24717 // the sign bit mask, so only do that transform if the target has a
24718 // bitwise 'and not' instruction (the invert is free).
24719 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24720 unsigned ShCt = VT.getSizeInBits() - 1;
24721 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24722 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24723 if (CondCode == X86::COND_G)
24724 Shift = DAG.getNOT(DL, Shift, VT);
24725 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24726 }
24727 }
24728
24729 // Look past (and (setcc_carry (cmp ...)), 1).
24730 if (Cond.getOpcode() == ISD::AND &&
24731 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24732 isOneConstant(Cond.getOperand(1)))
24733 Cond = Cond.getOperand(0);
24734
24735 // Attempt to fold "raw cond" cases by treating them as:
24736 // (select (and X, 1), Op1, Op2) --> (select (icmpeq (and X, 1), 0), Op2, Op1)
24737 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
24738 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
24739 Subtarget))
24740 return R;
24741
24742 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24743 // setting operand in place of the X86ISD::SETCC.
24744 unsigned CondOpcode = Cond.getOpcode();
24745 if (CondOpcode == X86ISD::SETCC ||
24746 CondOpcode == X86ISD::SETCC_CARRY) {
24747 CC = Cond.getOperand(0);
24748
24749 SDValue Cmp = Cond.getOperand(1);
24750 bool IllegalFPCMov = false;
24751 if (VT.isFloatingPoint() && !VT.isVector() &&
24752 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24753 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24754
24755 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24756 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24757 Cond = Cmp;
24758 AddTest = false;
24759 }
24760 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24761 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24762 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24763 SDValue Value;
24764 X86::CondCode X86Cond;
24765 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24766
24767 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24768 AddTest = false;
24769 }
24770
24771 if (AddTest) {
24772 // Look past the truncate if the high bits are known zero.
24773 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24774 Cond = Cond.getOperand(0);
24775
24776 // We know the result of AND is compared against zero. Try to match
24777 // it to BT.
24778 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24779 X86::CondCode X86CondCode;
24780 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24781 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24782 Cond = BT;
24783 AddTest = false;
24784 }
24785 }
24786 }
24787
24788 if (AddTest) {
24789 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24790 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24791 }
24792
24793 // a < b ? -1 : 0 -> RES = setcc_carry
24794 // a < b ? 0 : -1 -> RES = ~setcc_carry
24795 // a >= b ? -1 : 0 -> RES = ~setcc_carry
24796 // a >= b ? 0 : -1 -> RES = setcc_carry
24797 if (Cond.getOpcode() == X86ISD::SUB) {
24798 unsigned CondCode = CC->getAsZExtVal();
24799
24800 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24801 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24802 (isNullConstant(Op1) || isNullConstant(Op2))) {
24803 SDValue Res =
24804 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24805 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24806 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24807 return DAG.getNOT(DL, Res, Res.getValueType());
24808 return Res;
24809 }
24810 }
24811
24812 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24813 // widen the cmov and push the truncate through. This avoids introducing a new
24814 // branch during isel and doesn't add any extensions.
24815 if (Op.getValueType() == MVT::i8 &&
24816 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24817 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24818 if (T1.getValueType() == T2.getValueType() &&
24819 // Exclude CopyFromReg to avoid partial register stalls.
24820 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24821 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24822 CC, Cond);
24823 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24824 }
24825 }
24826
24827 // Or finally, promote i8 cmovs if we have CMOV,
24828 // or i16 cmovs if it won't prevent folding a load.
24829 // FIXME: we should not limit promotion of the i8 case to only when the CMOV is
24830 // legal, but EmitLoweredSelect() cannot deal with these extensions
24831 // being inserted between two CMOVs (the same applies in the i16 case, TBD).
24832 // https://bugs.llvm.org/show_bug.cgi?id=40974
24833 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24834 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24835 !X86::mayFoldLoad(Op2, Subtarget))) {
24836 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24837 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24838 SDValue Ops[] = { Op2, Op1, CC, Cond };
24839 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24840 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24841 }
24842
24843 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24844 // condition is true.
24845 SDValue Ops[] = { Op2, Op1, CC, Cond };
24846 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24847}
24848
24849static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
24850 const X86Subtarget &Subtarget,
24851 SelectionDAG &DAG) {
24852 MVT VT = Op->getSimpleValueType(0);
24853 SDValue In = Op->getOperand(0);
24854 MVT InVT = In.getSimpleValueType();
24855 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24856 MVT VTElt = VT.getVectorElementType();
24857 unsigned NumElts = VT.getVectorNumElements();
24858
24859 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24860 MVT ExtVT = VT;
24861 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24862 // If v16i32 is to be avoided, we'll need to split and concatenate.
24863 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24864 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24865
24866 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24867 }
24868
24869 // Widen to 512-bits if VLX is not supported.
24870 MVT WideVT = ExtVT;
24871 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24872 NumElts *= 512 / ExtVT.getSizeInBits();
24873 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24874 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
24875 DAG.getVectorIdxConstant(0, dl));
24876 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24877 }
24878
24879 SDValue V;
24880 MVT WideEltVT = WideVT.getVectorElementType();
24881 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24882 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24883 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24884 } else {
24885 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
24886 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24887 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24888 }
24889
24890 // Truncate if we had to extend i16/i8 above.
24891 if (VT != ExtVT) {
24892 WideVT = MVT::getVectorVT(VTElt, NumElts);
24893 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24894 }
24895
24896 // Extract back to 128/256-bit if we widened.
24897 if (WideVT != VT)
24898 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24899 DAG.getVectorIdxConstant(0, dl));
24900
24901 return V;
24902}
24903
24904static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24905 SelectionDAG &DAG) {
24906 SDValue In = Op->getOperand(0);
24907 MVT InVT = In.getSimpleValueType();
24908 SDLoc DL(Op);
24909
24910 if (InVT.getVectorElementType() == MVT::i1)
24911 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
24912
24913 assert(Subtarget.hasAVX() && "Expected AVX support");
24914 return LowerAVXExtend(Op, DL, DAG, Subtarget);
24915}
24916
24917// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24918// For sign extend this needs to handle all vector sizes and SSE4.1 and
24919// non-SSE4.1 targets. For zero extend this should only handle inputs of
24920// MVT::v64i8 when BWI is not supported, but AVX512 is.
24921static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24922 const X86Subtarget &Subtarget,
24923 SelectionDAG &DAG) {
24924 SDValue In = Op->getOperand(0);
24925 MVT VT = Op->getSimpleValueType(0);
24926 MVT InVT = In.getSimpleValueType();
24927
24928 MVT SVT = VT.getVectorElementType();
24929 MVT InSVT = InVT.getVectorElementType();
24931
24932 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24933 return SDValue();
24934 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24935 return SDValue();
24936 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24937 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24938 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24939 return SDValue();
24940
24941 SDLoc dl(Op);
24942 unsigned Opc = Op.getOpcode();
24943 unsigned NumElts = VT.getVectorNumElements();
24944
24945 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24946 // For 512-bit vectors, we need 128-bits or 256-bits.
24947 if (InVT.getSizeInBits() > 128) {
24948 // Input needs to be at least the same number of elements as output, and
24949 // at least 128-bits.
24950 int InSize = InSVT.getSizeInBits() * NumElts;
24951 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24952 InVT = In.getSimpleValueType();
24953 }
24954
24955 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24956 // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24957 // need to be handled here for 256/512-bit results.
24958 if (Subtarget.hasInt256()) {
24959 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24960
24961 if (InVT.getVectorNumElements() != NumElts)
24962 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24963
24964 // FIXME: Apparently we create inreg operations that could be regular
24965 // extends.
24966 unsigned ExtOpc =
24967 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24968 : ISD::ZERO_EXTEND;
24969 return DAG.getNode(ExtOpc, dl, VT, In);
24970 }
24971
24972 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24973 if (Subtarget.hasAVX()) {
24974 assert(VT.is256BitVector() && "256-bit vector expected");
24975 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24976 int HalfNumElts = HalfVT.getVectorNumElements();
24977
24978 unsigned NumSrcElts = InVT.getVectorNumElements();
24979 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24980 for (int i = 0; i != HalfNumElts; ++i)
24981 HiMask[i] = HalfNumElts + i;
24982
24983 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24984 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24985 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24986 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24987 }
24988
24989 // We should only get here for sign extend.
24990 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24991 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24992 unsigned InNumElts = InVT.getVectorNumElements();
24993
24994 // If the source elements are already all-signbits, we don't need to extend,
24995 // just splat the elements.
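// e.g. extending v4i32 -> v2i64 when every element is already all-signbits:
// the shuffle {0, 0, 1, 1} duplicates each element into both halves of the
// wider lane, which is exactly the sign-extended value.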
24996 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24997 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24998 unsigned Scale = InNumElts / NumElts;
24999 SmallVector<int, 16> ShuffleMask;
25000 for (unsigned I = 0; I != NumElts; ++I)
25001 ShuffleMask.append(Scale, I);
25002 return DAG.getBitcast(VT,
25003 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25004 }
25005
25006 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25007 SDValue Curr = In;
25008 SDValue SignExt = Curr;
25009
25010 // As SRAI is only available on i16/i32 types, we expand only up to i32
25011 // and handle i64 separately.
25012 if (InVT != MVT::v4i32) {
25013 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25014
25015 unsigned DestWidth = DestVT.getScalarSizeInBits();
25016 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25017 unsigned DestElts = DestVT.getVectorNumElements();
25018
25019 // Build a shuffle mask that takes each input element and places it in the
25020 // MSBs of the new element size.
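// e.g. for v16i8 -> v4i32 (Scale == 4) the mask places input byte i at
// position i*4+3, i.e. {-1,-1,-1,0, -1,-1,-1,1, ...}, so each byte lands in
// the top byte of its 32-bit lane before the arithmetic shift below.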
25021 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25022 for (unsigned i = 0; i != DestElts; ++i)
25023 Mask[i * Scale + (Scale - 1)] = i;
25024
25025 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25026 Curr = DAG.getBitcast(DestVT, Curr);
25027
25028 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25029 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25030 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25031 }
25032
25033 if (VT == MVT::v2i64) {
25034 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25035 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25036 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25037 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25038 SignExt = DAG.getBitcast(VT, SignExt);
25039 }
25040
25041 return SignExt;
25042}
25043
25044static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25045 SelectionDAG &DAG) {
25046 MVT VT = Op->getSimpleValueType(0);
25047 SDValue In = Op->getOperand(0);
25048 MVT InVT = In.getSimpleValueType();
25049 SDLoc dl(Op);
25050
25051 if (InVT.getVectorElementType() == MVT::i1)
25052 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25053
25054 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25056 "Expected same number of elements");
25057 assert((VT.getVectorElementType() == MVT::i16 ||
25058 VT.getVectorElementType() == MVT::i32 ||
25059 VT.getVectorElementType() == MVT::i64) &&
25060 "Unexpected element type");
25061 assert((InVT.getVectorElementType() == MVT::i8 ||
25062 InVT.getVectorElementType() == MVT::i16 ||
25063 InVT.getVectorElementType() == MVT::i32) &&
25064 "Unexpected element type");
25065
25066 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25067 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25068 return splitVectorIntUnary(Op, DAG, dl);
25069 }
25070
25071 if (Subtarget.hasInt256())
25072 return Op;
25073
25074 // Optimize vectors in AVX mode:
25075 // Sign extend v8i16 to v8i32 and
25076 // v4i32 to v4i64.
25077 //
25078 // Divide the input vector into two parts,
25079 // for v4i32 the high shuffle mask will be {2, 3, -1, -1},
25080 // use the vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32,
25081 // then concat the vectors back to the original VT.
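// e.g. for v8i16 -> v8i32 the low half extends elements 0..3 in place and the
// high half is shuffled down with mask {4, 5, 6, 7, -1, -1, -1, -1} before
// being extended, after which the two v4i32 halves are concatenated.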
25082 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25083 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25084
25085 unsigned NumElems = InVT.getVectorNumElements();
25086 SmallVector<int,8> ShufMask(NumElems, -1);
25087 for (unsigned i = 0; i != NumElems/2; ++i)
25088 ShufMask[i] = i + NumElems/2;
25089
25090 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25091 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25092
25093 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25094}
25095
25096/// Change a vector store into a pair of half-size vector stores.
25097static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25098 SDValue StoredVal = Store->getValue();
25099 assert((StoredVal.getValueType().is256BitVector() ||
25100 StoredVal.getValueType().is512BitVector()) &&
25101 "Expecting 256/512-bit op");
25102
25103 // Splitting volatile memory ops is not allowed unless the operation was not
25104 // legal to begin with. Assume the input store is legal (this transform is
25105 // only used for targets with AVX). Note: It is possible that we have an
25106 // illegal type like v2i128, and so we could allow splitting a volatile store
25107 // in that case if that is important.
25108 if (!Store->isSimple())
25109 return SDValue();
25110
25111 SDLoc DL(Store);
25112 SDValue Value0, Value1;
25113 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25114 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25115 SDValue Ptr0 = Store->getBasePtr();
25116 SDValue Ptr1 =
25117 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25118 SDValue Ch0 =
25119 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25120 Store->getOriginalAlign(),
25121 Store->getMemOperand()->getFlags());
25122 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25123 Store->getPointerInfo().getWithOffset(HalfOffset),
25124 Store->getOriginalAlign(),
25125 Store->getMemOperand()->getFlags());
25126 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25127}
25128
25129/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25130/// type.
25131static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25132 SelectionDAG &DAG) {
25133 SDValue StoredVal = Store->getValue();
25134 assert(StoreVT.is128BitVector() &&
25135 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25136 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25137
25138 // Splitting volatile memory ops is not allowed unless the operation was not
25139 // legal to begin with. We are assuming the input op is legal (this transform
25140 // is only used for targets with AVX).
25141 if (!Store->isSimple())
25142 return SDValue();
25143
25144 MVT StoreSVT = StoreVT.getScalarType();
25145 unsigned NumElems = StoreVT.getVectorNumElements();
25146 unsigned ScalarSize = StoreSVT.getStoreSize();
25147
25148 SDLoc DL(Store);
25149 SmallVector<SDValue, 4> Stores;
25150 for (unsigned i = 0; i != NumElems; ++i) {
25151 unsigned Offset = i * ScalarSize;
25152 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25153 TypeSize::getFixed(Offset), DL);
25154 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25155 DAG.getVectorIdxConstant(i, DL));
25156 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25157 Store->getPointerInfo().getWithOffset(Offset),
25158 Store->getOriginalAlign(),
25159 Store->getMemOperand()->getFlags());
25160 Stores.push_back(Ch);
25161 }
25162 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25163}
25164
25165static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25166 SelectionDAG &DAG) {
25167 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25168 SDLoc dl(St);
25169 SDValue StoredVal = St->getValue();
25170
25171 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25172 if (StoredVal.getValueType().isVector() &&
25173 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25174 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25175 assert(NumElts <= 8 && "Unexpected VT");
25176 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25177 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25178 "Expected AVX512F without AVX512DQI");
25179
25180 // We must pad with zeros to ensure we store zeroes to any unused bits.
25181 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25182 DAG.getUNDEF(MVT::v16i1), StoredVal,
25183 DAG.getVectorIdxConstant(0, dl));
25184 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25185 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25186 // Make sure we store zeros in the extra bits.
25187 if (NumElts < 8)
25188 StoredVal = DAG.getZeroExtendInReg(
25189 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25190
25191 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25192 St->getPointerInfo(), St->getOriginalAlign(),
25193 St->getMemOperand()->getFlags());
25194 }
25195
25196 if (St->isTruncatingStore())
25197 return SDValue();
25198
25199 // If this is a 256-bit store of concatenated ops, we are better off splitting
25200 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
25201 // and each half can execute independently. Some cores would split the op into
25202 // halves anyway, so the concat (vinsertf128) is purely an extra op.
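// e.g. storing concat_vectors(v4f32 A, v4f32 B) becomes a 16-byte store of A
// at offset 0 and a 16-byte store of B at offset 16, skipping the vinsertf128.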
25203 MVT StoreVT = StoredVal.getSimpleValueType();
25204 if (StoreVT.is256BitVector() ||
25205 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
25206 !Subtarget.hasBWI())) {
25207 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
25208 return splitVectorStore(St, DAG);
25209 return SDValue();
25210 }
25211
25212 if (StoreVT.is32BitVector())
25213 return SDValue();
25214
25215 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25216 assert(StoreVT.is64BitVector() && "Unexpected VT");
25217 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25219 "Unexpected type action!");
25220
25221 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25222 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25223 DAG.getUNDEF(StoreVT));
25224
25225 if (Subtarget.hasSSE2()) {
25226 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25227 // and store it.
25228 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25229 MVT CastVT = MVT::getVectorVT(StVT, 2);
25230 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25231 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25232 DAG.getVectorIdxConstant(0, dl));
25233
25234 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25235 St->getPointerInfo(), St->getOriginalAlign(),
25236 St->getMemOperand()->getFlags());
25237 }
25238 assert(Subtarget.hasSSE1() && "Expected SSE");
25239 SDVTList Tys = DAG.getVTList(MVT::Other);
25240 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25241 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25242 St->getMemOperand());
25243}
25244
25245// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25246// may emit an illegal shuffle but the expansion is still better than scalar
25247// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25248// we'll emit a shuffle and a arithmetic shift.
25249// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25250// TODO: It is possible to support ZExt by zeroing the undef values during
25251// the shuffle phase or after the shuffle.
25252static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25253 SelectionDAG &DAG) {
25254 MVT RegVT = Op.getSimpleValueType();
25255 assert(RegVT.isVector() && "We only custom lower vector loads.");
25256 assert(RegVT.isInteger() &&
25257 "We only custom lower integer vector loads.");
25258
25259 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25260 SDLoc dl(Ld);
25261
25262 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25263 if (RegVT.getVectorElementType() == MVT::i1) {
25264 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25265 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25266 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25267 "Expected AVX512F without AVX512DQI");
25268
25269 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25270 Ld->getPointerInfo(), Ld->getOriginalAlign(),
25271 Ld->getMemOperand()->getFlags());
25272
25273 // Replace chain users with the new chain.
25274 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25275
25276 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25277 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25278 DAG.getBitcast(MVT::v16i1, Val),
25279 DAG.getVectorIdxConstant(0, dl));
25280 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25281 }
25282
25283 return SDValue();
25284}
25285
25286/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25287/// each of which has no other use apart from the AND / OR.
25288static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25289 Opc = Op.getOpcode();
25290 if (Opc != ISD::OR && Opc != ISD::AND)
25291 return false;
25292 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25293 Op.getOperand(0).hasOneUse() &&
25294 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25295 Op.getOperand(1).hasOneUse());
25296}
25297
25298SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25299 SDValue Chain = Op.getOperand(0);
25300 SDValue Cond = Op.getOperand(1);
25301 SDValue Dest = Op.getOperand(2);
25302 SDLoc dl(Op);
25303
25304 // Bail out when we don't have native compare instructions.
25305 if (Cond.getOpcode() == ISD::SETCC &&
25306 Cond.getOperand(0).getValueType() != MVT::f128 &&
25307 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25308 SDValue LHS = Cond.getOperand(0);
25309 SDValue RHS = Cond.getOperand(1);
25310 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25311
25312 // Special case for
25313 // setcc([su]{add,sub,mul}o == 0)
25314 // setcc([su]{add,sub,mul}o != 1)
25315 if (ISD::isOverflowIntrOpRes(LHS) &&
25316 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25317 (isNullConstant(RHS) || isOneConstant(RHS))) {
25318 SDValue Value, Overflow;
25319 X86::CondCode X86Cond;
25320 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25321
25322 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25323 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25324
25325 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25326 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25327 Overflow, Op->getFlags());
25328 }
25329
25330 if (LHS.getSimpleValueType().isInteger()) {
25331 SDValue CCVal;
25332 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25333 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25334 EFLAGS, Op->getFlags());
25335 }
25336
25337 if (CC == ISD::SETOEQ) {
25338 // For FCMP_OEQ, we can emit
25339 // two branches instead of an explicit AND instruction with a
25340 // separate test. However, we only do this if this block doesn't
25341 // have a fall-through edge, because this requires an explicit
25342 // jmp when the condition is false.
25343 if (Op.getNode()->hasOneUse()) {
25344 SDNode *User = *Op.getNode()->user_begin();
25345 // Look for an unconditional branch following this conditional branch.
25346 // We need this because we need to reverse the successors in order
25347 // to implement FCMP_OEQ.
25348 if (User->getOpcode() == ISD::BR) {
25349 SDValue FalseBB = User->getOperand(1);
25350 SDNode *NewBR =
25351 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25352 assert(NewBR == User);
25353 (void)NewBR;
25354 Dest = FalseBB;
25355
25356 SDValue Cmp =
25357 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25358 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25359 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25360 CCVal, Cmp, Op->getFlags());
25361 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25362 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25363 Cmp, Op->getFlags());
25364 }
25365 }
25366 } else if (CC == ISD::SETUNE) {
25367 // For FCMP_UNE, we can emit
25368 // two branches instead of an explicit OR instruction with a
25369 // separate test.
25370 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25371 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25372 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25373 Cmp, Op->getFlags());
25374 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25375 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25376 Cmp, Op->getFlags());
25377 } else {
25378 X86::CondCode X86Cond =
25379 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25380 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25381 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25382 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25383 Cmp, Op->getFlags());
25384 }
25385 }
25386
25387 if (ISD::isOverflowIntrOpRes(Cond)) {
25388 SDValue Value, Overflow;
25389 X86::CondCode X86Cond;
25390 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25391
25392 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25393 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25394 Overflow, Op->getFlags());
25395 }
25396
25397 // Look past the truncate if the high bits are known zero.
25398 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25399 Cond = Cond.getOperand(0);
25400
25401 EVT CondVT = Cond.getValueType();
25402
25403 // Add an AND with 1 if we don't already have one.
25404 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25405 Cond =
25406 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25407
25408 SDValue LHS = Cond;
25409 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25410
25411 SDValue CCVal;
25412 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25413 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25414 Op->getFlags());
25415}
25416
25417// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25418// Calls to _alloca are needed to probe the stack when allocating more than 4k
25419// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25420// that the guard pages used by the OS virtual memory manager are allocated in
25421// correct sequence.
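// e.g. a single 16KB dynamic allocation is probed in 4KB steps so each guard
// page is touched in order; moving the stack pointer past an untouched guard
// page would leave later accesses faulting on unmapped memory.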
25422SDValue
25423X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25424 SelectionDAG &DAG) const {
25425 MachineFunction &MF = DAG.getMachineFunction();
25426 bool SplitStack = MF.shouldSplitStack();
25427 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25428 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25429 SplitStack || EmitStackProbeCall;
25430 SDLoc dl(Op);
25431
25432 // Get the inputs.
25433 SDNode *Node = Op.getNode();
25434 SDValue Chain = Op.getOperand(0);
25435 SDValue Size = Op.getOperand(1);
25436 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25437 EVT VT = Node->getValueType(0);
25438
25439 // Chain the dynamic stack allocation so that it doesn't modify the stack
25440 // pointer when other instructions are using the stack.
25441 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25442
25443 bool Is64Bit = Subtarget.is64Bit();
25444 MVT SPTy = getPointerTy(DAG.getDataLayout());
25445
25446 SDValue Result;
25447 if (!Lower) {
25448 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25449 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25450 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25451 " not tell us which reg is the stack pointer!");
25452
25453 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25454 const Align StackAlign = TFI.getStackAlign();
25455 if (hasInlineStackProbe(MF)) {
25456 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25457 {Chain, Size});
25458 Chain = Result.getValue(1);
25459 } else {
25460 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25461 Chain = SP.getValue(1);
25462 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25463 }
25464 if (Alignment && *Alignment > StackAlign)
25465 Result = DAG.getNode(
25466 ISD::AND, dl, VT, Result,
25467 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25468 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25469 } else if (SplitStack) {
25470 if (Is64Bit) {
25471 // The 64-bit implementation of segmented stacks needs to clobber both r10 and
25472 // r11. This makes it impossible to use it along with nested parameters.
25473 const Function &F = MF.getFunction();
25474 for (const auto &A : F.args()) {
25475 if (A.hasNestAttr())
25476 report_fatal_error("Cannot use segmented stacks with functions that "
25477 "have nested arguments.");
25478 }
25479 }
25480
25481 Result =
25482 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25483 Chain = Result.getValue(1);
25484 } else {
25485 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25486 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25487 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25488
25489 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25490 Register SPReg = RegInfo->getStackRegister();
25491 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25492 Chain = SP.getValue(1);
25493
25494 if (Alignment) {
25495 SP = DAG.getNode(
25496 ISD::AND, dl, VT, SP.getValue(0),
25497 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25498 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25499 }
25500
25501 Result = SP;
25502 }
25503
25504 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25505
25506 SDValue Ops[2] = {Result, Chain};
25507 return DAG.getMergeValues(Ops, dl);
25508}
25509
25510SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25511 MachineFunction &MF = DAG.getMachineFunction();
25512 auto PtrVT = getPointerTy(MF.getDataLayout());
25513 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25514
25515 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25516 SDLoc DL(Op);
25517
25518 if (!Subtarget.is64Bit() ||
25519 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25520 // vastart just stores the address of the VarArgsFrameIndex slot into the
25521 // memory location argument.
25522 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25523 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25524 MachinePointerInfo(SV));
25525 }
25526
25527 // __va_list_tag:
25528 // gp_offset (0 - 6 * 8)
25529 // fp_offset (48 - 48 + 8 * 16)
25530 // overflow_arg_area (point to parameters coming in memory).
25531 // reg_save_area
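// Roughly the SysV x86-64 va_list layout in C terms:
//   struct __va_list_tag {
//     unsigned gp_offset;       // byte offset 0
//     unsigned fp_offset;       // byte offset 4
//     void *overflow_arg_area;  // byte offset 8
//     void *reg_save_area;      // byte offset 16
//   };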
25532 SmallVector<SDValue, 8> MemOps;
25533 SDValue FIN = Op.getOperand(1);
25534 // Store gp_offset
25535 SDValue Store = DAG.getStore(
25536 Op.getOperand(0), DL,
25537 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25538 MachinePointerInfo(SV));
25539 MemOps.push_back(Store);
25540
25541 // Store fp_offset
25542 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25543 Store = DAG.getStore(
25544 Op.getOperand(0), DL,
25545 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25546 MachinePointerInfo(SV, 4));
25547 MemOps.push_back(Store);
25548
25549 // Store ptr to overflow_arg_area
25550 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25551 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25552 Store =
25553 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25554 MemOps.push_back(Store);
25555
25556 // Store ptr to reg_save_area.
25557 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25558 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25559 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25560 Store = DAG.getStore(
25561 Op.getOperand(0), DL, RSFIN, FIN,
25562 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25563 MemOps.push_back(Store);
25564 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25565}
25566
25567SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25568 assert(Subtarget.is64Bit() &&
25569 "LowerVAARG only handles 64-bit va_arg!");
25570 assert(Op.getNumOperands() == 4);
25571
25573 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25574 // The Win64 ABI uses char* instead of a structure.
25575 return DAG.expandVAArg(Op.getNode());
25576
25577 SDValue Chain = Op.getOperand(0);
25578 SDValue SrcPtr = Op.getOperand(1);
25579 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25580 unsigned Align = Op.getConstantOperandVal(3);
25581 SDLoc dl(Op);
25582
25583 EVT ArgVT = Op.getNode()->getValueType(0);
25584 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25585 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25586 uint8_t ArgMode;
25587
25588 // Decide which area this value should be read from.
25589 // TODO: Implement the AMD64 ABI in its entirety. This simple
25590 // selection mechanism works only for the basic types.
25591 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25592 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25593 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25594 } else {
25595 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25596 "Unhandled argument type in LowerVAARG");
25597 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25598 }
25599
25600 if (ArgMode == 2) {
25601 // Make sure using fp_offset makes sense.
25602 assert(!Subtarget.useSoftFloat() &&
25603 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25604 Subtarget.hasSSE1());
25605 }
25606
25607 // Insert VAARG node into the DAG
25608 // VAARG returns two values: Variable Argument Address, Chain
25609 SDValue InstOps[] = {Chain, SrcPtr,
25610 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25611 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25612 DAG.getTargetConstant(Align, dl, MVT::i32)};
25613 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25614 SDValue VAARG = DAG.getMemIntrinsicNode(
25615 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25616 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25617 /*Alignment=*/std::nullopt,
25618 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25619 Chain = VAARG.getValue(1);
25620
25621 // Load the next argument and return it
25622 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25623}
25624
25625static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25626 SelectionDAG &DAG) {
25627 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25628 // where a va_list is still an i8*.
25629 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25630 if (Subtarget.isCallingConvWin64(
25631 DAG.getMachineFunction().getFunction().getCallingConv()))
25632 // Probably a Win64 va_copy.
25633 return DAG.expandVACopy(Op.getNode());
25634
25635 SDValue Chain = Op.getOperand(0);
25636 SDValue DstPtr = Op.getOperand(1);
25637 SDValue SrcPtr = Op.getOperand(2);
25638 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25639 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25640 SDLoc DL(Op);
25641
25642 return DAG.getMemcpy(
25643 Chain, DL, DstPtr, SrcPtr,
25644 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25645 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25646 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
25647 MachinePointerInfo(SrcSV));
25648}
25649
25650// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
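// e.g. ISD::SHL maps to X86ISD::VSHLI (the shift-by-immediate form) when the
// amount is constant, and to X86ISD::VSHL (the shift-by-xmm form) otherwise.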
25651static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25652 switch (Opc) {
25653 case ISD::SHL:
25654 case X86ISD::VSHL:
25655 case X86ISD::VSHLI:
25656 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25657 case ISD::SRL:
25658 case X86ISD::VSRL:
25659 case X86ISD::VSRLI:
25660 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25661 case ISD::SRA:
25662 case X86ISD::VSRA:
25663 case X86ISD::VSRAI:
25664 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25665 }
25666 llvm_unreachable("Unknown target vector shift node");
25667}
25668
25669/// Handle vector element shifts where the shift amount is a constant.
25670/// Takes immediate version of shift as input.
25671static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25672 SDValue SrcOp, uint64_t ShiftAmt,
25673 SelectionDAG &DAG) {
25674 MVT ElementType = VT.getVectorElementType();
25675
25676 // Bitcast the source vector to the output type, this is mainly necessary for
25677 // vXi8/vXi64 shifts.
25678 if (VT != SrcOp.getSimpleValueType())
25679 SrcOp = DAG.getBitcast(VT, SrcOp);
25680
25681 // Fold this packed shift into its first operand if ShiftAmt is 0.
25682 if (ShiftAmt == 0)
25683 return SrcOp;
25684
25685 // Check for ShiftAmt >= element width
25686 if (ShiftAmt >= ElementType.getSizeInBits()) {
25687 if (Opc == X86ISD::VSRAI)
25688 ShiftAmt = ElementType.getSizeInBits() - 1;
25689 else
25690 return DAG.getConstant(0, dl, VT);
25691 }
25692
25693 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25694 && "Unknown target vector shift-by-constant node");
25695
25696 // Fold this packed vector shift into a build vector if SrcOp is a
25697 // vector of Constants or UNDEFs.
25698 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25699 unsigned ShiftOpc;
25700 switch (Opc) {
25701 default: llvm_unreachable("Unknown opcode!");
25702 case X86ISD::VSHLI:
25703 ShiftOpc = ISD::SHL;
25704 break;
25705 case X86ISD::VSRLI:
25706 ShiftOpc = ISD::SRL;
25707 break;
25708 case X86ISD::VSRAI:
25709 ShiftOpc = ISD::SRA;
25710 break;
25711 }
25712
25713 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
25714 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
25715 return C;
25716 }
25717
25718 return DAG.getNode(Opc, dl, VT, SrcOp,
25719 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25720}
25721
25722/// Handle vector element shifts by a splat shift amount
25723static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25724 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25725 const X86Subtarget &Subtarget,
25726 SelectionDAG &DAG) {
25727 MVT AmtVT = ShAmt.getSimpleValueType();
25728 assert(AmtVT.isVector() && "Vector shift type mismatch");
25729 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25730 "Illegal vector splat index");
25731
25732 // Move the splat element to the bottom element.
25733 if (ShAmtIdx != 0) {
25734 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25735 Mask[0] = ShAmtIdx;
25736 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25737 }
25738
25739 // Peek through any zext node if we can get back to a 128-bit source.
25740 if (AmtVT.getScalarSizeInBits() == 64 &&
25741 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25742 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
25743 ShAmt.getOperand(0).getValueType().isSimple() &&
25744 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25745 ShAmt = ShAmt.getOperand(0);
25746 AmtVT = ShAmt.getSimpleValueType();
25747 }
25748
25749 // See if we can mask off the upper elements using the existing source node.
25750 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25751 // do this for vXi64 types.
25752 bool IsMasked = false;
25753 if (AmtVT.getScalarSizeInBits() < 64) {
25754 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25755 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25756 // If the shift amount has come from a scalar, then zero-extend the scalar
25757 // before moving to the vector.
25758 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25759 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25760 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25761 AmtVT = MVT::v4i32;
25762 IsMasked = true;
25763 } else if (ShAmt.getOpcode() == ISD::AND) {
25764 // See if the shift amount is already masked (e.g. for rotation modulo),
25765 // then we can zero-extend it by setting all the other mask elements to
25766 // zero.
25767 SmallVector<SDValue> MaskElts(
25768 AmtVT.getVectorNumElements(),
25769 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25770 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25771 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25772 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25773 {ShAmt.getOperand(1), Mask}))) {
25774 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25775 IsMasked = true;
25776 }
25777 }
25778 }
25779
25780 // Extract if the shift amount vector is larger than 128-bits.
25781 if (AmtVT.getSizeInBits() > 128) {
25782 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25783 AmtVT = ShAmt.getSimpleValueType();
25784 }
25785
25786 // Zero-extend bottom element to v2i64 vector type, either by extension or
25787 // shuffle masking.
25788 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25789 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25790 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25791 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25792 } else if (Subtarget.hasSSE41()) {
25793 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25794 MVT::v2i64, ShAmt);
25795 } else {
25796 SDValue ByteShift = DAG.getTargetConstant(
25797 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25798 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25799 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25800 ByteShift);
25801 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25802 ByteShift);
25803 }
25804 }
25805
25806 // Change opcode to non-immediate version.
25807 Opc = getTargetVShiftUniformOpcode(Opc, true);
25808
25809 // The return type has to be a 128-bit type with the same element
25810 // type as the input type.
25811 MVT EltVT = VT.getVectorElementType();
25812 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25813
25814 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25815 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25816}
25817
25818/// Return Mask with the necessary casting or extending
25819/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25820static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25821 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25822 const SDLoc &dl) {
25823
25824 if (isAllOnesConstant(Mask))
25825 return DAG.getConstant(1, dl, MaskVT);
25826 if (X86::isZeroNode(Mask))
25827 return DAG.getConstant(0, dl, MaskVT);
25828
25829 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25830
25831 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25832 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25833 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25834 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25835 SDValue Lo, Hi;
25836 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25837 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25838 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25839 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25840 } else {
25841 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25842 Mask.getSimpleValueType().getSizeInBits());
25843 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
25844 // are extracted by EXTRACT_SUBVECTOR.
25845 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25846 DAG.getBitcast(BitcastVT, Mask),
25847 DAG.getVectorIdxConstant(0, dl));
25848 }
25849}
25850
25851/// Return (and \p Op, \p Mask) for compare instructions or
25852/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25853/// necessary casting or extending for \p Mask when lowering masking intrinsics
25854static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25855 SDValue PreservedSrc,
25856 const X86Subtarget &Subtarget,
25857 SelectionDAG &DAG) {
25858 MVT VT = Op.getSimpleValueType();
25859 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25860 unsigned OpcodeSelect = ISD::VSELECT;
25861 SDLoc dl(Op);
25862
25863 if (isAllOnesConstant(Mask))
25864 return Op;
25865
25866 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25867
25868 if (PreservedSrc.isUndef())
25869 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25870 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25871}
25872
25873/// Creates an SDNode for a predicated scalar operation.
25874/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25875/// The mask is coming as MVT::i8 and it should be transformed
25876/// to MVT::v1i1 while lowering masking intrinsics.
25877/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25878/// "X86select" instead of "vselect". We just can't create the "vselect" node
25879/// for a scalar instruction.
25880static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25881 SDValue PreservedSrc,
25882 const X86Subtarget &Subtarget,
25883 SelectionDAG &DAG) {
25884
25885 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25886 if (MaskConst->getZExtValue() & 0x1)
25887 return Op;
25888
25889 MVT VT = Op.getSimpleValueType();
25890 SDLoc dl(Op);
25891
25892 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
25893 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25894 DAG.getBitcast(MVT::v8i1, Mask),
25895 DAG.getVectorIdxConstant(0, dl));
25896 if (Op.getOpcode() == X86ISD::FSETCCM ||
25897 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25898 Op.getOpcode() == X86ISD::VFPCLASSS)
25899 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25900
25901 if (PreservedSrc.isUndef())
25902 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25903 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25904}
25905
25906static int getSEHRegistrationNodeSize(const Function *Fn) {
25907 if (!Fn->hasPersonalityFn())
25908 report_fatal_error(
25909 "querying registration node size for function without personality");
25910 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25911 // WinEHStatePass for the full struct definition.
25912 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25913 case EHPersonality::MSVC_X86SEH: return 24;
25914 case EHPersonality::MSVC_CXX: return 16;
25915 default: break;
25916 }
25918 "can only recover FP for 32-bit MSVC EH personality functions");
25919}
25920
25921/// When the MSVC runtime transfers control to us, either to an outlined
25922/// function or when returning to a parent frame after catching an exception, we
25923/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25924/// Here's the math:
25925/// RegNodeBase = EntryEBP - RegNodeSize
25926/// ParentFP = RegNodeBase - ParentFrameOffset
25927/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25928/// subtracting the offset (negative on x86) takes us back to the parent FP.
25929static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25930 SDValue EntryEBP) {
25931 MachineFunction &MF = DAG.getMachineFunction();
25932 SDLoc dl;
25933
25934 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25935 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25936
25937 // It's possible that the parent function no longer has a personality function
25938 // if the exceptional code was optimized away, in which case we just return
25939 // the incoming EBP.
25940 if (!Fn->hasPersonalityFn())
25941 return EntryEBP;
25942
25943 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25944 // registration, or the .set_setframe offset.
25945 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
25946 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25947 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25948 SDValue ParentFrameOffset =
25949 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25950
25951 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25952 // prologue to RBP in the parent function.
25953 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25954 if (Subtarget.is64Bit())
25955 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25956
25957 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25958 // RegNodeBase = EntryEBP - RegNodeSize
25959 // ParentFP = RegNodeBase - ParentFrameOffset
25960 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25961 DAG.getConstant(RegNodeSize, dl, PtrVT));
25962 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25963}
25964
25965SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25966 SelectionDAG &DAG) const {
25967 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25968 auto isRoundModeCurDirection = [](SDValue Rnd) {
25969 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25970 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25971
25972 return false;
25973 };
25974 auto isRoundModeSAE = [](SDValue Rnd) {
25975 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25976 unsigned RC = C->getZExtValue();
25977 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25978 // Clear the NO_EXC bit and check remaining bits.
25979 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25980 // As a convenience we allow no other bits or explicitly
25981 // current direction.
25982 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25983 }
25984 }
25985
25986 return false;
25987 };
25988 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25989 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25990 RC = C->getZExtValue();
25991 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25992 // Clear the NO_EXC bit and check remaining bits.
25993 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25994 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25995 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25996 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25997 RC == X86::STATIC_ROUNDING::TO_ZERO;
25998 }
25999 }
26000
26001 return false;
26002 };
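  // For reference: the rounding operand uses the X86::STATIC_ROUNDING bits
  // (mirroring _MM_FROUND_*), where NO_EXC (8) is the SAE bit. E.g. a value
  // of 8, or 12 (NO_EXC | CUR_DIRECTION), satisfies isRoundModeSAE, while 9
  // (NO_EXC | an explicit rounding mode) is only accepted by
  // isRoundModeSAEToX, which hands the explicit mode back in RC.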
26003
26004 SDLoc dl(Op);
26005 unsigned IntNo = Op.getConstantOperandVal(0);
26006 MVT VT = Op.getSimpleValueType();
26007 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26008
26009 // Propagate flags from original node to transformed node(s).
26010 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26011
26012 if (IntrData) {
26013 switch(IntrData->Type) {
26014 case INTR_TYPE_1OP: {
26015 // We specify 2 possible opcodes for intrinsics with rounding modes.
26016 // First, we check if the intrinsic may have non-default rounding mode,
26017 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26018 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26019 if (IntrWithRoundingModeOpcode != 0) {
26020 SDValue Rnd = Op.getOperand(2);
26021 unsigned RC = 0;
26022 if (isRoundModeSAEToX(Rnd, RC))
26023 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26024 Op.getOperand(1),
26025 DAG.getTargetConstant(RC, dl, MVT::i32));
26026 if (!isRoundModeCurDirection(Rnd))
26027 return SDValue();
26028 }
26029 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26030 Op.getOperand(1));
26031 }
26032 case INTR_TYPE_1OP_SAE: {
26033 SDValue Sae = Op.getOperand(2);
26034
26035 unsigned Opc;
26036 if (isRoundModeCurDirection(Sae))
26037 Opc = IntrData->Opc0;
26038 else if (isRoundModeSAE(Sae))
26039 Opc = IntrData->Opc1;
26040 else
26041 return SDValue();
26042
26043 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26044 }
26045 case INTR_TYPE_2OP: {
26046 SDValue Src2 = Op.getOperand(2);
26047
26048 // We specify 2 possible opcodes for intrinsics with rounding modes.
26049 // First, we check if the intrinsic may have non-default rounding mode,
26050 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26051 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26052 if (IntrWithRoundingModeOpcode != 0) {
26053 SDValue Rnd = Op.getOperand(3);
26054 unsigned RC = 0;
26055 if (isRoundModeSAEToX(Rnd, RC))
26056 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26057 Op.getOperand(1), Src2,
26058 DAG.getTargetConstant(RC, dl, MVT::i32));
26059 if (!isRoundModeCurDirection(Rnd))
26060 return SDValue();
26061 }
26062
26063 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26064 Op.getOperand(1), Src2);
26065 }
26066 case INTR_TYPE_2OP_SAE: {
26067 SDValue Sae = Op.getOperand(3);
26068
26069 unsigned Opc;
26070 if (isRoundModeCurDirection(Sae))
26071 Opc = IntrData->Opc0;
26072 else if (isRoundModeSAE(Sae))
26073 Opc = IntrData->Opc1;
26074 else
26075 return SDValue();
26076
26077 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26078 Op.getOperand(2));
26079 }
26080 case INTR_TYPE_3OP:
26081 case INTR_TYPE_3OP_IMM8: {
26082 SDValue Src1 = Op.getOperand(1);
26083 SDValue Src2 = Op.getOperand(2);
26084 SDValue Src3 = Op.getOperand(3);
26085
26086 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26087 Src3.getValueType() != MVT::i8) {
26088 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26089 }
26090
26091 // We specify 2 possible opcodes for intrinsics with rounding modes.
26092 // First, we check if the intrinsic may have non-default rounding mode,
26093 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26094 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26095 if (IntrWithRoundingModeOpcode != 0) {
26096 SDValue Rnd = Op.getOperand(4);
26097 unsigned RC = 0;
26098 if (isRoundModeSAEToX(Rnd, RC))
26099 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26100 Src1, Src2, Src3,
26101 DAG.getTargetConstant(RC, dl, MVT::i32));
26102 if (!isRoundModeCurDirection(Rnd))
26103 return SDValue();
26104 }
26105
26106 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26107 {Src1, Src2, Src3});
26108 }
26109 case INTR_TYPE_4OP_IMM8: {
26110 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26111 SDValue Src4 = Op.getOperand(4);
26112 if (Src4.getValueType() != MVT::i8) {
26113 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26114 }
26115
26116 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26117 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26118 Src4);
26119 }
26120 case INTR_TYPE_1OP_MASK: {
26121 SDValue Src = Op.getOperand(1);
26122 SDValue PassThru = Op.getOperand(2);
26123 SDValue Mask = Op.getOperand(3);
26124 // We add rounding mode to the Node when
26125 // - RC Opcode is specified and
26126 // - RC is not "current direction".
26127 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26128 if (IntrWithRoundingModeOpcode != 0) {
26129 SDValue Rnd = Op.getOperand(4);
26130 unsigned RC = 0;
26131 if (isRoundModeSAEToX(Rnd, RC))
26132 return getVectorMaskingNode(
26133 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26134 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26135 Mask, PassThru, Subtarget, DAG);
26136 if (!isRoundModeCurDirection(Rnd))
26137 return SDValue();
26138 }
26139 return getVectorMaskingNode(
26140 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26141 Subtarget, DAG);
26142 }
26143 case INTR_TYPE_1OP_MASK_SAE: {
26144 SDValue Src = Op.getOperand(1);
26145 SDValue PassThru = Op.getOperand(2);
26146 SDValue Mask = Op.getOperand(3);
26147 SDValue Rnd = Op.getOperand(4);
26148
26149 unsigned Opc;
26150 if (isRoundModeCurDirection(Rnd))
26151 Opc = IntrData->Opc0;
26152 else if (isRoundModeSAE(Rnd))
26153 Opc = IntrData->Opc1;
26154 else
26155 return SDValue();
26156
26157 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26158 Subtarget, DAG);
26159 }
26160 case INTR_TYPE_SCALAR_MASK: {
26161 SDValue Src1 = Op.getOperand(1);
26162 SDValue Src2 = Op.getOperand(2);
26163 SDValue passThru = Op.getOperand(3);
26164 SDValue Mask = Op.getOperand(4);
26165 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26166 // There are 2 kinds of intrinsics in this group:
26167 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26168 // (2) With rounding mode and sae - 7 operands.
26169 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26170 if (Op.getNumOperands() == (5U + HasRounding)) {
26171 if (HasRounding) {
26172 SDValue Rnd = Op.getOperand(5);
26173 unsigned RC = 0;
26174 if (isRoundModeSAEToX(Rnd, RC))
26175 return getScalarMaskingNode(
26176 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26177 DAG.getTargetConstant(RC, dl, MVT::i32)),
26178 Mask, passThru, Subtarget, DAG);
26179 if (!isRoundModeCurDirection(Rnd))
26180 return SDValue();
26181 }
26182 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26183 Src2),
26184 Mask, passThru, Subtarget, DAG);
26185 }
26186
26187 assert(Op.getNumOperands() == (6U + HasRounding) &&
26188 "Unexpected intrinsic form");
26189 SDValue RoundingMode = Op.getOperand(5);
26190 unsigned Opc = IntrData->Opc0;
26191 if (HasRounding) {
26192 SDValue Sae = Op.getOperand(6);
26193 if (isRoundModeSAE(Sae))
26194 Opc = IntrWithRoundingModeOpcode;
26195 else if (!isRoundModeCurDirection(Sae))
26196 return SDValue();
26197 }
26198 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26199 Src2, RoundingMode),
26200 Mask, passThru, Subtarget, DAG);
26201 }
26202 case INTR_TYPE_SCALAR_MASK_RND: {
26203 SDValue Src1 = Op.getOperand(1);
26204 SDValue Src2 = Op.getOperand(2);
26205 SDValue passThru = Op.getOperand(3);
26206 SDValue Mask = Op.getOperand(4);
26207 SDValue Rnd = Op.getOperand(5);
26208
26209 SDValue NewOp;
26210 unsigned RC = 0;
26211 if (isRoundModeCurDirection(Rnd))
26212 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26213 else if (isRoundModeSAEToX(Rnd, RC))
26214 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26215 DAG.getTargetConstant(RC, dl, MVT::i32));
26216 else
26217 return SDValue();
26218
26219 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26220 }
26221 case INTR_TYPE_SCALAR_MASK_SAE: {
26222 SDValue Src1 = Op.getOperand(1);
26223 SDValue Src2 = Op.getOperand(2);
26224 SDValue passThru = Op.getOperand(3);
26225 SDValue Mask = Op.getOperand(4);
26226 SDValue Sae = Op.getOperand(5);
26227 unsigned Opc;
26228 if (isRoundModeCurDirection(Sae))
26229 Opc = IntrData->Opc0;
26230 else if (isRoundModeSAE(Sae))
26231 Opc = IntrData->Opc1;
26232 else
26233 return SDValue();
26234
26235 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26236 Mask, passThru, Subtarget, DAG);
26237 }
26238 case INTR_TYPE_2OP_MASK: {
26239 SDValue Src1 = Op.getOperand(1);
26240 SDValue Src2 = Op.getOperand(2);
26241 SDValue PassThru = Op.getOperand(3);
26242 SDValue Mask = Op.getOperand(4);
26243 SDValue NewOp;
26244 if (IntrData->Opc1 != 0) {
26245 SDValue Rnd = Op.getOperand(5);
26246 unsigned RC = 0;
26247 if (isRoundModeSAEToX(Rnd, RC))
26248 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26249 DAG.getTargetConstant(RC, dl, MVT::i32));
26250 else if (!isRoundModeCurDirection(Rnd))
26251 return SDValue();
26252 }
26253 if (!NewOp)
26254 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26255 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26256 }
26257 case INTR_TYPE_2OP_MASK_SAE: {
26258 SDValue Src1 = Op.getOperand(1);
26259 SDValue Src2 = Op.getOperand(2);
26260 SDValue PassThru = Op.getOperand(3);
26261 SDValue Mask = Op.getOperand(4);
26262
26263 unsigned Opc = IntrData->Opc0;
26264 if (IntrData->Opc1 != 0) {
26265 SDValue Sae = Op.getOperand(5);
26266 if (isRoundModeSAE(Sae))
26267 Opc = IntrData->Opc1;
26268 else if (!isRoundModeCurDirection(Sae))
26269 return SDValue();
26270 }
26271
26272 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26273 Mask, PassThru, Subtarget, DAG);
26274 }
26275 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26276 SDValue Src1 = Op.getOperand(1);
26277 SDValue Src2 = Op.getOperand(2);
26278 SDValue Src3 = Op.getOperand(3);
26279 SDValue PassThru = Op.getOperand(4);
26280 SDValue Mask = Op.getOperand(5);
26281 SDValue Sae = Op.getOperand(6);
26282 unsigned Opc;
26283 if (isRoundModeCurDirection(Sae))
26284 Opc = IntrData->Opc0;
26285 else if (isRoundModeSAE(Sae))
26286 Opc = IntrData->Opc1;
26287 else
26288 return SDValue();
26289
26290 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26291 Mask, PassThru, Subtarget, DAG);
26292 }
26293 case INTR_TYPE_3OP_MASK_SAE: {
26294 SDValue Src1 = Op.getOperand(1);
26295 SDValue Src2 = Op.getOperand(2);
26296 SDValue Src3 = Op.getOperand(3);
26297 SDValue PassThru = Op.getOperand(4);
26298 SDValue Mask = Op.getOperand(5);
26299
26300 unsigned Opc = IntrData->Opc0;
26301 if (IntrData->Opc1 != 0) {
26302 SDValue Sae = Op.getOperand(6);
26303 if (isRoundModeSAE(Sae))
26304 Opc = IntrData->Opc1;
26305 else if (!isRoundModeCurDirection(Sae))
26306 return SDValue();
26307 }
26308 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26309 Mask, PassThru, Subtarget, DAG);
26310 }
26311 case BLENDV: {
26312 SDValue Src1 = Op.getOperand(1);
26313 SDValue Src2 = Op.getOperand(2);
26314 SDValue Src3 = Op.getOperand(3);
26315
26316 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26317 Src3 = DAG.getBitcast(MaskVT, Src3);
26318
26319 // Reverse the operands to match VSELECT order.
26320 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26321 }
26322 case VPERM_2OP : {
26323 SDValue Src1 = Op.getOperand(1);
26324 SDValue Src2 = Op.getOperand(2);
26325
26326 // Swap Src1 and Src2 in the node creation
26327 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26328 }
26329 case CFMA_OP_MASKZ:
26330 case CFMA_OP_MASK: {
26331 SDValue Src1 = Op.getOperand(1);
26332 SDValue Src2 = Op.getOperand(2);
26333 SDValue Src3 = Op.getOperand(3);
26334 SDValue Mask = Op.getOperand(4);
26335 MVT VT = Op.getSimpleValueType();
26336
26337 SDValue PassThru = Src3;
26338 if (IntrData->Type == CFMA_OP_MASKZ)
26339 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26340
26341 // We add rounding mode to the Node when
26342 // - RC Opcode is specified and
26343 // - RC is not "current direction".
26344 SDValue NewOp;
26345 if (IntrData->Opc1 != 0) {
26346 SDValue Rnd = Op.getOperand(5);
26347 unsigned RC = 0;
26348 if (isRoundModeSAEToX(Rnd, RC))
26349 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26350 DAG.getTargetConstant(RC, dl, MVT::i32));
26351 else if (!isRoundModeCurDirection(Rnd))
26352 return SDValue();
26353 }
26354 if (!NewOp)
26355 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26356 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26357 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26358 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26359 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26360 }
26361 case IFMA_OP:
26362 // NOTE: We need to swizzle the operands to pass the multiply operands
26363 // first.
26364 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26365 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26366 case FPCLASSS: {
26367 SDValue Src1 = Op.getOperand(1);
26368 SDValue Imm = Op.getOperand(2);
26369 SDValue Mask = Op.getOperand(3);
26370 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26371 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26372 Subtarget, DAG);
26373 // Need to fill with zeros to ensure the bitcast will produce zeroes
26374 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26375 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26376 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26377 DAG.getVectorIdxConstant(0, dl));
26378 return DAG.getBitcast(MVT::i8, Ins);
26379 }
26380
26381 case CMP_MASK_CC: {
26382 MVT MaskVT = Op.getSimpleValueType();
26383 SDValue CC = Op.getOperand(3);
26384 SDValue Mask = Op.getOperand(4);
26385 // We specify 2 possible opcodes for intrinsics with rounding modes.
26386 // First, we check if the intrinsic may have non-default rounding mode,
26387 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26388 if (IntrData->Opc1 != 0) {
26389 SDValue Sae = Op.getOperand(5);
26390 if (isRoundModeSAE(Sae))
26391 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26392 Op.getOperand(2), CC, Mask, Sae);
26393 if (!isRoundModeCurDirection(Sae))
26394 return SDValue();
26395 }
26396 //default rounding mode
26397 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26398 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26399 }
26400 case CMP_MASK_SCALAR_CC: {
26401 SDValue Src1 = Op.getOperand(1);
26402 SDValue Src2 = Op.getOperand(2);
26403 SDValue CC = Op.getOperand(3);
26404 SDValue Mask = Op.getOperand(4);
26405
26406 SDValue Cmp;
26407 if (IntrData->Opc1 != 0) {
26408 SDValue Sae = Op.getOperand(5);
26409 if (isRoundModeSAE(Sae))
26410 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26411 else if (!isRoundModeCurDirection(Sae))
26412 return SDValue();
26413 }
26414 //default rounding mode
26415 if (!Cmp.getNode())
26416 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26417
26418 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26419 Subtarget, DAG);
26420 // Need to fill with zeros to ensure the bitcast will produce zeroes
26421 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26422 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26423 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26424 DAG.getVectorIdxConstant(0, dl));
26425 return DAG.getBitcast(MVT::i8, Ins);
26426 }
26427 case COMI: { // Comparison intrinsics
26428 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26429 SDValue LHS = Op.getOperand(1);
26430 SDValue RHS = Op.getOperand(2);
26431 // Some conditions require the operands to be swapped.
26432 if (CC == ISD::SETLT || CC == ISD::SETLE)
26433 std::swap(LHS, RHS);
26434
26435 // For AVX10.2, support EQ and NE.
26436 bool HasAVX10_2_COMX =
26437 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26438
26439 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26440 // For BF type we need to fall back.
26441 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26442
26443 auto ComiOpCode = IntrData->Opc0;
26444 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26445
26446 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26447 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26448
26449 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26450
26451 SDValue SetCC;
26452 switch (CC) {
26453 case ISD::SETEQ: {
26454 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26455 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26456 break;
26457 // (ZF = 1 and PF = 0)
26458 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26459 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26460 break;
26461 }
26462 case ISD::SETNE: {
26463 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26464 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26465 break;
26466 // (ZF = 0 or PF = 1)
26467 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26468 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26469 break;
26470 }
26471 case ISD::SETGT: // (CF = 0 and ZF = 0)
26472 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26473 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26474 break;
26475 }
26476 case ISD::SETGE: // CF = 0
26477 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26478 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26479 break;
26480 default:
26481 llvm_unreachable("Unexpected illegal condition!");
26482 }
26483 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26484 }
26485 case COMI_RM: { // Comparison intrinsics with Sae
26486 SDValue LHS = Op.getOperand(1);
26487 SDValue RHS = Op.getOperand(2);
26488 unsigned CondVal = Op.getConstantOperandVal(3);
26489 SDValue Sae = Op.getOperand(4);
26490
26491 SDValue FCmp;
26492 if (isRoundModeCurDirection(Sae))
26493 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26494 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26495 else if (isRoundModeSAE(Sae))
26496 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26497 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26498 else
26499 return SDValue();
26500 // Need to fill with zeros to ensure the bitcast will produce zeroes
26501 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26502 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26503 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26504 DAG.getVectorIdxConstant(0, dl));
26505 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26506 DAG.getBitcast(MVT::i16, Ins));
26507 }
26508 case VSHIFT: {
26509 SDValue SrcOp = Op.getOperand(1);
26510 SDValue ShAmt = Op.getOperand(2);
26511 assert(ShAmt.getValueType() == MVT::i32 &&
26512 "Unexpected VSHIFT amount type");
26513
26514 // Catch shift-by-constant.
26515 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26516 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26517 Op.getSimpleValueType(), SrcOp,
26518 CShAmt->getZExtValue(), DAG);
26519
26520 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26521 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26522 SrcOp, ShAmt, 0, Subtarget, DAG);
26523 }
26524 case COMPRESS_EXPAND_IN_REG: {
26525 SDValue Mask = Op.getOperand(3);
26526 SDValue DataToCompress = Op.getOperand(1);
26527 SDValue PassThru = Op.getOperand(2);
26528 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26529 return Op.getOperand(1);
26530
26531 // Avoid false dependency.
26532 if (PassThru.isUndef())
26533 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26534
26535 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26536 Mask);
26537 }
26538 case FIXUPIMM:
26539 case FIXUPIMM_MASKZ: {
26540 SDValue Src1 = Op.getOperand(1);
26541 SDValue Src2 = Op.getOperand(2);
26542 SDValue Src3 = Op.getOperand(3);
26543 SDValue Imm = Op.getOperand(4);
26544 SDValue Mask = Op.getOperand(5);
26545 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26546 ? Src1
26547 : getZeroVector(VT, Subtarget, DAG, dl);
26548
26549 unsigned Opc = IntrData->Opc0;
26550 if (IntrData->Opc1 != 0) {
26551 SDValue Sae = Op.getOperand(6);
26552 if (isRoundModeSAE(Sae))
26553 Opc = IntrData->Opc1;
26554 else if (!isRoundModeCurDirection(Sae))
26555 return SDValue();
26556 }
26557
26558 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26559
26560 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26561 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26562
26563 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26564 }
26565 case ROUNDP: {
26566 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26567 // Clear the upper bits of the rounding immediate so that the legacy
26568 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26569 uint64_t Round = Op.getConstantOperandVal(2);
26570 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26571 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26572 Op.getOperand(1), RoundingMode);
26573 }
26574 case ROUNDS: {
26575 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26576 // Clear the upper bits of the rounding immediate so that the legacy
26577 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26578 uint64_t Round = Op.getConstantOperandVal(3);
26579 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26580 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26581 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26582 }
26583 case BEXTRI: {
26584 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26585
26586 uint64_t Imm = Op.getConstantOperandVal(2);
26587 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26588 Op.getValueType());
26589 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26590 Op.getOperand(1), Control);
26591 }
26592 // ADC/SBB
26593 case ADX: {
26594 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26595 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26596
26597 SDValue Res;
26598 // If the carry in is zero, then we should just use ADD/SUB instead of
26599 // ADC/SBB.
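      // E.g. an addcarry-style intrinsic whose carry-in operand is a constant
      // zero lowers to a plain ADD/SUB here; otherwise the carry-in is turned
      // back into CF by adding all-ones to it, and the ADC/SBB below consumes
      // that flag value.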
26600 if (isNullConstant(Op.getOperand(1))) {
26601 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26602 Op.getOperand(3));
26603 } else {
26604 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26605 DAG.getAllOnesConstant(dl, MVT::i8));
26606 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26607 Op.getOperand(3), GenCF.getValue(1));
26608 }
26609 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26610 SDValue Results[] = { SetCC, Res };
26611 return DAG.getMergeValues(Results, dl);
26612 }
26613 case CVTPD2PS_MASK:
26614 case CVTPD2DQ_MASK:
26615 case CVTQQ2PS_MASK:
26616 case TRUNCATE_TO_REG: {
26617 SDValue Src = Op.getOperand(1);
26618 SDValue PassThru = Op.getOperand(2);
26619 SDValue Mask = Op.getOperand(3);
26620
26621 if (isAllOnesConstant(Mask))
26622 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26623
26624 MVT SrcVT = Src.getSimpleValueType();
26625 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26626 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26627 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26628 {Src, PassThru, Mask});
26629 }
26630 case TRUNCATE2_TO_REG: {
26631 SDValue Src = Op.getOperand(1);
26632 SDValue Src2 = Op.getOperand(2);
26633 SDValue PassThru = Op.getOperand(3);
26634 SDValue Mask = Op.getOperand(4);
26635
26636 if (isAllOnesConstant(Mask))
26637 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
26638
26639 MVT Src2VT = Src2.getSimpleValueType();
26640 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
26641 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26642 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26643 {Src, Src2, PassThru, Mask});
26644 }
26645 case CVTPS2PH_MASK: {
26646 SDValue Src = Op.getOperand(1);
26647 SDValue Rnd = Op.getOperand(2);
26648 SDValue PassThru = Op.getOperand(3);
26649 SDValue Mask = Op.getOperand(4);
26650
26651 unsigned RC = 0;
26652 unsigned Opc = IntrData->Opc0;
26653 bool SAE = Src.getValueType().is512BitVector() &&
26654 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
26655 if (SAE) {
26656 Opc = X86ISD::CVTPS2PH_SAE;
26657 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
26658 }
26659
26660 if (isAllOnesConstant(Mask))
26661 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
26662
26663 if (SAE)
26664 Opc = X86ISD::MCVTPS2PH_SAE;
26665 else
26666 Opc = IntrData->Opc1;
26667 MVT SrcVT = Src.getSimpleValueType();
26668 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26669 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26670 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
26671 }
26672 case CVTNEPS2BF16_MASK: {
26673 SDValue Src = Op.getOperand(1);
26674 SDValue PassThru = Op.getOperand(2);
26675 SDValue Mask = Op.getOperand(3);
26676
26677 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26678 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26679
26680 // Break false dependency.
26681 if (PassThru.isUndef())
26682 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26683
26684 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26685 Mask);
26686 }
26687 default:
26688 break;
26689 }
26690 }
26691
26692 switch (IntNo) {
26693 default: return SDValue(); // Don't custom lower most intrinsics.
26694
26695 // ptest and testp intrinsics. The intrinsics these come from are designed to
26696 // return an integer value, not just an instruction, so lower it to the ptest
26697 // or testp pattern and a setcc for the result.
26698 case Intrinsic::x86_avx512_ktestc_b:
26699 case Intrinsic::x86_avx512_ktestc_w:
26700 case Intrinsic::x86_avx512_ktestc_d:
26701 case Intrinsic::x86_avx512_ktestc_q:
26702 case Intrinsic::x86_avx512_ktestz_b:
26703 case Intrinsic::x86_avx512_ktestz_w:
26704 case Intrinsic::x86_avx512_ktestz_d:
26705 case Intrinsic::x86_avx512_ktestz_q:
26706 case Intrinsic::x86_sse41_ptestz:
26707 case Intrinsic::x86_sse41_ptestc:
26708 case Intrinsic::x86_sse41_ptestnzc:
26709 case Intrinsic::x86_avx_ptestz_256:
26710 case Intrinsic::x86_avx_ptestc_256:
26711 case Intrinsic::x86_avx_ptestnzc_256:
26712 case Intrinsic::x86_avx_vtestz_ps:
26713 case Intrinsic::x86_avx_vtestc_ps:
26714 case Intrinsic::x86_avx_vtestnzc_ps:
26715 case Intrinsic::x86_avx_vtestz_pd:
26716 case Intrinsic::x86_avx_vtestc_pd:
26717 case Intrinsic::x86_avx_vtestnzc_pd:
26718 case Intrinsic::x86_avx_vtestz_ps_256:
26719 case Intrinsic::x86_avx_vtestc_ps_256:
26720 case Intrinsic::x86_avx_vtestnzc_ps_256:
26721 case Intrinsic::x86_avx_vtestz_pd_256:
26722 case Intrinsic::x86_avx_vtestc_pd_256:
26723 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26724 unsigned TestOpc = X86ISD::PTEST;
26725 X86::CondCode X86CC;
26726 switch (IntNo) {
26727 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26728 case Intrinsic::x86_avx512_ktestc_b:
26729 case Intrinsic::x86_avx512_ktestc_w:
26730 case Intrinsic::x86_avx512_ktestc_d:
26731 case Intrinsic::x86_avx512_ktestc_q:
26732 // CF = 1
26733 TestOpc = X86ISD::KTEST;
26734 X86CC = X86::COND_B;
26735 break;
26736 case Intrinsic::x86_avx512_ktestz_b:
26737 case Intrinsic::x86_avx512_ktestz_w:
26738 case Intrinsic::x86_avx512_ktestz_d:
26739 case Intrinsic::x86_avx512_ktestz_q:
26740 TestOpc = X86ISD::KTEST;
26741 X86CC = X86::COND_E;
26742 break;
26743 case Intrinsic::x86_avx_vtestz_ps:
26744 case Intrinsic::x86_avx_vtestz_pd:
26745 case Intrinsic::x86_avx_vtestz_ps_256:
26746 case Intrinsic::x86_avx_vtestz_pd_256:
26747 TestOpc = X86ISD::TESTP;
26748 [[fallthrough]];
26749 case Intrinsic::x86_sse41_ptestz:
26750 case Intrinsic::x86_avx_ptestz_256:
26751 // ZF = 1
26752 X86CC = X86::COND_E;
26753 break;
26754 case Intrinsic::x86_avx_vtestc_ps:
26755 case Intrinsic::x86_avx_vtestc_pd:
26756 case Intrinsic::x86_avx_vtestc_ps_256:
26757 case Intrinsic::x86_avx_vtestc_pd_256:
26758 TestOpc = X86ISD::TESTP;
26759 [[fallthrough]];
26760 case Intrinsic::x86_sse41_ptestc:
26761 case Intrinsic::x86_avx_ptestc_256:
26762 // CF = 1
26763 X86CC = X86::COND_B;
26764 break;
26765 case Intrinsic::x86_avx_vtestnzc_ps:
26766 case Intrinsic::x86_avx_vtestnzc_pd:
26767 case Intrinsic::x86_avx_vtestnzc_ps_256:
26768 case Intrinsic::x86_avx_vtestnzc_pd_256:
26769 TestOpc = X86ISD::TESTP;
26770 [[fallthrough]];
26771 case Intrinsic::x86_sse41_ptestnzc:
26772 case Intrinsic::x86_avx_ptestnzc_256:
26773 // ZF and CF = 0
26774 X86CC = X86::COND_A;
26775 break;
26776 }
26777
26778 SDValue LHS = Op.getOperand(1);
26779 SDValue RHS = Op.getOperand(2);
26780 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26781 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26782 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26783 }
26784
26785 case Intrinsic::x86_sse42_pcmpistria128:
26786 case Intrinsic::x86_sse42_pcmpestria128:
26787 case Intrinsic::x86_sse42_pcmpistric128:
26788 case Intrinsic::x86_sse42_pcmpestric128:
26789 case Intrinsic::x86_sse42_pcmpistrio128:
26790 case Intrinsic::x86_sse42_pcmpestrio128:
26791 case Intrinsic::x86_sse42_pcmpistris128:
26792 case Intrinsic::x86_sse42_pcmpestris128:
26793 case Intrinsic::x86_sse42_pcmpistriz128:
26794 case Intrinsic::x86_sse42_pcmpestriz128: {
26795 unsigned Opcode;
26796 X86::CondCode X86CC;
26797 switch (IntNo) {
26798 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26799 case Intrinsic::x86_sse42_pcmpistria128:
26800 Opcode = X86ISD::PCMPISTR;
26801 X86CC = X86::COND_A;
26802 break;
26803 case Intrinsic::x86_sse42_pcmpestria128:
26804 Opcode = X86ISD::PCMPESTR;
26805 X86CC = X86::COND_A;
26806 break;
26807 case Intrinsic::x86_sse42_pcmpistric128:
26808 Opcode = X86ISD::PCMPISTR;
26809 X86CC = X86::COND_B;
26810 break;
26811 case Intrinsic::x86_sse42_pcmpestric128:
26812 Opcode = X86ISD::PCMPESTR;
26813 X86CC = X86::COND_B;
26814 break;
26815 case Intrinsic::x86_sse42_pcmpistrio128:
26816 Opcode = X86ISD::PCMPISTR;
26817 X86CC = X86::COND_O;
26818 break;
26819 case Intrinsic::x86_sse42_pcmpestrio128:
26820 Opcode = X86ISD::PCMPESTR;
26821 X86CC = X86::COND_O;
26822 break;
26823 case Intrinsic::x86_sse42_pcmpistris128:
26824 Opcode = X86ISD::PCMPISTR;
26825 X86CC = X86::COND_S;
26826 break;
26827 case Intrinsic::x86_sse42_pcmpestris128:
26828 Opcode = X86ISD::PCMPESTR;
26829 X86CC = X86::COND_S;
26830 break;
26831 case Intrinsic::x86_sse42_pcmpistriz128:
26832 Opcode = X86ISD::PCMPISTR;
26833 X86CC = X86::COND_E;
26834 break;
26835 case Intrinsic::x86_sse42_pcmpestriz128:
26836 Opcode = X86ISD::PCMPESTR;
26837 X86CC = X86::COND_E;
26838 break;
26839 }
26840 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26841 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26842 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26843 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26844 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26845 }
26846
26847 case Intrinsic::x86_sse42_pcmpistri128:
26848 case Intrinsic::x86_sse42_pcmpestri128: {
26849 unsigned Opcode;
26850 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26851 Opcode = X86ISD::PCMPISTR;
26852 else
26853 Opcode = X86ISD::PCMPESTR;
26854
26855 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26856 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26857 return DAG.getNode(Opcode, dl, VTs, NewOps);
26858 }
26859
26860 case Intrinsic::x86_sse42_pcmpistrm128:
26861 case Intrinsic::x86_sse42_pcmpestrm128: {
26862 unsigned Opcode;
26863 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26864 Opcode = X86ISD::PCMPISTR;
26865 else
26866 Opcode = X86ISD::PCMPESTR;
26867
26868 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26869 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26870 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26871 }
26872
26873 case Intrinsic::eh_sjlj_lsda: {
26874 MachineFunction &MF = DAG.getMachineFunction();
26875 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26876 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26877 auto &Context = MF.getContext();
26878 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26879 Twine(MF.getFunctionNumber()));
26880 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26881 DAG.getMCSymbol(S, PtrVT));
26882 }
26883
26884 case Intrinsic::x86_seh_lsda: {
26885 // Compute the symbol for the LSDA. We know it'll get emitted later.
26886 MachineFunction &MF = DAG.getMachineFunction();
26887 SDValue Op1 = Op.getOperand(1);
26888 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26889 MCSymbol *LSDASym = MF.getContext().getOrCreateLSDASymbol(
26890 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26891
26892 // Generate a simple absolute symbol reference. This intrinsic is only
26893 // supported on 32-bit Windows, which isn't PIC.
26894 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26895 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26896 }
26897
26898 case Intrinsic::eh_recoverfp: {
26899 SDValue FnOp = Op.getOperand(1);
26900 SDValue IncomingFPOp = Op.getOperand(2);
26901 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26902 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26903 if (!Fn)
26905 "llvm.eh.recoverfp must take a function as the first argument");
26906 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26907 }
26908
26909 case Intrinsic::localaddress: {
26910 // Returns one of the stack, base, or frame pointer registers, depending on
26911 // which is used to reference local variables.
26912 MachineFunction &MF = DAG.getMachineFunction();
26913 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26914 unsigned Reg;
26915 if (RegInfo->hasBasePointer(MF))
26916 Reg = RegInfo->getBaseRegister();
26917 else { // Handles the SP or FP case.
26918 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26919 if (CantUseFP)
26920 Reg = RegInfo->getPtrSizedStackRegister(MF);
26921 else
26922 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26923 }
26924 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26925 }
26926 case Intrinsic::x86_avx512_vp2intersect_q_512:
26927 case Intrinsic::x86_avx512_vp2intersect_q_256:
26928 case Intrinsic::x86_avx512_vp2intersect_q_128:
26929 case Intrinsic::x86_avx512_vp2intersect_d_512:
26930 case Intrinsic::x86_avx512_vp2intersect_d_256:
26931 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26932 MVT MaskVT = Op.getSimpleValueType();
26933
26934 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26935 SDLoc DL(Op);
26936
26937 SDValue Operation =
26938 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26939 Op->getOperand(1), Op->getOperand(2));
26940
26941 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26942 MaskVT, Operation);
26943 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26944 MaskVT, Operation);
26945 return DAG.getMergeValues({Result0, Result1}, DL);
26946 }
26947 case Intrinsic::x86_mmx_pslli_w:
26948 case Intrinsic::x86_mmx_pslli_d:
26949 case Intrinsic::x86_mmx_pslli_q:
26950 case Intrinsic::x86_mmx_psrli_w:
26951 case Intrinsic::x86_mmx_psrli_d:
26952 case Intrinsic::x86_mmx_psrli_q:
26953 case Intrinsic::x86_mmx_psrai_w:
26954 case Intrinsic::x86_mmx_psrai_d: {
26955 SDLoc DL(Op);
26956 SDValue ShAmt = Op.getOperand(2);
26957 // If the argument is a constant, convert it to a target constant.
26958 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26959 // Clamp out of bounds shift amounts since they will otherwise be masked
26960 // to 8-bits which may make it no longer out of bounds.
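      // E.g. a shift amount of 256 truncated to 8 bits would become 0 (an
      // in-range shift); clamping to 255 keeps it out of range so the shift
      // still yields the expected all-zero (or sign-fill) result.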
26961 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26962 if (ShiftAmount == 0)
26963 return Op.getOperand(1);
26964
26965 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26966 Op.getOperand(0), Op.getOperand(1),
26967 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26968 }
26969
26970 unsigned NewIntrinsic;
26971 switch (IntNo) {
26972 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26973 case Intrinsic::x86_mmx_pslli_w:
26974 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26975 break;
26976 case Intrinsic::x86_mmx_pslli_d:
26977 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26978 break;
26979 case Intrinsic::x86_mmx_pslli_q:
26980 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26981 break;
26982 case Intrinsic::x86_mmx_psrli_w:
26983 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26984 break;
26985 case Intrinsic::x86_mmx_psrli_d:
26986 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26987 break;
26988 case Intrinsic::x86_mmx_psrli_q:
26989 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26990 break;
26991 case Intrinsic::x86_mmx_psrai_w:
26992 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26993 break;
26994 case Intrinsic::x86_mmx_psrai_d:
26995 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26996 break;
26997 }
26998
26999 // The vector shift intrinsics with scalars use 32b shift amounts but
27000 // the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an
27001 // MMX register.
27002 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27003 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27004 DAG.getTargetConstant(NewIntrinsic, DL,
27005 getPointerTy(DAG.getDataLayout())),
27006 Op.getOperand(1), ShAmt);
27007 }
27008 case Intrinsic::thread_pointer: {
27009 if (Subtarget.isTargetELF()) {
27010 SDLoc dl(Op);
27011 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27012 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27013 Value *Ptr = Constant::getNullValue(PointerType::get(
27014 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27015 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27016 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27017 }
27018 report_fatal_error(
27019 "Target OS doesn't support __builtin_thread_pointer() yet.");
27020 }
27021 }
27022}
27023
27024static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27025 SDValue Src, SDValue Mask, SDValue Base,
27026 SDValue Index, SDValue ScaleOp, SDValue Chain,
27027 const X86Subtarget &Subtarget) {
27028 SDLoc dl(Op);
27029 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27030 // Scale must be constant.
27031 if (!C)
27032 return SDValue();
27033 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27034 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27035 TLI.getPointerTy(DAG.getDataLayout()));
27036 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27037 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27038 // If source is undef or we know it won't be used, use a zero vector
27039 // to break register dependency.
27040 // TODO: use undef instead and let BreakFalseDeps deal with it?
27041 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27042 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27043
27044 // Cast mask to an integer type.
27045 Mask = DAG.getBitcast(MaskVT, Mask);
27046
27047 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27048
27049 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27050 SDValue Res =
27051 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27052 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27053 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27054}
27055
27056static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27057 SDValue Src, SDValue Mask, SDValue Base,
27058 SDValue Index, SDValue ScaleOp, SDValue Chain,
27059 const X86Subtarget &Subtarget) {
27060 MVT VT = Op.getSimpleValueType();
27061 SDLoc dl(Op);
27062 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27063 // Scale must be constant.
27064 if (!C)
27065 return SDValue();
27066 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27067 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27068 TLI.getPointerTy(DAG.getDataLayout()));
27069 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27070 VT.getVectorNumElements());
27071 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27072
27073 // We support two versions of the gather intrinsics. One with scalar mask and
27074 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27075 if (Mask.getValueType() != MaskVT)
27076 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27077
27078 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27079 // If source is undef or we know it won't be used, use a zero vector
27080 // to break register dependency.
27081 // TODO: use undef instead and let BreakFalseDeps deal with it?
27082 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27083 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27084
27085 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27086
27087 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27088 SDValue Res =
27089 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27090 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27091 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27092}
27093
27094static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27095 SDValue Src, SDValue Mask, SDValue Base,
27096 SDValue Index, SDValue ScaleOp, SDValue Chain,
27097 const X86Subtarget &Subtarget) {
27098 SDLoc dl(Op);
27099 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27100 // Scale must be constant.
27101 if (!C)
27102 return SDValue();
27103 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27104 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27105 TLI.getPointerTy(DAG.getDataLayout()));
27106 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27107 Src.getSimpleValueType().getVectorNumElements());
27108 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27109
27110 // We support two versions of the scatter intrinsics. One with scalar mask and
27111 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27112 if (Mask.getValueType() != MaskVT)
27113 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27114
27115 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27116
27117 SDVTList VTs = DAG.getVTList(MVT::Other);
27118 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27119 SDValue Res =
27120 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27121 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27122 return Res;
27123}
27124
27125static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27126 SDValue Mask, SDValue Base, SDValue Index,
27127 SDValue ScaleOp, SDValue Chain,
27128 const X86Subtarget &Subtarget) {
27129 SDLoc dl(Op);
27130 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27131 // Scale must be constant.
27132 if (!C)
27133 return SDValue();
27134 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27135 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27136 TLI.getPointerTy(DAG.getDataLayout()));
27137 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27138 SDValue Segment = DAG.getRegister(0, MVT::i32);
27139 MVT MaskVT =
27140 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27141 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27142 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27143 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27144 return SDValue(Res, 0);
27145}
27146
27147/// Handles the lowering of builtin intrinsics with chain that return their
27148/// value into registers EDX:EAX.
27149/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27150/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27151/// TargetOpcode.
27152/// Returns a Glue value which can be used to add extra copy-from-reg if the
27153/// expanded intrinsic implicitly defines extra registers (i.e. not just
27154/// EDX:EAX).
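///
/// Sketch of the flow: the helper emits the machine node, copies EAX/EDX
/// (RAX/RDX on 64-bit) out of it, merges the two halves into one 64-bit value
/// (shift+or on 64-bit targets, BUILD_PAIR on 32-bit), and appends that value
/// and the updated chain to Results.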
27155static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27156 SelectionDAG &DAG,
27157 unsigned TargetOpcode,
27158 unsigned SrcReg,
27159 const X86Subtarget &Subtarget,
27160 SmallVectorImpl<SDValue> &Results) {
27161 SDValue Chain = N->getOperand(0);
27162 SDValue Glue;
27163
27164 if (SrcReg) {
27165 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27166 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27167 Glue = Chain.getValue(1);
27168 }
27169
27170 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27171 SDValue N1Ops[] = {Chain, Glue};
27172 SDNode *N1 = DAG.getMachineNode(
27173 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27174 Chain = SDValue(N1, 0);
27175
27176 // Reads the content of XCR and returns it in registers EDX:EAX.
27177 SDValue LO, HI;
27178 if (Subtarget.is64Bit()) {
27179 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27180 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27181 LO.getValue(2));
27182 } else {
27183 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27184 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27185 LO.getValue(2));
27186 }
27187 Chain = HI.getValue(1);
27188 Glue = HI.getValue(2);
27189
27190 if (Subtarget.is64Bit()) {
27191 // Merge the two 32-bit values into a 64-bit one.
27192 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27193 DAG.getConstant(32, DL, MVT::i8));
27194 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27195 Results.push_back(Chain);
27196 return Glue;
27197 }
27198
27199 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27200 SDValue Ops[] = { LO, HI };
27201 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27202 Results.push_back(Pair);
27203 Results.push_back(Chain);
27204 return Glue;
27205}
27206
27207/// Handles the lowering of builtin intrinsics that read the time stamp counter
27208/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27209/// READCYCLECOUNTER nodes.
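///
/// For RDTSCP, the code below additionally copies out ECX (which the
/// instruction loads with IA32_TSC_AUX) and inserts it into Results ahead of
/// the updated chain.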
27210static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27211 SelectionDAG &DAG,
27212 const X86Subtarget &Subtarget,
27213 SmallVectorImpl<SDValue> &Results) {
27214 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27215 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27216 // and the EAX register is loaded with the low-order 32 bits.
27217 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27218 /* NoRegister */0, Subtarget,
27219 Results);
27220 if (Opcode != X86::RDTSCP)
27221 return;
27222
27223 SDValue Chain = Results[1];
27224 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
27225 // the ECX register. Add 'ecx' explicitly to the chain.
27226 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27227 Results[1] = ecx;
27228 Results.push_back(ecx.getValue(1));
27229}
27230
27231static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27232 SelectionDAG &DAG) {
27233 SmallVector<SDValue, 3> Results;
27234 SDLoc DL(Op);
27235 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27236 Results);
27237 return DAG.getMergeValues(Results, DL);
27238}
27239
27240static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27241 MachineFunction &MF = DAG.getMachineFunction();
27242 SDValue Chain = Op.getOperand(0);
27243 SDValue RegNode = Op.getOperand(2);
27244 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27245 if (!EHInfo)
27246 report_fatal_error("EH registrations only live in functions using WinEH");
27247
27248 // Cast the operand to an alloca, and remember the frame index.
27249 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27250 if (!FINode)
27251 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27252 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27253
27254 // Return the chain operand without making any DAG nodes.
27255 return Chain;
27256}
27257
27258static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27259 MachineFunction &MF = DAG.getMachineFunction();
27260 SDValue Chain = Op.getOperand(0);
27261 SDValue EHGuard = Op.getOperand(2);
27262 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27263 if (!EHInfo)
27264 report_fatal_error("EHGuard only live in functions using WinEH");
27265
27266 // Cast the operand to an alloca, and remember the frame index.
27267 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27268 if (!FINode)
27269 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27270 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27271
27272 // Return the chain operand without making any DAG nodes.
27273 return Chain;
27274}
27275
27276/// Emit Truncating Store with signed or unsigned saturation.
27277static SDValue
27278EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27279 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27280 SelectionDAG &DAG) {
27281 SDVTList VTs = DAG.getVTList(MVT::Other);
27282 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27283 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27284 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27285 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27286}
27287
27288/// Emit Masked Truncating Store with signed or unsigned saturation.
27289static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27290 const SDLoc &DL,
27291 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27292 MachineMemOperand *MMO, SelectionDAG &DAG) {
27293 SDVTList VTs = DAG.getVTList(MVT::Other);
27294 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27295 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27296 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27297}
27298
27299bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27300 const MachineFunction &MF) {
27301 if (!Subtarget.is64Bit())
27302 return false;
27303 // 64-bit targets support extended Swift async frame setup,
27304 // except for targets that use the windows 64 prologue.
27305 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27306}
27307
27308static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27309 SelectionDAG &DAG) {
27310 unsigned IntNo = Op.getConstantOperandVal(1);
27311 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27312 if (!IntrData) {
27313 switch (IntNo) {
27314
27315 case Intrinsic::swift_async_context_addr: {
27316 SDLoc dl(Op);
27317 auto &MF = DAG.getMachineFunction();
27318 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27319 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27321 X86FI->setHasSwiftAsyncContext(true);
27322 SDValue Chain = Op->getOperand(0);
27323 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27324 SDValue Result =
27325 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27326 DAG.getTargetConstant(8, dl, MVT::i32)),
27327 0);
27328 // Return { result, chain }.
27329 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27330 CopyRBP.getValue(1));
27331 } else {
27332 // No special extended frame, create or reuse an existing stack slot.
27333 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27334 if (!X86FI->getSwiftAsyncContextFrameIdx())
27335 X86FI->setSwiftAsyncContextFrameIdx(
27336 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27337 false));
27338 SDValue Result =
27339 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27340 PtrSize == 8 ? MVT::i64 : MVT::i32);
27341 // Return { result, chain }.
27342 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27343 Op->getOperand(0));
27344 }
27345 }
27346
27347 case llvm::Intrinsic::x86_seh_ehregnode:
27348 return MarkEHRegistrationNode(Op, DAG);
27349 case llvm::Intrinsic::x86_seh_ehguard:
27350 return MarkEHGuard(Op, DAG);
27351 case llvm::Intrinsic::x86_rdpkru: {
27352 SDLoc dl(Op);
27353 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27354 // Create a RDPKRU node and pass 0 to the ECX parameter.
27355 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27356 DAG.getConstant(0, dl, MVT::i32));
27357 }
27358 case llvm::Intrinsic::x86_wrpkru: {
27359 SDLoc dl(Op);
27360 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27361 // to the EDX and ECX parameters.
27362 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27363 Op.getOperand(0), Op.getOperand(2),
27364 DAG.getConstant(0, dl, MVT::i32),
27365 DAG.getConstant(0, dl, MVT::i32));
27366 }
27367 case llvm::Intrinsic::asan_check_memaccess: {
27368 // Mark this as adjustsStack because it will be lowered to a call.
27369 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27370 // Don't do anything here, we will expand these intrinsics out later.
27371 return Op;
27372 }
27373 case llvm::Intrinsic::x86_flags_read_u32:
27374 case llvm::Intrinsic::x86_flags_read_u64:
27375 case llvm::Intrinsic::x86_flags_write_u32:
27376 case llvm::Intrinsic::x86_flags_write_u64: {
27377 // We need a frame pointer because this will get lowered to a PUSH/POP
27378 // sequence.
27379 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27380 MFI.setHasCopyImplyingStackAdjustment(true);
27381 // Don't do anything here, we will expand these intrinsics out later
27382 // during FinalizeISel in EmitInstrWithCustomInserter.
27383 return Op;
27384 }
27385 case Intrinsic::x86_lwpins32:
27386 case Intrinsic::x86_lwpins64:
27387 case Intrinsic::x86_umwait:
27388 case Intrinsic::x86_tpause: {
27389 SDLoc dl(Op);
27390 SDValue Chain = Op->getOperand(0);
27391 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27392 unsigned Opcode;
27393
27394 switch (IntNo) {
27395 default: llvm_unreachable("Impossible intrinsic");
27396 case Intrinsic::x86_umwait:
27397 Opcode = X86ISD::UMWAIT;
27398 break;
27399 case Intrinsic::x86_tpause:
27400 Opcode = X86ISD::TPAUSE;
27401 break;
27402 case Intrinsic::x86_lwpins32:
27403 case Intrinsic::x86_lwpins64:
27404 Opcode = X86ISD::LWPINS;
27405 break;
27406 }
27407
27408 SDValue Operation =
27409 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27410 Op->getOperand(3), Op->getOperand(4));
27411 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27412 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27413 Operation.getValue(1));
27414 }
27415 case Intrinsic::x86_enqcmd:
27416 case Intrinsic::x86_enqcmds: {
27417 SDLoc dl(Op);
27418 SDValue Chain = Op.getOperand(0);
27419 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27420 unsigned Opcode;
27421 switch (IntNo) {
27422 default: llvm_unreachable("Impossible intrinsic!");
27423 case Intrinsic::x86_enqcmd:
27424 Opcode = X86ISD::ENQCMD;
27425 break;
27426 case Intrinsic::x86_enqcmds:
27427 Opcode = X86ISD::ENQCMDS;
27428 break;
27429 }
27430 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27431 Op.getOperand(3));
27432 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27433 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27434 Operation.getValue(1));
27435 }
27436 case Intrinsic::x86_aesenc128kl:
27437 case Intrinsic::x86_aesdec128kl:
27438 case Intrinsic::x86_aesenc256kl:
27439 case Intrinsic::x86_aesdec256kl: {
27440 SDLoc DL(Op);
27441 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27442 SDValue Chain = Op.getOperand(0);
27443 unsigned Opcode;
27444
27445 switch (IntNo) {
27446 default: llvm_unreachable("Impossible intrinsic");
27447 case Intrinsic::x86_aesenc128kl:
27448 Opcode = X86ISD::AESENC128KL;
27449 break;
27450 case Intrinsic::x86_aesdec128kl:
27451 Opcode = X86ISD::AESDEC128KL;
27452 break;
27453 case Intrinsic::x86_aesenc256kl:
27454 Opcode = X86ISD::AESENC256KL;
27455 break;
27456 case Intrinsic::x86_aesdec256kl:
27457 Opcode = X86ISD::AESDEC256KL;
27458 break;
27459 }
27460
27461 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27462 MachineMemOperand *MMO = MemIntr->getMemOperand();
27463 EVT MemVT = MemIntr->getMemoryVT();
27464 SDValue Operation = DAG.getMemIntrinsicNode(
27465 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27466 MMO);
27467 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27468
27469 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27470 {ZF, Operation.getValue(0), Operation.getValue(2)});
27471 }
27472 case Intrinsic::x86_aesencwide128kl:
27473 case Intrinsic::x86_aesdecwide128kl:
27474 case Intrinsic::x86_aesencwide256kl:
27475 case Intrinsic::x86_aesdecwide256kl: {
27476 SDLoc DL(Op);
27477 SDVTList VTs = DAG.getVTList(
27478 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27479 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27480 SDValue Chain = Op.getOperand(0);
27481 unsigned Opcode;
27482
27483 switch (IntNo) {
27484 default: llvm_unreachable("Impossible intrinsic");
27485 case Intrinsic::x86_aesencwide128kl:
27486 Opcode = X86ISD::AESENCWIDE128KL;
27487 break;
27488 case Intrinsic::x86_aesdecwide128kl:
27489 Opcode = X86ISD::AESDECWIDE128KL;
27490 break;
27491 case Intrinsic::x86_aesencwide256kl:
27492 Opcode = X86ISD::AESENCWIDE256KL;
27493 break;
27494 case Intrinsic::x86_aesdecwide256kl:
27495 Opcode = X86ISD::AESDECWIDE256KL;
27496 break;
27497 }
27498
27499 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27500 MachineMemOperand *MMO = MemIntr->getMemOperand();
27501 EVT MemVT = MemIntr->getMemoryVT();
27502 SDValue Operation = DAG.getMemIntrinsicNode(
27503 Opcode, DL, VTs,
27504 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27505 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27506 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27507 MemVT, MMO);
27508 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27509
27510 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27511 {ZF, Operation.getValue(1), Operation.getValue(2),
27512 Operation.getValue(3), Operation.getValue(4),
27513 Operation.getValue(5), Operation.getValue(6),
27514 Operation.getValue(7), Operation.getValue(8),
27515 Operation.getValue(9)});
27516 }
27517 case Intrinsic::x86_testui: {
27518 SDLoc dl(Op);
27519 SDValue Chain = Op.getOperand(0);
27520 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27521 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27522 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27523 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27524 Operation.getValue(1));
27525 }
27526 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27527 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27528 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27529 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27530 case Intrinsic::x86_t2rpntlvwz0_internal:
27531 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27532 case Intrinsic::x86_t2rpntlvwz1_internal:
27533 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27534 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27535     X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
27536     unsigned IntNo = Op.getConstantOperandVal(1);
27537 unsigned Opc = 0;
27538 switch (IntNo) {
27539 default:
27540 llvm_unreachable("Unexpected intrinsic!");
27541 case Intrinsic::x86_t2rpntlvwz0_internal:
27542 Opc = X86::PT2RPNTLVWZ0V;
27543 break;
27544 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27545 Opc = X86::PT2RPNTLVWZ0T1V;
27546 break;
27547 case Intrinsic::x86_t2rpntlvwz1_internal:
27548 Opc = X86::PT2RPNTLVWZ1V;
27549 break;
27550 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27551 Opc = X86::PT2RPNTLVWZ1T1V;
27552 break;
27553 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27554 Opc = X86::PT2RPNTLVWZ0RSV;
27555 break;
27556 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27557 Opc = X86::PT2RPNTLVWZ0RST1V;
27558 break;
27559 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27560 Opc = X86::PT2RPNTLVWZ1RSV;
27561 break;
27562 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27563 Opc = X86::PT2RPNTLVWZ1RST1V;
27564 break;
27565 }
27566
27567 SDLoc DL(Op);
27568 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27569
27570 SDValue Ops[] = {Op.getOperand(2), // Row
27571 Op.getOperand(3), // Col0
27572 Op.getOperand(4), // Col1
27573 Op.getOperand(5), // Base
27574 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27575 Op.getOperand(6), // Index
27576 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27577 DAG.getRegister(0, MVT::i16), // Segment
27578 Op.getOperand(0)}; // Chain
27579
27580 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27581 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27582 SDValue(Res, 0));
27583 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27584 SDValue(Res, 0));
27585 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27586 }
27587 case Intrinsic::x86_atomic_bts_rm:
27588 case Intrinsic::x86_atomic_btc_rm:
27589 case Intrinsic::x86_atomic_btr_rm: {
27590 SDLoc DL(Op);
27591 MVT VT = Op.getSimpleValueType();
27592 SDValue Chain = Op.getOperand(0);
27593 SDValue Op1 = Op.getOperand(2);
27594 SDValue Op2 = Op.getOperand(3);
27595 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27596                    : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27597                                                            : X86ISD::LBTR_RM;
27598 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27599 SDValue Res =
27600 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27601 {Chain, Op1, Op2}, VT, MMO);
27602 Chain = Res.getValue(1);
27603 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27604 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27605 }
27606 case Intrinsic::x86_atomic_bts:
27607 case Intrinsic::x86_atomic_btc:
27608 case Intrinsic::x86_atomic_btr: {
27609 SDLoc DL(Op);
27610 MVT VT = Op.getSimpleValueType();
27611 SDValue Chain = Op.getOperand(0);
27612 SDValue Op1 = Op.getOperand(2);
27613 SDValue Op2 = Op.getOperand(3);
27614 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
27615 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
27616 : X86ISD::LBTR;
27617 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
27618 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27619 SDValue Res =
27620 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27621 {Chain, Op1, Op2, Size}, VT, MMO);
27622 Chain = Res.getValue(1);
27623 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27624 unsigned Imm = Op2->getAsZExtVal();
27625 if (Imm)
27626 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
27627 DAG.getShiftAmountConstant(Imm, VT, DL));
27628 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27629 }
27630 case Intrinsic::x86_cmpccxadd32:
27631 case Intrinsic::x86_cmpccxadd64: {
27632 SDLoc DL(Op);
27633 SDValue Chain = Op.getOperand(0);
27634 SDValue Addr = Op.getOperand(2);
27635 SDValue Src1 = Op.getOperand(3);
27636 SDValue Src2 = Op.getOperand(4);
27637 SDValue CC = Op.getOperand(5);
27638 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27639     SDValue Operation = DAG.getMemIntrinsicNode(
27640         X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27641 MVT::i32, MMO);
27642 return Operation;
27643 }
27644 case Intrinsic::x86_aadd32:
27645 case Intrinsic::x86_aadd64:
27646 case Intrinsic::x86_aand32:
27647 case Intrinsic::x86_aand64:
27648 case Intrinsic::x86_aor32:
27649 case Intrinsic::x86_aor64:
27650 case Intrinsic::x86_axor32:
27651 case Intrinsic::x86_axor64: {
27652 SDLoc DL(Op);
27653 SDValue Chain = Op.getOperand(0);
27654 SDValue Op1 = Op.getOperand(2);
27655 SDValue Op2 = Op.getOperand(3);
27656 MVT VT = Op2.getSimpleValueType();
27657 unsigned Opc = 0;
27658 switch (IntNo) {
27659 default:
27660 llvm_unreachable("Unknown Intrinsic");
27661 case Intrinsic::x86_aadd32:
27662 case Intrinsic::x86_aadd64:
27663 Opc = X86ISD::AADD;
27664 break;
27665 case Intrinsic::x86_aand32:
27666 case Intrinsic::x86_aand64:
27667 Opc = X86ISD::AAND;
27668 break;
27669 case Intrinsic::x86_aor32:
27670 case Intrinsic::x86_aor64:
27671 Opc = X86ISD::AOR;
27672 break;
27673 case Intrinsic::x86_axor32:
27674 case Intrinsic::x86_axor64:
27675 Opc = X86ISD::AXOR;
27676 break;
27677 }
27678 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27679 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27680 {Chain, Op1, Op2}, VT, MMO);
27681 }
27682 case Intrinsic::x86_atomic_add_cc:
27683 case Intrinsic::x86_atomic_sub_cc:
27684 case Intrinsic::x86_atomic_or_cc:
27685 case Intrinsic::x86_atomic_and_cc:
27686 case Intrinsic::x86_atomic_xor_cc: {
27687 SDLoc DL(Op);
27688 SDValue Chain = Op.getOperand(0);
27689 SDValue Op1 = Op.getOperand(2);
27690 SDValue Op2 = Op.getOperand(3);
27691 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
27692 MVT VT = Op2.getSimpleValueType();
27693 unsigned Opc = 0;
27694 switch (IntNo) {
27695 default:
27696 llvm_unreachable("Unknown Intrinsic");
27697 case Intrinsic::x86_atomic_add_cc:
27698 Opc = X86ISD::LADD;
27699 break;
27700 case Intrinsic::x86_atomic_sub_cc:
27701 Opc = X86ISD::LSUB;
27702 break;
27703 case Intrinsic::x86_atomic_or_cc:
27704 Opc = X86ISD::LOR;
27705 break;
27706 case Intrinsic::x86_atomic_and_cc:
27707 Opc = X86ISD::LAND;
27708 break;
27709 case Intrinsic::x86_atomic_xor_cc:
27710 Opc = X86ISD::LXOR;
27711 break;
27712 }
27713 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27714 SDValue LockArith =
27715 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27716 {Chain, Op1, Op2}, VT, MMO);
27717 Chain = LockArith.getValue(1);
27718 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
27719 }
27720 }
27721 return SDValue();
27722 }
27723
27724 SDLoc dl(Op);
27725 switch(IntrData->Type) {
27726 default: llvm_unreachable("Unknown Intrinsic Type");
27727 case RDSEED:
27728 case RDRAND: {
27729 // Emit the node with the right value type.
27730 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27731 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27732
27733 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27734     // Otherwise return the value from Rand, which is always 0, cast to i32.
27735 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27736 DAG.getConstant(1, dl, Op->getValueType(1)),
27737 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27738 SDValue(Result.getNode(), 1)};
27739 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27740
27741 // Return { result, isValid, chain }.
27742 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27743 SDValue(Result.getNode(), 2));
27744 }
27745 case GATHER_AVX2: {
27746 SDValue Chain = Op.getOperand(0);
27747 SDValue Src = Op.getOperand(2);
27748 SDValue Base = Op.getOperand(3);
27749 SDValue Index = Op.getOperand(4);
27750 SDValue Mask = Op.getOperand(5);
27751 SDValue Scale = Op.getOperand(6);
27752 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27753 Scale, Chain, Subtarget);
27754 }
27755 case GATHER: {
27756 //gather(v1, mask, index, base, scale);
27757 SDValue Chain = Op.getOperand(0);
27758 SDValue Src = Op.getOperand(2);
27759 SDValue Base = Op.getOperand(3);
27760 SDValue Index = Op.getOperand(4);
27761 SDValue Mask = Op.getOperand(5);
27762 SDValue Scale = Op.getOperand(6);
27763 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27764 Chain, Subtarget);
27765 }
27766 case SCATTER: {
27767 //scatter(base, mask, index, v1, scale);
27768 SDValue Chain = Op.getOperand(0);
27769 SDValue Base = Op.getOperand(2);
27770 SDValue Mask = Op.getOperand(3);
27771 SDValue Index = Op.getOperand(4);
27772 SDValue Src = Op.getOperand(5);
27773 SDValue Scale = Op.getOperand(6);
27774 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27775 Scale, Chain, Subtarget);
27776 }
27777 case PREFETCH: {
27778 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27779 assert((HintVal == 2 || HintVal == 3) &&
27780 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27781 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27782 SDValue Chain = Op.getOperand(0);
27783 SDValue Mask = Op.getOperand(2);
27784 SDValue Index = Op.getOperand(3);
27785 SDValue Base = Op.getOperand(4);
27786 SDValue Scale = Op.getOperand(5);
27787 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27788 Subtarget);
27789 }
27790 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27791 case RDTSC: {
27792     SmallVector<SDValue, 2> Results;
27793     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27794 Results);
27795 return DAG.getMergeValues(Results, dl);
27796 }
27797 // Read Performance Monitoring Counters.
27798 case RDPMC:
27799 // Read Processor Register.
27800 case RDPRU:
27801 // GetExtended Control Register.
27802 case XGETBV: {
27803     SmallVector<SDValue, 2> Results;
27804
27805 // RDPMC uses ECX to select the index of the performance counter to read.
27806 // RDPRU uses ECX to select the processor register to read.
27807 // XGETBV uses ECX to select the index of the XCR register to return.
27808 // The result is stored into registers EDX:EAX.
27809 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27810 Subtarget, Results);
27811 return DAG.getMergeValues(Results, dl);
27812 }
27813 // XTEST intrinsics.
27814 case XTEST: {
27815 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27816 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27817
27818 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27819 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27820 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27821 Ret, SDValue(InTrans.getNode(), 1));
27822 }
27823   case TRUNCATE_TO_MEM_VI8:
27824   case TRUNCATE_TO_MEM_VI16:
27825   case TRUNCATE_TO_MEM_VI32: {
27826 SDValue Mask = Op.getOperand(4);
27827 SDValue DataToTruncate = Op.getOperand(3);
27828 SDValue Addr = Op.getOperand(2);
27829 SDValue Chain = Op.getOperand(0);
27830
27831 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27832 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27833
27834 EVT MemVT = MemIntr->getMemoryVT();
27835
27836 uint16_t TruncationOp = IntrData->Opc0;
27837 switch (TruncationOp) {
27838 case X86ISD::VTRUNC: {
27839 if (isAllOnesConstant(Mask)) // return just a truncate store
27840 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27841 MemIntr->getMemOperand());
27842
27843 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27844 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27845 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27846
27847 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27848 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27849 true /* truncating */);
27850 }
27851 case X86ISD::VTRUNCUS:
27852 case X86ISD::VTRUNCS: {
27853 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27854 if (isAllOnesConstant(Mask))
27855 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27856 MemIntr->getMemOperand(), DAG);
27857
27858 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27859 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27860
27861 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27862 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27863 }
27864 default:
27865 llvm_unreachable("Unsupported truncstore intrinsic");
27866 }
27867 }
27868 case INTR_TYPE_CAST_MMX:
27869 return SDValue(); // handled in combineINTRINSIC_*
27870 }
27871}
27872
27873SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27874 SelectionDAG &DAG) const {
27875   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27876   MFI.setReturnAddressIsTaken(true);
27877
27878   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27879     return SDValue();
27880
27881 unsigned Depth = Op.getConstantOperandVal(0);
27882 SDLoc dl(Op);
27883 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27884
27885 if (Depth > 0) {
27886 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27887 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27888 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27889 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27890                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27891                        MachinePointerInfo());
27892 }
27893
27894 // Just load the return address.
27895 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27896   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27897                      MachinePointerInfo());
27898}
27899
27900SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27901 SelectionDAG &DAG) const {
27902   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27903   return getReturnAddressFrameIndex(DAG);
27904}
27905
27906SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27907   MachineFunction &MF = DAG.getMachineFunction();
27908   MachineFrameInfo &MFI = MF.getFrameInfo();
27909   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27910   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27911 EVT VT = Op.getValueType();
27912
27913 MFI.setFrameAddressIsTaken(true);
27914
27915 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27916 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27917 // is not possible to crawl up the stack without looking at the unwind codes
27918 // simultaneously.
27919 int FrameAddrIndex = FuncInfo->getFAIndex();
27920 if (!FrameAddrIndex) {
27921 // Set up a frame object for the return address.
27922 unsigned SlotSize = RegInfo->getSlotSize();
27923 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27924 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27925 FuncInfo->setFAIndex(FrameAddrIndex);
27926 }
27927 return DAG.getFrameIndex(FrameAddrIndex, VT);
27928 }
27929
27930 unsigned FrameReg =
27931 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27932 SDLoc dl(Op); // FIXME probably not meaningful
27933 unsigned Depth = Op.getConstantOperandVal(0);
27934 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27935 (FrameReg == X86::EBP && VT == MVT::i32)) &&
27936 "Invalid Frame Register!");
27937 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27938 while (Depth--)
27939     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27940                             MachinePointerInfo());
27941 return FrameAddr;
27942}
27943
27944// FIXME? Maybe this could be a TableGen attribute on some registers and
27945// this table could be generated automatically from RegInfo.
27946 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27947                                               const MachineFunction &MF) const {
27948 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27949
27950   Register Reg = StringSwitch<unsigned>(RegName)
27951                      .Case("esp", X86::ESP)
27952 .Case("rsp", X86::RSP)
27953 .Case("ebp", X86::EBP)
27954 .Case("rbp", X86::RBP)
27955 .Case("r14", X86::R14)
27956 .Case("r15", X86::R15)
27957 .Default(0);
27958
27959 if (Reg == X86::EBP || Reg == X86::RBP) {
27960 if (!TFI.hasFP(MF))
27961 report_fatal_error("register " + StringRef(RegName) +
27962 " is allocatable: function has no frame pointer");
27963#ifndef NDEBUG
27964 else {
27965 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27966 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27967 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27968 "Invalid Frame Register!");
27969 }
27970#endif
27971 }
27972
27973 if (Reg)
27974 return Reg;
27975
27976 report_fatal_error("Invalid register name global variable");
27977}
27978
27979SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27980 SelectionDAG &DAG) const {
27981 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27982 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27983}
27984
27985 Register X86TargetLowering::getExceptionPointerRegister(
27986     const Constant *PersonalityFn) const {
27987 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27988 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27989
27990 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27991}
27992
27993 Register X86TargetLowering::getExceptionSelectorRegister(
27994     const Constant *PersonalityFn) const {
27995 // Funclet personalities don't use selectors (the runtime does the selection).
27996   if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27997     return X86::NoRegister;
27998 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27999}
28000
28001 bool X86TargetLowering::needsFixedCatchObjects() const {
28002   return Subtarget.isTargetWin64();
28003}
28004
28005SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28006 SDValue Chain = Op.getOperand(0);
28007 SDValue Offset = Op.getOperand(1);
28008 SDValue Handler = Op.getOperand(2);
28009 SDLoc dl (Op);
28010
28011 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28012 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28013 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28014 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28015 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28016 "Invalid Frame Register!");
28017 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28018 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28019
28020 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28021 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28022 dl));
28023 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28024 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28025 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28026
28027 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28028 DAG.getRegister(StoreAddrReg, PtrVT));
28029}
28030
28031SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28032 SelectionDAG &DAG) const {
28033 SDLoc DL(Op);
28034 // If the subtarget is not 64bit, we may need the global base reg
28035 // after isel expand pseudo, i.e., after CGBR pass ran.
28036 // Therefore, ask for the GlobalBaseReg now, so that the pass
28037 // inserts the code for us in case we need it.
28038 // Otherwise, we will end up in a situation where we will
28039 // reference a virtual register that is not defined!
28040 if (!Subtarget.is64Bit()) {
28041 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28042 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28043 }
28044 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28045 DAG.getVTList(MVT::i32, MVT::Other),
28046 Op.getOperand(0), Op.getOperand(1));
28047}
28048
28049SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28050 SelectionDAG &DAG) const {
28051 SDLoc DL(Op);
28052 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28053 Op.getOperand(0), Op.getOperand(1));
28054}
28055
28056SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28057 SelectionDAG &DAG) const {
28058 SDLoc DL(Op);
28059 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28060 Op.getOperand(0));
28061}
28062
28063 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28064   return Op.getOperand(0);
28065}
28066
28067SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28068 SelectionDAG &DAG) const {
28069 SDValue Root = Op.getOperand(0);
28070 SDValue Trmp = Op.getOperand(1); // trampoline
28071 SDValue FPtr = Op.getOperand(2); // nested function
28072 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28073 SDLoc dl (Op);
28074
28075 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28076 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28077
28078 if (Subtarget.is64Bit()) {
28079 SDValue OutChains[6];
28080
28081 // Large code-model.
28082 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28083 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28084
28085 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28086 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28087
28088 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28089
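    // Illustrative summary (derived from the stores below): the 24-byte
    // trampoline written at Trmp is laid out, byte by byte, as
    //   [0]  49 BB           movabsq $<FPtr>, %r11
    //   [2]  <8-byte FPtr>   address of the nested function
    //   [10] 49 BA           movabsq $<Nest>, %r10
    //   [12] <8-byte Nest>   'nest' parameter value
    //   [20] 49 FF           jmpq *%r11 (REX prefix + opcode)
    //   [22] E3              ModRM byte selecting %r11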
28090 // Load the pointer to the nested function into R11.
28091 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28092 SDValue Addr = Trmp;
28093 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28094 Addr, MachinePointerInfo(TrmpAddr));
28095
28096 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28097 DAG.getConstant(2, dl, MVT::i64));
28098 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28099 MachinePointerInfo(TrmpAddr, 2), Align(2));
28100
28101 // Load the 'nest' parameter value into R10.
28102 // R10 is specified in X86CallingConv.td
28103 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28104 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28105 DAG.getConstant(10, dl, MVT::i64));
28106 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28107 Addr, MachinePointerInfo(TrmpAddr, 10));
28108
28109 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28110 DAG.getConstant(12, dl, MVT::i64));
28111 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28112 MachinePointerInfo(TrmpAddr, 12), Align(2));
28113
28114 // Jump to the nested function.
28115 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28116 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28117 DAG.getConstant(20, dl, MVT::i64));
28118 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28119 Addr, MachinePointerInfo(TrmpAddr, 20));
28120
28121 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28122 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28123 DAG.getConstant(22, dl, MVT::i64));
28124 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28125 Addr, MachinePointerInfo(TrmpAddr, 22));
28126
28127 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28128 } else {
28129 const Function *Func =
28130 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28131 CallingConv::ID CC = Func->getCallingConv();
28132 unsigned NestReg;
28133
28134 switch (CC) {
28135 default:
28136 llvm_unreachable("Unsupported calling convention");
28137 case CallingConv::C:
28138     case CallingConv::X86_StdCall: {
28139       // Pass 'nest' parameter in ECX.
28140 // Must be kept in sync with X86CallingConv.td
28141 NestReg = X86::ECX;
28142
28143 // Check that ECX wasn't needed by an 'inreg' parameter.
28144 FunctionType *FTy = Func->getFunctionType();
28145 const AttributeList &Attrs = Func->getAttributes();
28146
28147 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28148 unsigned InRegCount = 0;
28149 unsigned Idx = 0;
28150
28151 for (FunctionType::param_iterator I = FTy->param_begin(),
28152 E = FTy->param_end(); I != E; ++I, ++Idx)
28153 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28154 const DataLayout &DL = DAG.getDataLayout();
28155 // FIXME: should only count parameters that are lowered to integers.
28156 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28157 }
28158
28159 if (InRegCount > 2) {
28160 report_fatal_error("Nest register in use - reduce number of inreg"
28161 " parameters!");
28162 }
28163 }
28164 break;
28165 }
28166     case CallingConv::X86_FastCall:
28167     case CallingConv::X86_ThisCall:
28168     case CallingConv::Fast:
28169 case CallingConv::Tail:
28170     case CallingConv::SwiftTail:
28171       // Pass 'nest' parameter in EAX.
28172 // Must be kept in sync with X86CallingConv.td
28173 NestReg = X86::EAX;
28174 break;
28175 }
28176
28177 SDValue OutChains[4];
28178 SDValue Addr, Disp;
28179
28180 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28181 DAG.getConstant(10, dl, MVT::i32));
28182 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28183
28184 // This is storing the opcode for MOV32ri.
28185 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28186 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28187 OutChains[0] =
28188 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28189 Trmp, MachinePointerInfo(TrmpAddr));
28190
28191 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28192 DAG.getConstant(1, dl, MVT::i32));
28193 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28194 MachinePointerInfo(TrmpAddr, 1), Align(1));
28195
28196 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28197 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28198 DAG.getConstant(5, dl, MVT::i32));
28199 OutChains[2] =
28200 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28201 MachinePointerInfo(TrmpAddr, 5), Align(1));
28202
28203 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28204 DAG.getConstant(6, dl, MVT::i32));
28205 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28206 MachinePointerInfo(TrmpAddr, 6), Align(1));
28207
28208 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28209 }
28210}
28211
28212SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28213 SelectionDAG &DAG) const {
28214 /*
28215 The rounding mode is in bits 11:10 of FPSR, and has the following
28216 settings:
28217 00 Round to nearest
28218 01 Round to -inf
28219 10 Round to +inf
28220 11 Round to 0
28221
28222 GET_ROUNDING, on the other hand, expects the following:
28223 -1 Undefined
28224 0 Round to 0
28225 1 Round to nearest
28226 2 Round to +inf
28227 3 Round to -inf
28228
28229 To perform the conversion, we use a packed lookup table of the four 2-bit
28230     values that we can index by FPSR[11:10]
28231 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28232
28233 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28234 */
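  // Worked example of the lookup above (illustrative): if FPSR[11:10] == 01
  // (round to -inf), then FPSR & 0xc00 == 0x400 and 0x400 >> 9 == 2, so
  // (0x2d >> 2) & 3 == 3, the GET_ROUNDING encoding for "round to -inf".
  // For FPSR[11:10] == 00 (round to nearest), (0x2d >> 0) & 3 == 1.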
28235
28236   MachineFunction &MF = DAG.getMachineFunction();
28237   MVT VT = Op.getSimpleValueType();
28238 SDLoc DL(Op);
28239
28240 // Save FP Control Word to stack slot
28241 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28242 SDValue StackSlot =
28243 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28244
28245   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28246
28247 SDValue Chain = Op.getOperand(0);
28248 SDValue Ops[] = {Chain, StackSlot};
28249   Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28250                                   DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28251                                   MachineMemOperand::MOStore);
28252
28253 // Load FP Control Word from stack slot
28254 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28255 Chain = CWD.getValue(1);
28256
28257 // Mask and turn the control bits into a shift for the lookup table.
28258 SDValue Shift =
28259 DAG.getNode(ISD::SRL, DL, MVT::i16,
28260 DAG.getNode(ISD::AND, DL, MVT::i16,
28261 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28262 DAG.getConstant(9, DL, MVT::i8));
28263 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28264
28265 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28266 SDValue RetVal =
28267 DAG.getNode(ISD::AND, DL, MVT::i32,
28268 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28269 DAG.getConstant(3, DL, MVT::i32));
28270
28271 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28272
28273 return DAG.getMergeValues({RetVal, Chain}, DL);
28274}
28275
28276SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28277 SelectionDAG &DAG) const {
28278   MachineFunction &MF = DAG.getMachineFunction();
28279   SDLoc DL(Op);
28280 SDValue Chain = Op.getNode()->getOperand(0);
28281
28282 // FP control word may be set only from data in memory. So we need to allocate
28283 // stack space to save/load FP control word.
28284 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28285 SDValue StackSlot =
28286 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28287   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28288   MachineMemOperand *MMO =
28289       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28290
28291 // Store FP control word into memory.
28292 SDValue Ops[] = {Chain, StackSlot};
28293 Chain = DAG.getMemIntrinsicNode(
28294 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28295
28296 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28297 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28298 Chain = CWD.getValue(1);
28299 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28300 DAG.getConstant(0xf3ff, DL, MVT::i16));
28301
28302 // Calculate new rounding mode.
28303 SDValue NewRM = Op.getNode()->getOperand(1);
28304 SDValue RMBits;
28305 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28306 uint64_t RM = CVal->getZExtValue();
28307 int FieldVal;
28308 switch (static_cast<RoundingMode>(RM)) {
28309 // clang-format off
28310 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28311 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28312 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28313 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28314 default:
28315 llvm_unreachable("rounding mode is not supported by X86 hardware");
28316 // clang-format on
28317 }
28318 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28319 } else {
28320 // Need to convert argument into bits of control word:
28321 // 0 Round to 0 -> 11
28322 // 1 Round to nearest -> 00
28323 // 2 Round to +inf -> 10
28324 // 3 Round to -inf -> 01
28325 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
28326 // To make the conversion, put all these values into a value 0xc9 and shift
28327 // it left depending on the rounding mode:
28328 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28329 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28330 // ...
28331 // (0xc9 << (2 * NewRM + 4)) & 0xc00
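    // Worked example (illustrative): for NewRM == 2 (round to +inf) the shift
    // amount is 2 * 2 + 4 == 8 and (0xc9 << 8) & 0xc00 == 0x800, i.e. the x87
    // RM field 10 (round to +inf). For NewRM == 0 (round to 0) the shift is 4
    // and (0xc9 << 4) & 0xc00 == 0xc00, i.e. RM field 11 (round toward zero).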
28332 SDValue ShiftValue =
28333 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28334 DAG.getNode(ISD::ADD, DL, MVT::i32,
28335 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28336 DAG.getConstant(1, DL, MVT::i8)),
28337 DAG.getConstant(4, DL, MVT::i32)));
28338 SDValue Shifted =
28339 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28340 ShiftValue);
28341 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28342 DAG.getConstant(0xc00, DL, MVT::i16));
28343 }
28344
28345 // Update rounding mode bits and store the new FP Control Word into stack.
28346 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28347 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28348
28349 // Load FP control word from the slot.
28350 SDValue OpsLD[] = {Chain, StackSlot};
28351   MachineMemOperand *MMOL =
28352       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28353 Chain = DAG.getMemIntrinsicNode(
28354 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28355
28356 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28357 // same way but in bits 14:13.
28358 if (Subtarget.hasSSE1()) {
28359 // Store MXCSR into memory.
28360 Chain = DAG.getNode(
28361 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28362 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28363 StackSlot);
28364
28365 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28366 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28367 Chain = CWD.getValue(1);
28368 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28369 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28370
28371 // Shift X87 RM bits from 11:10 to 14:13.
28372 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28373 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28374 DAG.getConstant(3, DL, MVT::i8));
28375
28376 // Update rounding mode bits and store the new FP Control Word into stack.
28377 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28378 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28379
28380 // Load MXCSR from the slot.
28381 Chain = DAG.getNode(
28382 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28383 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28384 StackSlot);
28385 }
28386
28387 return Chain;
28388}
28389
28390const unsigned X87StateSize = 28;
28391const unsigned FPStateSize = 32;
28392[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28393
28394SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28395 SelectionDAG &DAG) const {
28396   MachineFunction &MF = DAG.getMachineFunction();
28397   SDLoc DL(Op);
28398 SDValue Chain = Op->getOperand(0);
28399 SDValue Ptr = Op->getOperand(1);
28400 auto *Node = cast<FPStateAccessSDNode>(Op);
28401 EVT MemVT = Node->getMemoryVT();
28403 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28404
28405   // Get x87 state, if it is present.
28406 if (Subtarget.hasX87()) {
28407 Chain =
28408 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28409 {Chain, Ptr}, MemVT, MMO);
28410
28411 // FNSTENV changes the exception mask, so load back the stored environment.
28412 MachineMemOperand::Flags NewFlags =
28413         MachineMemOperand::MOLoad |
28414         (MMO->getFlags() & ~MachineMemOperand::MOStore);
28415 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28416 Chain =
28417 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28418 {Chain, Ptr}, MemVT, MMO);
28419 }
28420
28421 // If target supports SSE, get MXCSR as well.
28422 if (Subtarget.hasSSE1()) {
28423 // Get pointer to the MXCSR location in memory.
28424     MVT PtrVT = getPointerTy(DAG.getDataLayout());
28425     SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28426 DAG.getConstant(X87StateSize, DL, PtrVT));
28427 // Store MXCSR into memory.
28428 Chain = DAG.getNode(
28429 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28430 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28431 MXCSRAddr);
28432 }
28433
28434 return Chain;
28435}
28436
28437 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28438                                    EVT MemVT, MachineMemOperand *MMO,
28439 SelectionDAG &DAG,
28440 const X86Subtarget &Subtarget) {
28441   // Set x87 state, if it is present.
28442 if (Subtarget.hasX87())
28443 Chain =
28444 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28445 {Chain, Ptr}, MemVT, MMO);
28446 // If target supports SSE, set MXCSR as well.
28447 if (Subtarget.hasSSE1()) {
28448 // Get pointer to the MXCSR location in memory.
28449     MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28450     SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28451 DAG.getConstant(X87StateSize, DL, PtrVT));
28452 // Load MXCSR from memory.
28453 Chain = DAG.getNode(
28454 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28455 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28456 MXCSRAddr);
28457 }
28458 return Chain;
28459}
28460
28461SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28462 SelectionDAG &DAG) const {
28463 SDLoc DL(Op);
28464 SDValue Chain = Op->getOperand(0);
28465 SDValue Ptr = Op->getOperand(1);
28466 auto *Node = cast<FPStateAccessSDNode>(Op);
28467 EVT MemVT = Node->getMemoryVT();
28469 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28470 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28471}
28472
28473SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28474 SelectionDAG &DAG) const {
28475   MachineFunction &MF = DAG.getMachineFunction();
28476   SDLoc DL(Op);
28477 SDValue Chain = Op.getNode()->getOperand(0);
28478
28479 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28480 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28481   SmallVector<Constant *, 8> FPEnvVals;
28482
28483   // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
28484 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28485 // for compatibility with glibc.
28486 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28487 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28488 Constant *Zero = ConstantInt::get(ItemTy, 0);
28489 for (unsigned I = 0; I < 6; ++I)
28490 FPEnvVals.push_back(Zero);
28491
28492   // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
28493   // all exceptions, and sets DAZ and FTZ to 0.
28494 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28495 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28496   MVT PtrVT = getPointerTy(DAG.getDataLayout());
28497   SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28498 MachinePointerInfo MPI =
28499       MachinePointerInfo::getConstantPool(MF);
28500   MachineMemOperand *MMO = MF.getMachineMemOperand(
28501       MPI, MachineMemOperand::MOLoad, X87StateSize, Align(4));
28502
28503 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28504}
28505
28506// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28507uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28508 assert((Amt < 8) && "Shift/Rotation amount out of range");
28509 switch (Opcode) {
28510 case ISD::BITREVERSE:
28511 return 0x8040201008040201ULL;
28512 case ISD::SHL:
28513 return ((0x0102040810204080ULL >> (Amt)) &
28514 (0x0101010101010101ULL * (0xFF >> (Amt))));
28515 case ISD::SRL:
28516 return ((0x0102040810204080ULL << (Amt)) &
28517 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28518 case ISD::SRA:
28519 return (getGFNICtrlImm(ISD::SRL, Amt) |
28520 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28521 case ISD::ROTL:
28522 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28523 case ISD::ROTR:
28524 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28525 }
28526 llvm_unreachable("Unsupported GFNI opcode");
28527}
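// Worked example (illustrative): getGFNICtrlImm(ISD::SHL, 1) evaluates to
// (0x0102040810204080 >> 1) & (0x0101010101010101 * 0x7F)
//   == 0x0081020408102040 & 0x7F7F7F7F7F7F7F7F == 0x0001020408102040,
// the gf2p8affine matrix that shifts every byte left by one bit.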
28528
28529// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28530SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28531 MVT VT, unsigned Amt = 0) {
28532 assert(VT.getVectorElementType() == MVT::i8 &&
28533 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28534 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28535 SmallVector<SDValue> MaskBits;
28536 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28537 uint64_t Bits = (Imm >> (I % 64)) & 255;
28538 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28539 }
28540 return DAG.getBuildVector(VT, DL, MaskBits);
28541}
28542
28543 /// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
28544//
28545// i8/i16 vector implemented using dword LZCNT vector instruction
28546// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28547 // split the vector, perform the operation on its Lo and Hi parts, and
28548// concatenate the results.
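//
// For example (illustrative): for a v16i8 element 0x10, lzcnt(zext32(0x10))
// is 27, and subtracting the 32 - 8 == 24 bits added by the extension gives
// the expected i8 ctlz of 3.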
28549 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28550                                          const X86Subtarget &Subtarget) {
28551 assert(Op.getOpcode() == ISD::CTLZ);
28552 SDLoc dl(Op);
28553 MVT VT = Op.getSimpleValueType();
28554 MVT EltVT = VT.getVectorElementType();
28555 unsigned NumElems = VT.getVectorNumElements();
28556
28557 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28558 "Unsupported element type");
28559
28560   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28561 if (NumElems > 16 ||
28562 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28563 return splitVectorIntUnary(Op, DAG, dl);
28564
28565 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28566 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28567 "Unsupported value type for operation");
28568
28569   // Use the natively supported vector instruction vplzcntd.
28570 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28571 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28572 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28573 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28574
28575 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28576}
28577
28578// Lower CTLZ using a PSHUFB lookup table implementation.
28579 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28580                                        const X86Subtarget &Subtarget,
28581 SelectionDAG &DAG) {
28582 MVT VT = Op.getSimpleValueType();
28583 int NumElts = VT.getVectorNumElements();
28584 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28585 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28586
28587 // Per-nibble leading zero PSHUFB lookup table.
28588 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28589 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28590 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28591 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28592
28593   SmallVector<SDValue, 64> LUTVec;
28594   for (int i = 0; i < NumBytes; ++i)
28595 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28596 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28597
28598 // Begin by bitcasting the input to byte vector, then split those bytes
28599 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28600 // If the hi input nibble is zero then we add both results together, otherwise
28601 // we just take the hi result (by masking the lo result to zero before the
28602 // add).
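  // Illustrative example: for the byte 0x1C the hi nibble is 0x1, so
  // LUT[0x1] == 3 is used directly (HiZ masks the lo result to zero); for the
  // byte 0x03 the hi nibble is zero, so LUT[0x0] + LUT[0x3] == 4 + 2 == 6,
  // the correct leading-zero count in both cases.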
28603 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28604 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28605
28606 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28607 SDValue Lo = Op0;
28608 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28609 SDValue HiZ;
28610 if (CurrVT.is512BitVector()) {
28611 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28612 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28613 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28614 } else {
28615 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28616 }
28617
28618 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
28619 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
28620 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
28621 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
28622
28623 // Merge result back from vXi8 back to VT, working on the lo/hi halves
28624 // of the current vector width in the same way we did for the nibbles.
28625 // If the upper half of the input element is zero then add the halves'
28626 // leading zero counts together, otherwise just use the upper half's.
28627 // Double the width of the result until we are at target width.
28628 while (CurrVT != VT) {
28629 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
28630 int CurrNumElts = CurrVT.getVectorNumElements();
28631 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
28632 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
28633 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
28634
28635 // Check if the upper half of the input element is zero.
28636 if (CurrVT.is512BitVector()) {
28637 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28638 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
28639 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28640 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28641 } else {
28642 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
28643 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28644 }
28645 HiZ = DAG.getBitcast(NextVT, HiZ);
28646
28647 // Move the upper/lower halves to the lower bits as we'll be extending to
28648 // NextVT. Mask the lower result to zero if HiZ is true and add the results
28649 // together.
28650 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
28651 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
28652 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
28653 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
28654 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
28655 CurrVT = NextVT;
28656 }
28657
28658 return Res;
28659}
28660
28661 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
28662                                const X86Subtarget &Subtarget,
28663 SelectionDAG &DAG) {
28664 MVT VT = Op.getSimpleValueType();
28665
28666 if (Subtarget.hasCDI() &&
28667 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28668 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28669 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28670
28671 // Decompose 256-bit ops into smaller 128-bit ops.
28672 if (VT.is256BitVector() && !Subtarget.hasInt256())
28673 return splitVectorIntUnary(Op, DAG, DL);
28674
28675 // Decompose 512-bit ops into smaller 256-bit ops.
28676 if (VT.is512BitVector() && !Subtarget.hasBWI())
28677 return splitVectorIntUnary(Op, DAG, DL);
28678
28679 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28680 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28681}
28682
28683static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28684 SelectionDAG &DAG) {
28685 MVT VT = Op.getSimpleValueType();
28686 MVT OpVT = VT;
28687 unsigned NumBits = VT.getSizeInBits();
28688 SDLoc dl(Op);
28689 unsigned Opc = Op.getOpcode();
28690
28691 if (VT.isVector())
28692 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28693
28694 Op = Op.getOperand(0);
28695 if (VT == MVT::i8) {
28696 // Zero extend to i32 since there is not an i8 bsr.
28697 OpVT = MVT::i32;
28698 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28699 }
28700
28701   // Check if we can safely pass a result through BSR for zero sources.
28702 SDValue PassThru = DAG.getUNDEF(OpVT);
28703 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
28704 !DAG.isKnownNeverZero(Op))
28705 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
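  // Note (illustrative): the pass-through value 2 * NumBits - 1 is chosen so
  // that the final XOR with NumBits - 1 below turns it into NumBits; e.g. for
  // i32 a zero source yields 63 ^ 31 == 32, matching CTLZ(0).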
28706
28707 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28708 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28709 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
28710
28711 // Skip CMOV if we're using a pass through value.
28712 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
28713 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28714 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28715 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28716 Op.getValue(1)};
28717 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28718 }
28719
28720 // Finally xor with NumBits-1.
28721 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28722 DAG.getConstant(NumBits - 1, dl, OpVT));
28723
28724 if (VT == MVT::i8)
28725 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28726 return Op;
28727}
28728
28729static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28730 SelectionDAG &DAG) {
28731 MVT VT = Op.getSimpleValueType();
28732 unsigned NumBits = VT.getScalarSizeInBits();
28733 SDValue N0 = Op.getOperand(0);
28734 SDLoc dl(Op);
28735 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
28736
28737 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28738 "Only scalar CTTZ requires custom lowering");
28739
28740   // Check if we can safely pass a result through BSF for zero sources.
28741 SDValue PassThru = DAG.getUNDEF(VT);
28742 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
28743 PassThru = DAG.getConstant(NumBits, dl, VT);
28744
28745 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28746 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28747 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
28748
28749 // Skip CMOV if src is never zero or we're using a pass through value.
28750 if (NonZeroSrc || !PassThru.isUndef())
28751 return Op;
28752
28753 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28754 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28755 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28756 Op.getValue(1)};
28757 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28758}
28759
28760 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28761                            const X86Subtarget &Subtarget) {
28762 MVT VT = Op.getSimpleValueType();
28763 SDLoc DL(Op);
28764
28765 if (VT == MVT::i16 || VT == MVT::i32)
28766 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
28767
28768 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28769 return splitVectorIntBinary(Op, DAG, DL);
28770
28771 assert(Op.getSimpleValueType().is256BitVector() &&
28772 Op.getSimpleValueType().isInteger() &&
28773 "Only handle AVX 256-bit vector integer operation");
28774 return splitVectorIntBinary(Op, DAG, DL);
28775}
28776
28777 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28778                                   const X86Subtarget &Subtarget) {
28779 MVT VT = Op.getSimpleValueType();
28780 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28781 unsigned Opcode = Op.getOpcode();
28782 SDLoc DL(Op);
28783
28784 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28785 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28786 assert(Op.getSimpleValueType().isInteger() &&
28787 "Only handle AVX vector integer operation");
28788 return splitVectorIntBinary(Op, DAG, DL);
28789 }
28790
28791 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28792 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28793 EVT SetCCResultType =
28794 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28795
28796 unsigned BitWidth = VT.getScalarSizeInBits();
28797 if (Opcode == ISD::USUBSAT) {
28798 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28799 // Handle a special-case with a bit-hack instead of cmp+select:
28800 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28801 // If the target can use VPTERNLOG, DAGToDAG will match this as
28802 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28803 // "broadcast" constant load.
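      // Worked example (illustrative, i8): usubsat 0x90, 0x80 is 0x10, and
      // (0x90 ^ 0x80) & (0x90 s>> 7) == 0x10 & 0xFF == 0x10; for 0x30 < 0x80
      // the arithmetic shift gives 0x00, so the result is 0 as required.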
28804       ConstantSDNode *C = isConstOrConstSplat(Y, true);
28805       if (C && C->getAPIntValue().isSignMask()) {
28806 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28807 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28808 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28809 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28810 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28811 }
28812 }
28813 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28814 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28815 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28816 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28817 // TODO: Move this to DAGCombiner?
28818 if (SetCCResultType == VT &&
28819 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28820 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28821 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28822 }
28823 }
28824
28825 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28826 (!VT.isVector() || VT == MVT::v2i64)) {
28827     APInt MinVal = APInt::getSignedMinValue(BitWidth);
28828     APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28829     SDValue Zero = DAG.getConstant(0, DL, VT);
28830 SDValue Result =
28831 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28832 DAG.getVTList(VT, SetCCResultType), X, Y);
28833 SDValue SumDiff = Result.getValue(0);
28834 SDValue Overflow = Result.getValue(1);
28835 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28836 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28837 SDValue SumNeg =
28838 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28839 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28840 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28841 }
28842
28843 // Use default expansion.
28844 return SDValue();
28845}
28846
28847static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28848 SelectionDAG &DAG) {
28849 MVT VT = Op.getSimpleValueType();
28850 SDLoc DL(Op);
28851
28852 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28853 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28854 // 8-bit integer abs to NEG and CMOV.
28855 SDValue N0 = Op.getOperand(0);
28856 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28857 DAG.getConstant(0, DL, VT), N0);
28858 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28859 SDValue(Neg.getNode(), 1)};
28860 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28861 }
28862
28863 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28864 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28865 SDValue Src = Op.getOperand(0);
28866 SDValue Neg = DAG.getNegative(Src, DL, VT);
28867 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
28868 }
28869
28870 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28871 assert(VT.isInteger() &&
28872 "Only handle AVX 256-bit vector integer operation");
28873 return splitVectorIntUnary(Op, DAG, DL);
28874 }
28875
28876 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28877 return splitVectorIntUnary(Op, DAG, DL);
28878
28879 // Default to expand.
28880 return SDValue();
28881}
28882
28883static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28884 SelectionDAG &DAG) {
28885 MVT VT = Op.getSimpleValueType();
28886 SDLoc DL(Op);
28887
28888 // For AVX1 cases, split to use legal ops.
28889 if (VT.is256BitVector() && !Subtarget.hasInt256())
28890 return splitVectorIntBinary(Op, DAG, DL);
28891
28892 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28893 return splitVectorIntBinary(Op, DAG, DL);
28894
28895 // Default to expand.
28896 return SDValue();
28897}
28898
28899static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
28900 SelectionDAG &DAG) {
28901 MVT VT = Op.getSimpleValueType();
28902 SDLoc DL(Op);
28903
28904 // For AVX1 cases, split to use legal ops.
28905 if (VT.is256BitVector() && !Subtarget.hasInt256())
28906 return splitVectorIntBinary(Op, DAG, DL);
28907
28908 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28909 return splitVectorIntBinary(Op, DAG, DL);
28910
28911 // Default to expand.
28912 return SDValue();
28913}
28914
28915 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
28916                                       SelectionDAG &DAG) {
28917 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28918 EVT VT = Op.getValueType();
28919 SDValue X = Op.getOperand(0);
28920 SDValue Y = Op.getOperand(1);
28921 SDLoc DL(Op);
28922 bool IsMaxOp =
28923 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
28924 bool IsNum =
28925 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
28926 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
28927 unsigned Opc = 0;
28928 if (VT.isVector())
28929 Opc = X86ISD::VMINMAX;
28930 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
28931 Opc = X86ISD::VMINMAXS;
28932
28933 if (Opc) {
28934 SDValue Imm =
28935 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
28936 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
28937 }
28938 }
28939
28940 uint64_t SizeInBits = VT.getScalarSizeInBits();
28941 APInt PreferredZero = APInt::getZero(SizeInBits);
28942 APInt OppositeZero = PreferredZero;
28943 EVT IVT = VT.changeTypeToInteger();
28944 X86ISD::NodeType MinMaxOp;
28945 if (IsMaxOp) {
28946 MinMaxOp = X86ISD::FMAX;
28947 OppositeZero.setSignBit();
28948 } else {
28949 PreferredZero.setSignBit();
28950 MinMaxOp = X86ISD::FMIN;
28951 }
28952 EVT SetCCType =
28953 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28954
28955 // The tables below show the expected result of Max in cases of NaN and
28956 // signed zeros.
28957 //
28958 // Y Y
28959 // Num xNaN +0 -0
28960 // --------------- ---------------
28961 // Num | Max | Y | +0 | +0 | +0 |
28962 // X --------------- X ---------------
28963 // xNaN | X | X/Y | -0 | +0 | -0 |
28964 // --------------- ---------------
28965 //
28966 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
28967 // reordering.
28968 //
28969 // We check if any of operands is NaN and return NaN. Then we check if any of
28970 // operands is zero or negative zero (for fmaximum and fminimum respectively)
28971 // to ensure the correct zero is returned.
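  // For example (illustrative): fmaximum(-0.0, +0.0) must return +0.0. MAXSD
  // and MINSD return their second source operand when the inputs compare
  // equal (or are unordered), so the reordering below aims to place the
  // preferred zero in the second operand.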
28972 auto MatchesZero = [](SDValue Op, APInt Zero) {
28973     Op = peekThroughBitcasts(Op);
28974     if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
28975 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28976 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
28977 return CstOp->getAPIntValue() == Zero;
28978 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28979 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28980 for (const SDValue &OpVal : Op->op_values()) {
28981 if (OpVal.isUndef())
28982 continue;
28983 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
28984 if (!CstOp)
28985 return false;
28986 if (!CstOp->getValueAPF().isZero())
28987 continue;
28988 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28989 return false;
28990 }
28991 return true;
28992 }
28993 return false;
28994 };
28995
28996 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
28997 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
28998 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
28999 Op->getFlags().hasNoSignedZeros() ||
29000 DAG.isKnownNeverZeroFloat(X) ||
29001                           DAG.isKnownNeverZeroFloat(Y);
29002 SDValue NewX, NewY;
29003 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29004 MatchesZero(X, OppositeZero)) {
29005 // Operands are already in right order or order does not matter.
29006 NewX = X;
29007 NewY = Y;
29008 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29009 NewX = Y;
29010 NewY = X;
29011 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29012 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29013 if (IsXNeverNaN)
29014 std::swap(X, Y);
29015    // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29016    // to an xmm register.
29017 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29018    SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29019 // Bits of classes:
29020 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29021 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
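    // So 0b011 (FMAX) tests whether X is QNaN or +0.0 and 0b101 (FMIN) tests
    // whether X is QNaN or -0.0; if so, the operands are swapped below so that
    // value ends up in the second operand.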
29022 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29023 DL, MVT::i32);
29024 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29025 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29026 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29027 DAG.getVectorIdxConstant(0, DL));
29028 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29029 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29030 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29031 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29032 } else {
29033 SDValue IsXSigned;
29034 if (Subtarget.is64Bit() || VT != MVT::f64) {
29035 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29036 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29037 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29038 } else {
29039 assert(VT == MVT::f64);
29040 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29041 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29042 DAG.getVectorIdxConstant(0, DL));
29043 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29044 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29045 DAG.getVectorIdxConstant(1, DL));
29046 Hi = DAG.getBitcast(MVT::i32, Hi);
29047 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29048 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29049 *DAG.getContext(), MVT::i32);
29050 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29051 }
29052 if (MinMaxOp == X86ISD::FMAX) {
29053 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29054 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29055 } else {
29056 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29057 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29058 }
29059 }
29060
29061 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29062 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29063
29064  // If we did not reorder the operands for signed-zero handling, we still need
29065  // to handle NaN, and the second operand is known not to be NaN, then move it
29066  // into the first operand so no NaN post-processing is needed after max/min.
29067 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
29068 std::swap(NewX, NewY);
29069
29070 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29071
29072 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
29073 return MinMax;
29074
29075 SDValue IsNaN =
29076 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29077
29078 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29079}
29080
29081static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29082 SelectionDAG &DAG) {
29083 MVT VT = Op.getSimpleValueType();
29084 SDLoc dl(Op);
29085
29086 // For AVX1 cases, split to use legal ops.
29087 if (VT.is256BitVector() && !Subtarget.hasInt256())
29088 return splitVectorIntBinary(Op, DAG, dl);
29089
29090 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29091 return splitVectorIntBinary(Op, DAG, dl);
29092
29093 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29094 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29095
29096 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29097 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29098 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29099
29100 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29101 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
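    // Both differences are materialized; the CMOV reads the flags of the second
    // subtraction (rhs - lhs) and selects lhs - rhs when they indicate
    // rhs < lhs (unsigned below / signed less-than), so the non-negative
    // difference is always chosen.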
29102 if (VT.bitsGE(MVT::i32)) {
29103 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29104 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29105 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29106 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29107 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29108 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29109 DAG.getTargetConstant(CC, dl, MVT::i8),
29110 Diff1.getValue(1));
29111 }
29112
29113 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29114 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29115 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29116 MVT WideVT = MVT::getIntegerVT(WideBits);
29117 if (TLI.isTypeLegal(WideVT)) {
29118 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29119 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29120 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29121 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29122 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29123 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29124 DAG.getTargetConstant(CC, dl, MVT::i8),
29125 Diff1.getValue(1));
29126 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29127 }
29128 }
29129
29130 // Default to expand.
29131 return SDValue();
29132}
29133
29134static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29135 SelectionDAG &DAG) {
29136 SDLoc dl(Op);
29137 MVT VT = Op.getSimpleValueType();
29138
29139 // Decompose 256-bit ops into 128-bit ops.
29140 if (VT.is256BitVector() && !Subtarget.hasInt256())
29141 return splitVectorIntBinary(Op, DAG, dl);
29142
29143 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29144 return splitVectorIntBinary(Op, DAG, dl);
29145
29146 SDValue A = Op.getOperand(0);
29147 SDValue B = Op.getOperand(1);
29148
29149 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29150 // vector pairs, multiply and truncate.
29151 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29152 unsigned NumElts = VT.getVectorNumElements();
29153 unsigned NumLanes = VT.getSizeInBits() / 128;
29154 unsigned NumEltsPerLane = NumElts / NumLanes;
29155
29156 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29157 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29158 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29159 return DAG.getNode(
29160 ISD::TRUNCATE, dl, VT,
29161 DAG.getNode(ISD::MUL, dl, ExVT,
29162 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29163 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29164 }
29165
29166 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29167
29168 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29169 // Don't do this if we only need to unpack one half.
29170 if (Subtarget.hasSSSE3()) {
29171 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29172 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29173 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29174 if (BIsBuildVector) {
29175 for (auto [Idx, Val] : enumerate(B->ops())) {
29176 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29177 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29178 else
29179 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29180 }
29181 }
29182 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29183 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29184 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29185 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29186 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29187 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29188 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29189 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29190 DAG.getTargetConstant(8, dl, MVT::i8));
29191 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29192 }
29193 }
29194
29195    // Extract the lo/hi parts and any-extend to i16.
29196 // We're going to mask off the low byte of each result element of the
29197 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29198 // element.
29199 SDValue Undef = DAG.getUNDEF(VT);
29200 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29201 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29202
29203 SDValue BLo, BHi;
29204 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29205 // If the RHS is a constant, manually unpackl/unpackh.
29206 SmallVector<SDValue, 16> LoOps, HiOps;
29207 for (unsigned i = 0; i != NumElts; i += 16) {
29208 for (unsigned j = 0; j != 8; ++j) {
29209 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29210 MVT::i16));
29211 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29212 MVT::i16));
29213 }
29214 }
29215
29216 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29217 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29218 } else {
29219 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29220 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29221 }
29222
29223    // Multiply, mask the lower 8 bits of the lo/hi results and pack.
29224 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29225 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29226 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29227 }
29228
29229 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29230 if (VT == MVT::v4i32) {
29231 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29232 "Should not custom lower when pmulld is available!");
29233
29234 // Extract the odd parts.
29235 static const int UnpackMask[] = {1, 1, 3, 3};
29236 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29237 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29238
29239 // Multiply the even parts.
29240 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29241 DAG.getBitcast(MVT::v2i64, A),
29242 DAG.getBitcast(MVT::v2i64, B));
29243 // Now multiply odd parts.
29244 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29245 DAG.getBitcast(MVT::v2i64, Aodds),
29246 DAG.getBitcast(MVT::v2i64, Bodds));
29247
29248 Evens = DAG.getBitcast(VT, Evens);
29249 Odds = DAG.getBitcast(VT, Odds);
29250
29251 // Merge the two vectors back together with a shuffle. This expands into 2
29252 // shuffles.
29253 static const int ShufMask[] = { 0, 4, 2, 6 };
29254 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29255 }
29256
29257 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29258 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29259 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29260
29261 // Ahi = psrlqi(a, 32);
29262 // Bhi = psrlqi(b, 32);
29263 //
29264 // AloBlo = pmuludq(a, b);
29265 // AloBhi = pmuludq(a, Bhi);
29266 // AhiBlo = pmuludq(Ahi, b);
29267 //
29268 // Hi = psllqi(AloBhi + AhiBlo, 32);
29269 // return AloBlo + Hi;
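  // This follows from splitting each 64-bit operand into 32-bit halves:
  //   a * b = (aLo + (aHi << 32)) * (bLo + (bHi << 32))
  //         = aLo*bLo + ((aLo*bHi + aHi*bLo) << 32)   (mod 2^64)
  // The aHi*bHi term is shifted out entirely, so three PMULUDQs suffice.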
29270 KnownBits AKnown = DAG.computeKnownBits(A);
29271 KnownBits BKnown = DAG.computeKnownBits(B);
29272
29273 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29274 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29275 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29276
29277 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29278 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29279 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29280
29281 SDValue Zero = DAG.getConstant(0, dl, VT);
29282
29283 // Only multiply lo/hi halves that aren't known to be zero.
29284 SDValue AloBlo = Zero;
29285 if (!ALoIsZero && !BLoIsZero)
29286 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29287
29288 SDValue AloBhi = Zero;
29289 if (!ALoIsZero && !BHiIsZero) {
29290 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29291 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29292 }
29293
29294 SDValue AhiBlo = Zero;
29295 if (!AHiIsZero && !BLoIsZero) {
29296 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29297 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29298 }
29299
29300 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29301 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29302
29303 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29304}
29305
29306static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29307 MVT VT, bool IsSigned,
29308 const X86Subtarget &Subtarget,
29309 SelectionDAG &DAG,
29310 SDValue *Low = nullptr) {
29311 unsigned NumElts = VT.getVectorNumElements();
29312
29313 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29314 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29315 // lane results back together.
29316
29317 // We'll take different approaches for signed and unsigned.
29318  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29319 // and use pmullw to calculate the full 16-bit product.
29320  // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29321 // shift them left into the upper byte of each word. This allows us to use
29322 // pmulhw to calculate the full 16-bit product. This trick means we don't
29323 // need to sign extend the bytes to use pmullw.
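  // The signed trick relies on (a << 8) * (b << 8) == (a * b) << 16, so pmulhw
  // of the byte-in-high-half words yields exactly the signed 16-bit product
  // a * b.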
29324
29325 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29326 SDValue Zero = DAG.getConstant(0, dl, VT);
29327
29328 SDValue ALo, AHi;
29329 if (IsSigned) {
29330 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29331 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29332 } else {
29333 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29334 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29335 }
29336
29337 SDValue BLo, BHi;
29338 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29339 // If the RHS is a constant, manually unpackl/unpackh and extend.
29340 SmallVector<SDValue, 16> LoOps, HiOps;
29341 for (unsigned i = 0; i != NumElts; i += 16) {
29342 for (unsigned j = 0; j != 8; ++j) {
29343 SDValue LoOp = B.getOperand(i + j);
29344 SDValue HiOp = B.getOperand(i + j + 8);
29345
29346 if (IsSigned) {
29347 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29348 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29349 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29350 DAG.getConstant(8, dl, MVT::i16));
29351 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29352 DAG.getConstant(8, dl, MVT::i16));
29353 } else {
29354 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29355 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29356 }
29357
29358 LoOps.push_back(LoOp);
29359 HiOps.push_back(HiOp);
29360 }
29361 }
29362
29363 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29364 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29365 } else if (IsSigned) {
29366 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29367 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29368 } else {
29369 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29370 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29371 }
29372
29373  // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
29374 // pack back to vXi8.
29375 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29376 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29377 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29378
29379 if (Low)
29380 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29381
29382 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29383}
29384
29385static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29386 SelectionDAG &DAG) {
29387 SDLoc dl(Op);
29388 MVT VT = Op.getSimpleValueType();
29389 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29390 unsigned NumElts = VT.getVectorNumElements();
29391 SDValue A = Op.getOperand(0);
29392 SDValue B = Op.getOperand(1);
29393
29394 // Decompose 256-bit ops into 128-bit ops.
29395 if (VT.is256BitVector() && !Subtarget.hasInt256())
29396 return splitVectorIntBinary(Op, DAG, dl);
29397
29398 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29399 return splitVectorIntBinary(Op, DAG, dl);
29400
29401 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29402 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29403 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29404 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29405
29406 // PMULxD operations multiply each even value (starting at 0) of LHS with
29407    // the related value of RHS and produce a widened result.
29408 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29409 // => <2 x i64> <ae|cg>
29410 //
29411    // In other words, to have all the results, we need to perform two PMULxD:
29412 // 1. one with the even values.
29413 // 2. one with the odd values.
29414    // To achieve #2, we need to place the odd values at an even position.
29415 //
29416 // Place the odd value at an even position (basically, shift all values 1
29417 // step to the left):
29418 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29419 9, -1, 11, -1, 13, -1, 15, -1};
29420 // <a|b|c|d> => <b|undef|d|undef>
29421 SDValue Odd0 =
29422 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29423 // <e|f|g|h> => <f|undef|h|undef>
29424 SDValue Odd1 =
29425 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29426
29427 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29428 // ints.
29429 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29430 unsigned Opcode =
29431 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29432 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29433 // => <2 x i64> <ae|cg>
29434 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29435 DAG.getBitcast(MulVT, A),
29436 DAG.getBitcast(MulVT, B)));
29437 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29438 // => <2 x i64> <bf|dh>
29439 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29440 DAG.getBitcast(MulVT, Odd0),
29441 DAG.getBitcast(MulVT, Odd1)));
29442
29443 // Shuffle it back into the right order.
29444 SmallVector<int, 16> ShufMask(NumElts);
29445 for (int i = 0; i != (int)NumElts; ++i)
29446 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29447
29448 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29449
29450    // If we have a signed multiply but no PMULDQ, fix up the result of an
29451 // unsigned multiply.
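    // This uses mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0);
    // the SETGT masks below build the two correction terms.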
29452 if (IsSigned && !Subtarget.hasSSE41()) {
29453 SDValue Zero = DAG.getConstant(0, dl, VT);
29454 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29455 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29456 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29457 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29458
29459 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29460 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29461 }
29462
29463 return Res;
29464 }
29465
29466 // Only i8 vectors should need custom lowering after this.
29467 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29468 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29469 "Unsupported vector type");
29470
29471 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29472 // logical shift down the upper half and pack back to i8.
29473
29474 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29475 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29476
29477 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29478 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29479 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29480 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29481 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29482 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29483 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29484 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29485 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29486 }
29487
29488 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29489}
29490
29491// Custom lowering for SMULO/UMULO.
29492static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29493 SelectionDAG &DAG) {
29494 MVT VT = Op.getSimpleValueType();
29495
29496 // Scalars defer to LowerXALUO.
29497 if (!VT.isVector())
29498 return LowerXALUO(Op, DAG);
29499
29500 SDLoc dl(Op);
29501 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29502 SDValue A = Op.getOperand(0);
29503 SDValue B = Op.getOperand(1);
29504 EVT OvfVT = Op->getValueType(1);
29505
29506 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29507 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29508 // Extract the LHS Lo/Hi vectors
29509 SDValue LHSLo, LHSHi;
29510 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29511
29512 // Extract the RHS Lo/Hi vectors
29513 SDValue RHSLo, RHSHi;
29514 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29515
29516 EVT LoOvfVT, HiOvfVT;
29517 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29518 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29519 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29520
29521 // Issue the split operations.
29522 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29523 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29524
29525 // Join the separate data results and the overflow results.
29526 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29527 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29528 Hi.getValue(1));
29529
29530 return DAG.getMergeValues({Res, Ovf}, dl);
29531 }
29532
29533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29534 EVT SetccVT =
29535 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29536
29537 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29538 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29539 unsigned NumElts = VT.getVectorNumElements();
29540 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29541 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29542 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29543 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29544 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29545
29546 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29547
29548 SDValue Ovf;
29549 if (IsSigned) {
29550 SDValue High, LowSign;
29551 if (OvfVT.getVectorElementType() == MVT::i1 &&
29552 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29553        // Rather than truncating, try to do the compare on vXi16 or vXi32.
29554 // Shift the high down filling with sign bits.
29555 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29556 // Fill all 16 bits with the sign bit from the low.
29557 LowSign =
29558 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29559 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29560 15, DAG);
29561 SetccVT = OvfVT;
29562 if (!Subtarget.hasBWI()) {
29563 // We can't do a vXi16 compare so sign extend to v16i32.
29564 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29565 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29566 }
29567 } else {
29568 // Otherwise do the compare at vXi8.
29569 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29570 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29571 LowSign =
29572 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29573 }
29574
29575 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29576 } else {
29577 SDValue High =
29578 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29579 if (OvfVT.getVectorElementType() == MVT::i1 &&
29580 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29581        // Rather than truncating, try to do the compare on vXi16 or vXi32.
29582 SetccVT = OvfVT;
29583 if (!Subtarget.hasBWI()) {
29584 // We can't do a vXi16 compare so sign extend to v16i32.
29585 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
29586 }
29587 } else {
29588 // Otherwise do the compare at vXi8.
29589 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29590 }
29591
29592 Ovf =
29593 DAG.getSetCC(dl, SetccVT, High,
29594 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
29595 }
29596
29597 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29598
29599 return DAG.getMergeValues({Low, Ovf}, dl);
29600 }
29601
29602 SDValue Low;
29603 SDValue High =
29604 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
29605
29606 SDValue Ovf;
29607 if (IsSigned) {
29608 // SMULO overflows if the high bits don't match the sign of the low.
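    // E.g. i8 smulo(64, 2): the low byte is 0x80 (-128) whose sign-splat is -1,
    // but the high byte of the full product is 0, so the SETNE reports overflow.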
29609 SDValue LowSign =
29610 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29611 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29612 } else {
29613 // UMULO overflows if the high bits are non-zero.
29614 Ovf =
29615 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
29616 }
29617
29618 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29619
29620 return DAG.getMergeValues({Low, Ovf}, dl);
29621}
29622
29623SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
29624 assert(Subtarget.isTargetWin64() && "Unexpected target");
29625 EVT VT = Op.getValueType();
29626 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29627 "Unexpected return type for lowering");
29628
29629 if (isa<ConstantSDNode>(Op->getOperand(1))) {
29630    SmallVector<SDValue> Result;
29631 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
29632 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
29633 }
29634
29635 RTLIB::Libcall LC;
29636 bool isSigned;
29637 switch (Op->getOpcode()) {
29638 // clang-format off
29639 default: llvm_unreachable("Unexpected request for libcall!");
29640 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
29641 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
29642 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
29643 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
29644 // clang-format on
29645 }
29646
29647 SDLoc dl(Op);
29648 SDValue InChain = DAG.getEntryNode();
29649
29650  TargetLowering::ArgListTy Args;
29651  TargetLowering::ArgListEntry Entry;
29652 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
29653 EVT ArgVT = Op->getOperand(i).getValueType();
29654 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29655 "Unexpected argument type for lowering");
29656 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29657 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29658 MachinePointerInfo MPI =
29659        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29660 Entry.Node = StackPtr;
29661 InChain =
29662 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29663 Entry.Ty = PointerType::get(*DAG.getContext(), 0);
29664 Entry.IsSExt = false;
29665 Entry.IsZExt = false;
29666 Args.push_back(Entry);
29667 }
29668
29669  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
29670                                         getPointerTy(DAG.getDataLayout()));
29671
29672  TargetLowering::CallLoweringInfo CLI(DAG);
29673 CLI.setDebugLoc(dl)
29674 .setChain(InChain)
29675 .setLibCallee(
29676          getLibcallCallingConv(LC),
29677 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
29678 std::move(Args))
29679 .setInRegister()
29680 .setSExtResult(isSigned)
29681 .setZExtResult(!isSigned);
29682
29683 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
29684 return DAG.getBitcast(VT, CallInfo.first);
29685}
29686
29687SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
29688 SelectionDAG &DAG,
29689 SDValue &Chain) const {
29690 assert(Subtarget.isTargetWin64() && "Unexpected target");
29691 EVT VT = Op.getValueType();
29692 bool IsStrict = Op->isStrictFPOpcode();
29693
29694 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29695 EVT ArgVT = Arg.getValueType();
29696
29697 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29698 "Unexpected return type for lowering");
29699
29700 RTLIB::Libcall LC;
29701 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29702 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29703 LC = RTLIB::getFPTOSINT(ArgVT, VT);
29704 else
29705 LC = RTLIB::getFPTOUINT(ArgVT, VT);
29706 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29707
29708 SDLoc dl(Op);
29709 MakeLibCallOptions CallOptions;
29710 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29711
29713 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
29714 // expected VT (i128).
29715 std::tie(Result, Chain) =
29716 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
29717 Result = DAG.getBitcast(VT, Result);
29718 return Result;
29719}
29720
29721SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
29722 SelectionDAG &DAG) const {
29723 assert(Subtarget.isTargetWin64() && "Unexpected target");
29724 EVT VT = Op.getValueType();
29725 bool IsStrict = Op->isStrictFPOpcode();
29726
29727 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29728 EVT ArgVT = Arg.getValueType();
29729
29730 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29731 "Unexpected argument type for lowering");
29732
29733 RTLIB::Libcall LC;
29734 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29735 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29736 LC = RTLIB::getSINTTOFP(ArgVT, VT);
29737 else
29738 LC = RTLIB::getUINTTOFP(ArgVT, VT);
29739 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29740
29741 SDLoc dl(Op);
29742 MakeLibCallOptions CallOptions;
29743 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29744
29745 // Pass the i128 argument as an indirect argument on the stack.
29746 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29747 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29748 MachinePointerInfo MPI =
29749      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29750 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29751
29752  SDValue Result;
29753 std::tie(Result, Chain) =
29754 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29755 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29756}
29757
29758// Return true if the required (according to Opcode) shift-imm form is natively
29759// supported by the Subtarget
29760static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
29761 unsigned Opcode) {
29762 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29763 "Unexpected shift opcode");
29764
29765 if (!VT.isSimple())
29766 return false;
29767
29768 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29769 return false;
29770
29771 if (VT.getScalarSizeInBits() < 16)
29772 return false;
29773
29774 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29775 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29776 return true;
29777
29778 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29779 (VT.is256BitVector() && Subtarget.hasInt256());
29780
29781 bool AShift = LShift && (Subtarget.hasAVX512() ||
29782 (VT != MVT::v2i64 && VT != MVT::v4i64));
29783 return (Opcode == ISD::SRA) ? AShift : LShift;
29784}
29785
29786// The shift amount is a variable, but it is the same for all vector lanes.
29787// These instructions are defined together with shift-immediate.
29788static
29789bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
29790 unsigned Opcode) {
29791 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29792}
29793
29794// Return true if the required (according to Opcode) variable-shift form is
29795// natively supported by the Subtarget
29796static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
29797 unsigned Opcode) {
29798 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29799 "Unexpected shift opcode");
29800
29801 if (!VT.isSimple())
29802 return false;
29803
29804 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29805 return false;
29806
29807 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29808 return false;
29809
29810 // vXi16 supported only on AVX-512, BWI
29811 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29812 return false;
29813
29814 if (Subtarget.hasAVX512() &&
29815 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29816 return true;
29817
29818 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29819 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29820 return (Opcode == ISD::SRA) ? AShift : LShift;
29821}
29822
29823static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29824 const X86Subtarget &Subtarget) {
29825 MVT VT = Op.getSimpleValueType();
29826 SDLoc dl(Op);
29827 SDValue R = Op.getOperand(0);
29828 SDValue Amt = Op.getOperand(1);
29829 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29830 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29831
29832 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29833 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29834 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29835 SDValue Ex = DAG.getBitcast(ExVT, R);
29836
29837 // ashr(R, 63) === cmp_slt(R, 0)
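    // A shift by 63 leaves only the replicated sign bit: all-ones for negative
    // values and zero otherwise, which is exactly what PCMPGT(0, R) produces.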
29838 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29839 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29840 "Unsupported PCMPGT op");
29841 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29842 }
29843
29844 if (ShiftAmt >= 32) {
29845 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29846 SDValue Upper =
29847 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29848      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29849 ShiftAmt - 32, DAG);
29850 if (VT == MVT::v2i64)
29851 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29852 if (VT == MVT::v4i64)
29853 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29854 {9, 1, 11, 3, 13, 5, 15, 7});
29855 } else {
29856 // SRA upper i32, SRL whole i64 and select lower i32.
29857      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29858 ShiftAmt, DAG);
29859 SDValue Lower =
29860 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29861 Lower = DAG.getBitcast(ExVT, Lower);
29862 if (VT == MVT::v2i64)
29863 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29864 if (VT == MVT::v4i64)
29865 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29866 {8, 1, 10, 3, 12, 5, 14, 7});
29867 }
29868 return DAG.getBitcast(VT, Ex);
29869 };
29870
29871 // Optimize shl/srl/sra with constant shift amount.
29872 APInt APIntShiftAmt;
29873 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29874 return SDValue();
29875
29876 // If the shift amount is out of range, return undef.
29877 if (APIntShiftAmt.uge(EltSizeInBits))
29878 return DAG.getUNDEF(VT);
29879
29880 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29881
29882 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
29883 // Hardware support for vector shifts is sparse which makes us scalarize the
29884 // vector operations in many cases. Also, on sandybridge ADD is faster than
29885 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29886 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29887 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29888 // must be 0). (add undef, undef) however can be any value. To make this
29889 // safe, we must freeze R to ensure that register allocation uses the same
29890 // register for an undefined value. This ensures that the result will
29891 // still be even and preserves the original semantics.
29892 R = DAG.getFreeze(R);
29893 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29894 }
29895
29896 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29897 }
29898
29899 // i64 SRA needs to be performed as partial shifts.
29900 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29901 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29902 Op.getOpcode() == ISD::SRA)
29903 return ArithmeticShiftRight64(ShiftAmt);
29904
29905  // If we're logical shifting an all-signbits value then we can just perform it as
29906 // a mask.
29907 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
29908 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
29909 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
29910 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
29911 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
29912 }
29913
29914 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29915 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29916 unsigned NumElts = VT.getVectorNumElements();
29917 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29918
29919 // Simple i8 add case
29920 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29921 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29922 // must be 0). (add undef, undef) however can be any value. To make this
29923 // safe, we must freeze R to ensure that register allocation uses the same
29924 // register for an undefined value. This ensures that the result will
29925 // still be even and preserves the original semantics.
29926 R = DAG.getFreeze(R);
29927 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29928 }
29929
29930 // ashr(R, 7) === cmp_slt(R, 0)
29931 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29932 SDValue Zeros = DAG.getConstant(0, dl, VT);
29933 if (VT.is512BitVector()) {
29934 assert(VT == MVT::v64i8 && "Unexpected element type!");
29935 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29936 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29937 }
29938 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29939 }
29940
29941 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29942 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29943 return SDValue();
29944
29945 if (Subtarget.hasGFNI()) {
29946 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
29947 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
29948 DAG.getTargetConstant(0, dl, MVT::i8));
29949 }
29950
29951 if (Op.getOpcode() == ISD::SHL) {
29952 // Make a large shift.
29953 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29954 ShiftAmt, DAG);
29955 SHL = DAG.getBitcast(VT, SHL);
29956 // Zero out the rightmost bits.
29957 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29958 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29959 }
29960 if (Op.getOpcode() == ISD::SRL) {
29961 // Make a large shift.
29962 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29963 ShiftAmt, DAG);
29964 SRL = DAG.getBitcast(VT, SRL);
29965 // Zero out the leftmost bits.
29966 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29967 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29968 }
29969 if (Op.getOpcode() == ISD::SRA) {
29970 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
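      // E.g. for i8 0xA0 shifted right by 5: lshr gives 0x05 and Mask is 0x04;
      // the xor clears the shifted-in sign bit and the sub borrows, producing
      // 0xFD == ashr(0xA0, 5).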
29971 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29972
29973 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29974 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29975 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29976 return Res;
29977 }
29978 llvm_unreachable("Unknown shift opcode.");
29979 }
29980
29981 return SDValue();
29982}
29983
29984static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29985 const X86Subtarget &Subtarget) {
29986 MVT VT = Op.getSimpleValueType();
29987 SDLoc dl(Op);
29988 SDValue R = Op.getOperand(0);
29989 SDValue Amt = Op.getOperand(1);
29990 unsigned Opcode = Op.getOpcode();
29991 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29992
29993 int BaseShAmtIdx = -1;
29994 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29995 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29996 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29997 Subtarget, DAG);
29998
29999 // vXi8 shifts - shift as v8i16 + mask result.
30000 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30001 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30002 VT == MVT::v64i8) &&
30003 !Subtarget.hasXOP()) {
30004 unsigned NumElts = VT.getVectorNumElements();
30005 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30006 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30007 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30008 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30009
30010 // Create the mask using vXi16 shifts. For shift-rights we need to move
30011 // the upper byte down before splatting the vXi8 mask.
30012 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30013 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30014 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30015 if (Opcode != ISD::SHL)
30016 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30017 8, DAG);
30018 BitMask = DAG.getBitcast(VT, BitMask);
30019 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30020 SmallVector<int, 64>(NumElts, 0));
30021
30022 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30023 DAG.getBitcast(ExtVT, R), BaseShAmt,
30024 BaseShAmtIdx, Subtarget, DAG);
30025 Res = DAG.getBitcast(VT, Res);
30026 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30027
30028 if (Opcode == ISD::SRA) {
30029 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30030 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30031 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30032 SignMask =
30033 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30034 BaseShAmtIdx, Subtarget, DAG);
30035 SignMask = DAG.getBitcast(VT, SignMask);
30036 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30037 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30038 }
30039 return Res;
30040 }
30041 }
30042 }
30043
30044 return SDValue();
30045}
30046
30047// Convert a shift/rotate left amount to a multiplication scale factor.
30048static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30049 const X86Subtarget &Subtarget,
30050 SelectionDAG &DAG) {
30051 MVT VT = Amt.getSimpleValueType();
30052 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30053 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30054 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30055 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30056 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30057 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30058 return SDValue();
30059
30060 MVT SVT = VT.getVectorElementType();
30061 unsigned SVTBits = SVT.getSizeInBits();
30062 unsigned NumElems = VT.getVectorNumElements();
30063
30064 APInt UndefElts;
30065 SmallVector<APInt> EltBits;
30066 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30067 APInt One(SVTBits, 1);
30068 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30069 for (unsigned I = 0; I != NumElems; ++I) {
30070 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30071 continue;
30072 uint64_t ShAmt = EltBits[I].getZExtValue();
30073 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30074 }
30075 return DAG.getBuildVector(VT, dl, Elts);
30076 }
30077
30078 // If the target doesn't support variable shifts, use either FP conversion
30079 // or integer multiplication to avoid shifting each element individually.
30080 if (VT == MVT::v4i32) {
30081 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30082 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30083 DAG.getConstant(0x3f800000U, dl, VT));
30084 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30085 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30086 }
30087
30088 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30089 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30090 SDValue Z = DAG.getConstant(0, dl, VT);
30091 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30092 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30093 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30094 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30095 if (Subtarget.hasSSE41())
30096 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30097 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30098 }
30099
30100 return SDValue();
30101}
30102
30103static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30104 SelectionDAG &DAG) {
30105 MVT VT = Op.getSimpleValueType();
30106 SDLoc dl(Op);
30107 SDValue R = Op.getOperand(0);
30108 SDValue Amt = Op.getOperand(1);
30109 unsigned NumElts = VT.getVectorNumElements();
30110 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30111 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30112
30113 unsigned Opc = Op.getOpcode();
30114 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30115 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30116
30117 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30118 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30119
30120 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30121 return V;
30122
30123 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30124 return V;
30125
30126 if (supportedVectorVarShift(VT, Subtarget, Opc))
30127 return Op;
30128
30129 // i64 vector arithmetic shift can be emulated with the transform:
30130 // M = lshr(SIGN_MASK, Amt)
30131 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30132 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30133 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30134 Opc == ISD::SRA) {
30135 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30136 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30137 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30138 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30139 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30140 return R;
30141 }
30142
30143 // XOP has 128-bit variable logical/arithmetic shifts.
30144 // +ve/-ve Amt = shift left/right.
30145 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30146 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30147 if (Opc == ISD::SRL || Opc == ISD::SRA)
30148 Amt = DAG.getNegative(Amt, dl, VT);
30149 if (Opc == ISD::SHL || Opc == ISD::SRL)
30150 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30151 if (Opc == ISD::SRA)
30152 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30153 }
30154
30155 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30156 // shifts per-lane and then shuffle the partial results back together.
30157 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30158 // Splat the shift amounts so the scalar shifts above will catch it.
30159 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30160 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30161 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30162 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30163 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30164 }
30165
30166 // Build a map of inrange constant amounts with element mask where they occur.
30167  SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30168 if (ConstantAmt) {
30169 for (unsigned I = 0; I != NumElts; ++I) {
30170 SDValue A = Amt.getOperand(I);
30171 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30172 continue;
30173 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30174 if (UniqueCstAmt.count(CstAmt)) {
30175 UniqueCstAmt[CstAmt].setBit(I);
30176 continue;
30177 }
30178 UniqueCstAmt[CstAmt] = APInt::getOneBitSet(NumElts, I);
30179 }
30180 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30181 }
30182
30183 // If possible, lower this shift as a sequence of two shifts by
30184 // constant plus a BLENDing shuffle instead of scalarizing it.
30185 // Example:
30186 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30187 //
30188 // Could be rewritten as:
30189 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30190 //
30191 // The advantage is that the two shifts from the example would be
30192 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30193 if (UniqueCstAmt.size() == 2 &&
30194 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30195 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30196 unsigned AmtA = UniqueCstAmt.begin()->first;
30197 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30198 const APInt &MaskA = UniqueCstAmt.begin()->second;
30199 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30200 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30201 for (unsigned I = 0; I != NumElts; ++I) {
30202 if (MaskA[I])
30203 ShuffleMask[I] = I;
30204 if (MaskB[I])
30205 ShuffleMask[I] = I + NumElts;
30206 }
30207
30208 // Only perform this blend if we can perform it without loading a mask.
30209 if ((VT != MVT::v16i16 ||
30210 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30211 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30212 canWidenShuffleElements(ShuffleMask))) {
30213 SDValue Shift1 =
30214 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30215 SDValue Shift2 =
30216 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30217 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30218 }
30219 }
30220
30221 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30222 // using vYiM vector operations where X*N == Y*M and M > N.
30223 if (ConstantAmt &&
30224 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30225 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30226 !Subtarget.hasXOP()) {
30227 MVT NarrowScalarVT = VT.getScalarType();
30228    // We can do this extra fast if each pair of narrow elements is shifted by
30229    // the same amount, SWAR style: use a single wider shift to move the valid
30230    // bits into position, then mask out any bits which crossed from one
30231    // element to the other.
30232 // This optimized lowering is only valid if the elements in a pair can
30233 // be treated identically.
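    // E.g. two adjacent v16i8 lanes shifted left by the same amount can be
    // shifted as one v8i16 lane; the mask built later clears any bits of the
    // low byte that crossed into the high byte.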
30234 SmallVector<SDValue, 32> AmtWideElts(Amt->op_begin(), Amt->op_end());
30235 SmallVector<SDValue, 32> TmpAmtWideElts;
30236 int WideEltSizeInBits = EltSizeInBits;
30237 while (WideEltSizeInBits < 32) {
30238 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30239 // unprofitable.
30240 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30241 break;
30242 }
30243 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30244 bool SameShifts = true;
30245 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30246 unsigned DstI = SrcI / 2;
30247 // Both elements are undef? Make a note and keep going.
30248 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30249 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30250 continue;
30251 }
30252 // Even element is undef? We will shift it by the same shift amount as
30253 // the odd element.
30254 if (AmtWideElts[SrcI].isUndef()) {
30255 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30256 continue;
30257 }
30258 // Odd element is undef? We will shift it by the same shift amount as
30259 // the even element.
30260 if (AmtWideElts[SrcI + 1].isUndef()) {
30261 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30262 continue;
30263 }
30264 // Both elements are equal.
30265 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30266 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30267 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30268 continue;
30269 }
30270 // One of the provisional wide elements will not have the same shift
30271 // amount. Let's bail.
30272 SameShifts = false;
30273 break;
30274 }
30275 if (!SameShifts) {
30276 break;
30277 }
30278 WideEltSizeInBits *= 2;
30279 std::swap(TmpAmtWideElts, AmtWideElts);
30280 }
30281 APInt APIntShiftAmt;
30282 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30283 bool Profitable = WidenShift;
30284 // AVX512BW brings support for vpsllvw.
30285 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30286 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30287 Profitable = false;
30288 }
30289 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30290 // fairly cheaply in other ways.
30291 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30292 Profitable = false;
30293 }
30294 // Leave it up to GFNI if we have it around.
30295 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30296 // is probably a win to use other strategies in some cases.
30297 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30298 Profitable = false;
30299 }
30300
30301 // AVX1 does not have vpand which makes our masking impractical. It does
30302 // have vandps but that is an FP instruction and crossing FP<->int typically
30303 // has some cost.
30304 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30305 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30306 Profitable = false;
30307 }
30308 unsigned WideNumElts = AmtWideElts.size();
30309 // We are only dealing with identical pairs.
30310 if (Profitable && WideNumElts != NumElts) {
30311 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30312 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30313 // Cast the operand to vXiM.
30314 SDValue RWide = DAG.getBitcast(WideVT, R);
30315 // Create our new vector of shift amounts.
30316 SDValue AmtWide = DAG.getBuildVector(
30317 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30318 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30319 // Perform the actual shift.
30320 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30321 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30322 // Now we need to construct a mask which will "drop" bits that get
30323 // shifted past the LSB/MSB. For a logical shift left, it will look
30324 // like:
30325 // FullMask = (1 << EltSizeInBits) - 1
30326 // Mask = FullMask << Amt
30327 //
30328 // This masking ensures that bits cannot migrate from one narrow lane to
30329 // another. The construction of this mask will be constant folded.
30330 // The mask for a logical right shift is nearly identical, the only
30331 // difference is that the all ones mask is shifted right instead of left.
30332 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30333 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30334 Mask = DAG.getBitcast(WideVT, Mask);
30335 // Finally, we mask the shifted vector with the SWAR mask.
30336 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30337 Masked = DAG.getBitcast(VT, Masked);
30338 if (Opc != ISD::SRA) {
30339 // Logical shifts are complete at this point.
30340 return Masked;
30341 }
30342 // At this point, we have done a *logical* shift right. We now need to
30343 // sign extend the result so that we get behavior equivalent to an
30344 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30345 // are `EltSizeInBits-AmtWide` bits wide.
30346 //
30347 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30348 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30349 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30350 // can use the following trick to accomplish this:
30351 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30352 // (Masked ^ SignBitMask) - SignBitMask
30353 //
30354 // When the sign bit is already clear, this will compute:
30355 // Masked + SignBitMask - SignBitMask
30356 //
30357 // This is equal to Masked which is what we want: the sign bit was clear
30358 // so sign extending should be a no-op.
30359 //
30360 // When the sign bit is set, this will compute:
30361 // Masked - SignBitmask - SignBitMask
30362 //
30363 // This is equal to Masked - 2*SignBitMask which will correctly sign
30364 // extend our result.
30365 SDValue SplatHighBit =
30366 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30367 // This does not induce recursion, all operands are constants.
30368 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30369 SDValue FlippedSignBit =
30370 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30371 SDValue Subtraction =
30372 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30373 return Subtraction;
30374 }
30375 }
30376
30377 // If possible, lower this packed shift into a vector multiply instead of
30378 // expanding it into a sequence of scalar shifts.
30379 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30380 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30381 Subtarget.canExtendTo512BW())))
30382 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30383 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30384
30385 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30386 // can replace with ISD::MULHU, creating a scale factor from (NumEltBits - Amt).
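// E.g. for vXi16 and Amt = 5: Scale = 1 << (16 - 5) = 2048, and
// MULHU(x, 2048) == (x * 2048) >> 16 == x >> 5. Amt = 0 would need a scale
// of 1 << 16, which doesn't fit in i16, hence the zero-amount select below.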
30387 if (Opc == ISD::SRL && ConstantAmt &&
30388 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30389 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30390 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30391 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30392 SDValue Zero = DAG.getConstant(0, dl, VT);
30393 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30394 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30395 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30396 }
30397 }
30398
30399 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30400 // can replace with ISD::MULHS, creating a scale factor from (NumEltBits - Amt).
30401 // TODO: Special case handling for shift by 0/1, really we can afford either
30402 // of these cases in pre-SSE41/XOP/AVX512 but not both.
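// E.g. for vXi16 and Amt = 5: MULHS(x, 1 << 11) == (x * 2048) >>s 16 == x >>s 5.
// Amt == 0 (scale 1 << 16 doesn't fit in i16) and Amt == 1 (scale 1 << 15 is
// negative when interpreted as an i16) are patched up with the selects below.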
30403 if (Opc == ISD::SRA && ConstantAmt &&
30404 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30405 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30406 !Subtarget.hasAVX512()) ||
30407 DAG.isKnownNeverZero(Amt))) {
30408 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30409 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30410 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30411 SDValue Amt0 =
30412 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30413 SDValue Amt1 =
30414 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30415 SDValue Sra1 =
30416 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30417 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30418 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30419 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30420 }
30421 }
30422
30423 // v4i32 Non Uniform Shifts.
30424 // If the shift amount is constant we can shift each lane using the SSE2
30425 // immediate shifts, else we need to zero-extend each lane to the lower i64
30426 // and shift using the SSE2 variable shifts.
30427 // The separate results can then be blended together.
30428 if (VT == MVT::v4i32) {
30429 SDValue Amt0, Amt1, Amt2, Amt3;
30430 if (ConstantAmt) {
30431 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30432 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30433 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30434 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30435 } else {
30436 // The SSE2 shifts use the lower i64 as the same shift amount for
30437 // all lanes and the upper i64 is ignored. On AVX we're better off
30438 // just zero-extending, but for SSE just duplicating the top 16-bits is
30439 // cheaper and has the same effect for out of range values.
30440 if (Subtarget.hasAVX()) {
30441 SDValue Z = DAG.getConstant(0, dl, VT);
30442 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30443 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30444 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30445 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30446 } else {
30447 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30448 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30449 {4, 5, 6, 7, -1, -1, -1, -1});
30450 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30451 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30452 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30453 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30454 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30455 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30456 }
30457 }
30458
30459 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30460 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30461 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30462 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30463 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30464
30465 // Merge the shifted lane results optimally with/without PBLENDW.
30466 // TODO - ideally shuffle combining would handle this.
30467 if (Subtarget.hasSSE41()) {
30468 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30469 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30470 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30471 }
30472 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30473 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30474 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30475 }
30476
30477 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30478 // look up the pre-computed shift values.
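// Each 128-bit lane of the mask holds that lane's splatted constant
// pre-shifted by 0..7 in its first 8 bytes (the upper 8 bytes are undef, as
// out of range vXi8 shift amounts give undefined results); PSHUFB then uses
// each shift-amount byte as an index into this table.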
30479 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30480 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30481 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30482 unsigned NumLanes = VT.getSizeInBits() / 128u;
30483 unsigned NumEltsPerLane = NumElts / NumLanes;
30484 SmallVector<APInt, 16> LUT;
30485 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30486 unsigned LoElt = Lane * NumEltsPerLane;
30487 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30488 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30489 if (!KnownLane.isConstant())
30490 break;
30491 const APInt &LaneSplat = KnownLane.getConstant();
30492 for (unsigned I = 0; I != 8; ++I) {
30493 if (Opc == ISD::SHL)
30494 LUT.push_back(LaneSplat.shl(I));
30495 else if (Opc == ISD::SRL)
30496 LUT.push_back(LaneSplat.lshr(I));
30497 else if (Opc == ISD::SRA)
30498 LUT.push_back(LaneSplat.ashr(I));
30499 }
30500 LUT.append(8, APInt::getZero(8));
30501 }
30502 if (LUT.size() == NumElts) {
30503 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30504 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30505 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30506 }
30507 }
30508
30509 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30510 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30511 // make the existing SSE solution better.
30512 // NOTE: We honor the preferred vector width before promoting to 512-bits.
30513 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30514 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30515 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30516 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30517 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30518 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30519 "Unexpected vector type");
30520 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30521 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30522 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30523 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30524 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30525 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30526 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30527 }
30528
30529 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30530 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30531 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30532 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30533 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30534 !Subtarget.hasXOP()) {
30535 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30536 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30537
30538 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30539 // isn't legal).
30540 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30541 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30542 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30543 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30545 "Constant build vector expected");
30546
30547 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30548 bool IsSigned = Opc == ISD::SRA;
30549 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30550 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30551 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30552 return DAG.getZExtOrTrunc(R, dl, VT);
30553 }
30554
30555 SmallVector<SDValue, 16> LoAmt, HiAmt;
30556 for (unsigned i = 0; i != NumElts; i += 16) {
30557 for (int j = 0; j != 8; ++j) {
30558 LoAmt.push_back(Amt.getOperand(i + j));
30559 HiAmt.push_back(Amt.getOperand(i + j + 8));
30560 }
30561 }
30562
30563 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30564 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30565
30566 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30567 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30568 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30569 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30570 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30571 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30572 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30573 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30574 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30575 }
30576
30577 if (VT == MVT::v16i8 ||
30578 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30579 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30580 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30581
30582 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30583 if (VT.is512BitVector()) {
30584 // On AVX512BW targets we make use of the fact that VSELECT lowers
30585 // to a masked blend which selects bytes based just on the sign bit
30586 // extracted to a mask.
30587 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30588 V0 = DAG.getBitcast(VT, V0);
30589 V1 = DAG.getBitcast(VT, V1);
30590 Sel = DAG.getBitcast(VT, Sel);
30591 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
30592 ISD::SETGT);
30593 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
30594 } else if (Subtarget.hasSSE41()) {
30595 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30596 // on the sign bit.
30597 V0 = DAG.getBitcast(VT, V0);
30598 V1 = DAG.getBitcast(VT, V1);
30599 Sel = DAG.getBitcast(VT, Sel);
30600 return DAG.getBitcast(SelVT,
30601 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
30602 }
30603 // On pre-SSE41 targets we test for the sign bit by comparing to
30604 // zero - a negative value will set all bits of the lanes to true
30605 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30606 SDValue Z = DAG.getConstant(0, dl, SelVT);
30607 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
30608 return DAG.getSelect(dl, SelVT, C, V0, V1);
30609 };
30610
30611 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30612 // We can safely do this using i16 shifts as we're only interested in
30613 // the 3 lower bits of each byte.
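// After the shift each byte's sign bit holds its own amount bit 2; the
// 'a += a' steps below move bit 1 and then bit 0 into the sign bit, so the
// selects conditionally apply shifts of 4, 2 and finally 1.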
30614 Amt = DAG.getBitcast(ExtVT, Amt);
30615 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
30616 Amt = DAG.getBitcast(VT, Amt);
30617
30618 if (Opc == ISD::SHL || Opc == ISD::SRL) {
30619 // r = VSELECT(r, shift(r, 4), a);
30620 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
30621 R = SignBitSelect(VT, Amt, M, R);
30622
30623 // a += a
30624 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30625
30626 // r = VSELECT(r, shift(r, 2), a);
30627 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
30628 R = SignBitSelect(VT, Amt, M, R);
30629
30630 // a += a
30631 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30632
30633 // return VSELECT(r, shift(r, 1), a);
30634 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
30635 R = SignBitSelect(VT, Amt, M, R);
30636 return R;
30637 }
30638
30639 if (Opc == ISD::SRA) {
30640 // For SRA we need to unpack each byte to the higher byte of a i16 vector
30641 // so we can correctly sign extend. We don't care what happens to the
30642 // lower byte.
30643 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30644 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30645 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
30646 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
30647 ALo = DAG.getBitcast(ExtVT, ALo);
30648 AHi = DAG.getBitcast(ExtVT, AHi);
30649 RLo = DAG.getBitcast(ExtVT, RLo);
30650 RHi = DAG.getBitcast(ExtVT, RHi);
30651
30652 // r = VSELECT(r, shift(r, 4), a);
30653 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
30654 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
30655 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30656 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30657
30658 // a += a
30659 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30660 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30661
30662 // r = VSELECT(r, shift(r, 2), a);
30663 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
30664 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
30665 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30666 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30667
30668 // a += a
30669 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30670 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30671
30672 // r = VSELECT(r, shift(r, 1), a);
30673 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
30674 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
30675 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30676 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30677
30678 // Logical shift the result back to the lower byte, leaving a zero upper
30679 // byte meaning that we can safely pack with PACKUSWB.
30680 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
30681 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
30682 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
30683 }
30684 }
30685
30686 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
30687 MVT ExtVT = MVT::v8i32;
30688 SDValue Z = DAG.getConstant(0, dl, VT);
30689 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
30690 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
30691 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
30692 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
30693 ALo = DAG.getBitcast(ExtVT, ALo);
30694 AHi = DAG.getBitcast(ExtVT, AHi);
30695 RLo = DAG.getBitcast(ExtVT, RLo);
30696 RHi = DAG.getBitcast(ExtVT, RHi);
30697 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
30698 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
30699 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
30700 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
30701 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30702 }
30703
30704 if (VT == MVT::v8i16) {
30705 // If we have a constant shift amount, the non-SSE41 path is best as
30706 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
30707 bool UseSSE41 = Subtarget.hasSSE41() &&
30708 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30709
30710 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
30711 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
30712 // the sign bit.
30713 if (UseSSE41) {
30714 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
30715 V0 = DAG.getBitcast(ExtVT, V0);
30716 V1 = DAG.getBitcast(ExtVT, V1);
30717 Sel = DAG.getBitcast(ExtVT, Sel);
30718 return DAG.getBitcast(
30719 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
30720 }
30721 // On pre-SSE41 targets we splat the sign bit - a negative value will
30722 // set all bits of the lanes to true and VSELECT uses that in
30723 // its OR(AND(V0,C),AND(V1,~C)) lowering.
30724 SDValue C =
30725 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
30726 return DAG.getSelect(dl, VT, C, V0, V1);
30727 };
30728
30729 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
30730 if (UseSSE41) {
30731 // On SSE41 targets we need to replicate the shift mask in both
30732 // bytes for PBLENDVB.
30733 Amt = DAG.getNode(
30734 ISD::OR, dl, VT,
30735 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
30736 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
30737 } else {
30738 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
30739 }
30740
30741 // r = VSELECT(r, shift(r, 8), a);
30742 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
30743 R = SignBitSelect(Amt, M, R);
30744
30745 // a += a
30746 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30747
30748 // r = VSELECT(r, shift(r, 4), a);
30749 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
30750 R = SignBitSelect(Amt, M, R);
30751
30752 // a += a
30753 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30754
30755 // r = VSELECT(r, shift(r, 2), a);
30756 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
30757 R = SignBitSelect(Amt, M, R);
30758
30759 // a += a
30760 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30761
30762 // return VSELECT(r, shift(r, 1), a);
30763 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
30764 R = SignBitSelect(Amt, M, R);
30765 return R;
30766 }
30767
30768 // Decompose 256-bit shifts into 128-bit shifts.
30769 if (VT.is256BitVector())
30770 return splitVectorIntBinary(Op, DAG, dl);
30771
30772 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30773 return splitVectorIntBinary(Op, DAG, dl);
30774
30775 return SDValue();
30776}
30777
30778static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
30779 SelectionDAG &DAG) {
30780 MVT VT = Op.getSimpleValueType();
30781 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
30782 "Unexpected funnel shift opcode!");
30783
30784 SDLoc DL(Op);
30785 SDValue Op0 = Op.getOperand(0);
30786 SDValue Op1 = Op.getOperand(1);
30787 SDValue Amt = Op.getOperand(2);
30788 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30789 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
30790
30791 if (VT.isVector()) {
30792 APInt APIntShiftAmt;
30793 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30794 unsigned NumElts = VT.getVectorNumElements();
30795
30796 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
30797 if (IsFSHR)
30798 std::swap(Op0, Op1);
30799
30800 if (IsCstSplat) {
30801 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30802 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
30803 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
30804 {Op0, Op1, Imm}, DAG, Subtarget);
30805 }
30806 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
30807 {Op0, Op1, Amt}, DAG, Subtarget);
30808 }
30809 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30810 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
30811 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
30812 "Unexpected funnel shift type!");
30813
30814 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
30815 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
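// E.g. for vXi16 elements, unpack(y,x) forms i32 lanes (x << 16) | y, so
// shifting left by z and taking the high 16 bits gives
// (x << z) | (y >> (16 - z)), which is exactly fshl(x,y,z).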
30816 if (IsCstSplat) {
30817 // TODO: Can't use generic expansion as UNDEF amt elements can be
30818 // converted to other values when folded to shift amounts, losing the
30819 // splat.
30820 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30821 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
30822 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
30823 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
30824 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30825
30826 if (EltSizeInBits == 8 &&
30827 (Subtarget.hasXOP() ||
30828 (useVPTERNLOG(Subtarget, VT) &&
30829 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
30830 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
30831 // bit-select - lower using vXi16 shifts and then perform the bitmask at
30832 // the original vector width to handle cases where we split.
30833 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
30834 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
30835 SDValue ShX =
30836 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
30837 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
30838 SDValue ShY =
30839 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
30840 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
30841 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
30842 DAG.getConstant(MaskX, DL, VT));
30843 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
30844 DAG.getConstant(MaskY, DL, VT));
30845 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30846 }
30847
30848 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
30849 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
30850 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
30851 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
30852 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30853 }
30854
30855 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30856 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30857 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
30858
30859 // Constant vXi16 funnel shifts can be efficiently handled by default.
30860 if (IsCst && EltSizeInBits == 16)
30861 return SDValue();
30862
30863 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
30864 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30865 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30866
30867 // Split 256-bit integers on XOP/pre-AVX2 targets.
30868 // Split 512-bit integers on non 512-bit BWI targets.
30869 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
30870 !Subtarget.hasAVX2())) ||
30871 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
30872 EltSizeInBits < 32)) {
30873 // Pre-mask the amount modulo using the wider vector.
30874 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
30875 return splitVectorOp(Op, DAG, DL);
30876 }
30877
30878 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
30879 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
30880 int ScalarAmtIdx = -1;
30881 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
30882 // Uniform vXi16 funnel shifts can be efficiently handled by default.
30883 if (EltSizeInBits == 16)
30884 return SDValue();
30885
30886 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30887 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30888 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
30889 ScalarAmtIdx, Subtarget, DAG);
30890 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
30891 ScalarAmtIdx, Subtarget, DAG);
30892 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30893 }
30894 }
30895
30896 MVT WideSVT = MVT::getIntegerVT(
30897 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
30898 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
30899
30900 // If per-element shifts are legal, fallback to generic expansion.
30901 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
30902 return SDValue();
30903
30904 // Attempt to fold as:
30905 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30906 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30907 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30908 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30909 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
30910 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
30911 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30912 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
30913 EltSizeInBits, DAG);
30914 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
30915 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
30916 if (!IsFSHR)
30917 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
30918 EltSizeInBits, DAG);
30919 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
30920 }
30921
30922 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30923 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
30924 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30925 SDValue Z = DAG.getConstant(0, DL, VT);
30926 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30927 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30928 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30929 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30930 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30931 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30932 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30933 }
30934
30935 // Fallback to generic expansion.
30936 return SDValue();
30937 }
30938 assert(
30939 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
30940 "Unexpected funnel shift type!");
30941
30942 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30943 bool OptForSize = DAG.shouldOptForSize();
30944 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30945
30946 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30947 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30948 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30949 !isa<ConstantSDNode>(Amt)) {
30950 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30951 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30952 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30953 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30954 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30955 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30956 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30957 if (IsFSHR) {
30958 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30959 } else {
30960 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30961 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30962 }
30963 return DAG.getZExtOrTrunc(Res, DL, VT);
30964 }
30965
30966 if (VT == MVT::i8 || ExpandFunnel)
30967 return SDValue();
30968
30969 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30970 if (VT == MVT::i16) {
30971 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30972 DAG.getConstant(15, DL, Amt.getValueType()));
30973 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30974 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30975 }
30976
30977 return Op;
30978}
30979
30980static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30981 SelectionDAG &DAG) {
30982 MVT VT = Op.getSimpleValueType();
30983 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30984
30985 SDLoc DL(Op);
30986 SDValue R = Op.getOperand(0);
30987 SDValue Amt = Op.getOperand(1);
30988 unsigned Opcode = Op.getOpcode();
30989 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30990 int NumElts = VT.getVectorNumElements();
30991 bool IsROTL = Opcode == ISD::ROTL;
30992
30993 // Check for constant splat rotation amount.
30994 APInt CstSplatValue;
30995 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30996
30997 // Check for splat rotate by zero.
30998 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30999 return R;
31000
31001 // AVX512 implicitly uses modulo rotation amounts.
31002 if ((Subtarget.hasVLX() ||
31003 (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
31004 32 <= EltSizeInBits) {
31005 // Attempt to rotate by immediate.
31006 if (IsCstSplat) {
31007 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31008 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31009 return DAG.getNode(RotOpc, DL, VT, R,
31010 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31011 }
31012
31013 // Else, fall-back on VPROLV/VPRORV.
31014 return Op;
31015 }
31016
31017 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31018 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31019 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31020 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31021 }
31022
31023 SDValue Z = DAG.getConstant(0, DL, VT);
31024
31025 if (!IsROTL) {
31026 // If the ISD::ROTR amount is constant, we're always better off converting
31027 // to ISD::ROTL.
31028 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31029 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31030
31031 // XOP targets always prefer ISD::ROTL.
31032 if (Subtarget.hasXOP())
31033 return DAG.getNode(ISD::ROTL, DL, VT, R,
31034 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31035 }
31036
31037 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31038 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31039 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31040 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31041 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31042 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31043 DAG.getTargetConstant(0, DL, MVT::i8));
31044 }
31045
31046 // Split 256-bit integers on XOP/pre-AVX2 targets.
31047 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31048 return splitVectorIntBinary(Op, DAG, DL);
31049
31050 // XOP has 128-bit vector variable + immediate rotates.
31051 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31052 // XOP implicitly uses modulo rotation amounts.
31053 if (Subtarget.hasXOP()) {
31054 assert(IsROTL && "Only ROTL expected");
31055 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31056
31057 // Attempt to rotate by immediate.
31058 if (IsCstSplat) {
31059 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31060 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31061 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31062 }
31063
31064 // Use general rotate by variable (per-element).
31065 return Op;
31066 }
31067
31068 // Rotate by a uniform constant - expand back to shifts.
31069 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31070 // to other values when folded to shift amounts, losing the splat.
31071 if (IsCstSplat) {
31072 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31073 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31074 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31075 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31076 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31077 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31078 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31079 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31080 }
31081
31082 // Split 512-bit integers on non 512-bit BWI targets.
31083 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31084 return splitVectorIntBinary(Op, DAG, DL);
31085
31086 assert(
31087 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31088 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31089 Subtarget.hasAVX2()) ||
31090 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31091 "Only vXi32/vXi16/vXi8 vector rotates supported");
31092
31093 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31094 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31095
31096 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31097 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31098
31099 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31100 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31101 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
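// (Same trick as for the funnel shifts above, with both unpack operands
// equal to x, since rotl(x,y) == fshl(x,x,y).)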
31102 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31103 int BaseRotAmtIdx = -1;
31104 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31105 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31106 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31107 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31108 }
31109 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31110 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31111 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31112 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31113 BaseRotAmtIdx, Subtarget, DAG);
31114 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31115 BaseRotAmtIdx, Subtarget, DAG);
31116 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31117 }
31118 }
31119
31120 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31121 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31122
31123 // Attempt to fold as unpack(x,x) << zext(y):
31124 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31125 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31126 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31127 if (!(ConstantAmt && EltSizeInBits != 8) &&
31128 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31129 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31130 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31131 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31132 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31133 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31134 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31135 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31136 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31137 }
31138
31139 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31140 // the amount bit.
31141 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31142 if (EltSizeInBits == 8) {
31143 MVT WideVT =
31144 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31145
31146 // Attempt to fold as:
31147 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31148 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31149 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31150 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31151 // If we're rotating by constant, just use default promotion.
31152 if (ConstantAmt)
31153 return SDValue();
31154 // See if we can perform this by widening to vXi16 or vXi32.
31155 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31156 R = DAG.getNode(
31157 ISD::OR, DL, WideVT, R,
31158 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31159 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31160 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31161 if (IsROTL)
31162 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31163 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31164 }
31165
31166 // We don't need ModuloAmt here as we just peek at individual bits.
31167 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31168 if (Subtarget.hasSSE41()) {
31169 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31170 // on the sign bit.
31171 V0 = DAG.getBitcast(VT, V0);
31172 V1 = DAG.getBitcast(VT, V1);
31173 Sel = DAG.getBitcast(VT, Sel);
31174 return DAG.getBitcast(SelVT,
31175 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31176 }
31177 // On pre-SSE41 targets we test for the sign bit by comparing to
31178 // zero - a negative value will set all bits of the lanes to true
31179 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31180 SDValue Z = DAG.getConstant(0, DL, SelVT);
31181 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31182 return DAG.getSelect(DL, SelVT, C, V0, V1);
31183 };
31184
31185 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31186 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31187 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31188 IsROTL = true;
31189 }
31190
31191 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31192 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31193
31194 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31195 // We can safely do this using i16 shifts as we're only interested in
31196 // the 3 lower bits of each byte.
31197 Amt = DAG.getBitcast(ExtVT, Amt);
31198 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31199 Amt = DAG.getBitcast(VT, Amt);
31200
31201 // r = VSELECT(r, rot(r, 4), a);
31202 SDValue M;
31203 M = DAG.getNode(
31204 ISD::OR, DL, VT,
31205 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31206 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31207 R = SignBitSelect(VT, Amt, M, R);
31208
31209 // a += a
31210 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31211
31212 // r = VSELECT(r, rot(r, 2), a);
31213 M = DAG.getNode(
31214 ISD::OR, DL, VT,
31215 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31216 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31217 R = SignBitSelect(VT, Amt, M, R);
31218
31219 // a += a
31220 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31221
31222 // return VSELECT(r, rot(r, 1), a);
31223 M = DAG.getNode(
31224 ISD::OR, DL, VT,
31225 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31226 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31227 return SignBitSelect(VT, Amt, M, R);
31228 }
31229
31230 bool IsSplatAmt = DAG.isSplatValue(Amt);
31231 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31232 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31233
31234 // Fall back for splats + all supported variable shifts.
31235 // Fall back for non-constant AVX2 vXi16 as well.
31236 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31237 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31238 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31239 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31240 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31241 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31242 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31243 }
31244
31245 // Everything below assumes ISD::ROTL.
31246 if (!IsROTL) {
31247 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31248 IsROTL = true;
31249 }
31250
31251 // ISD::ROT* uses modulo rotate amounts.
31252 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31253
31254 assert(IsROTL && "Only ROTL supported");
31255
31256 // As with shifts, attempt to convert the rotation amount to a multiplication
31257 // factor; otherwise fall back to general expansion.
31258 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31259 if (!Scale)
31260 return SDValue();
31261
31262 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
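// E.g. rotl(x, 3) == (x * 8) | MULHU(x, 8): the low product is x << 3 and
// the high product is x >> 13, i.e. the bits that wrapped around.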
31263 if (EltSizeInBits == 16) {
31264 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31265 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31266 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31267 }
31268
31269 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31270 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31271 // that can then be OR'd with the lower 32-bits.
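// E.g. rotating a lane left by 7: PMULUDQ(x, 1 << 7) yields a 64-bit product
// whose low 32 bits are x << 7 and whose high 32 bits are x >> 25; OR'ing
// the two halves back together produces the rotate.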
31272 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31273 static const int OddMask[] = {1, 1, 3, 3};
31274 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31275 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31276
31277 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31278 DAG.getBitcast(MVT::v2i64, R),
31279 DAG.getBitcast(MVT::v2i64, Scale));
31280 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31281 DAG.getBitcast(MVT::v2i64, R13),
31282 DAG.getBitcast(MVT::v2i64, Scale13));
31283 Res02 = DAG.getBitcast(VT, Res02);
31284 Res13 = DAG.getBitcast(VT, Res13);
31285
31286 return DAG.getNode(ISD::OR, DL, VT,
31287 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31288 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31289}
31290
31291/// Returns true if the operand type is exactly twice the native width, and
31292/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31293/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31294/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31295bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31296 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31297
31298 if (OpWidth == 64)
31299 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31300 if (OpWidth == 128)
31301 return Subtarget.canUseCMPXCHG16B();
31302
31303 return false;
31304}
31305
31306TargetLowering::AtomicExpansionKind
31307X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31308 Type *MemType = SI->getValueOperand()->getType();
31309
31310 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31311 !Subtarget.useSoftFloat()) {
31312 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31313 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31314 return AtomicExpansionKind::None;
31315
31316 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31317 Subtarget.hasAVX())
31318 return AtomicExpansionKind::None;
31319 }
31320
31321 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31322 : AtomicExpansionKind::None;
31323}
31324
31325// Note: this turns large loads into lock cmpxchg8b/16b.
31326TargetLowering::AtomicExpansionKind
31327X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31328 Type *MemType = LI->getType();
31329
31330 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31331 !Subtarget.useSoftFloat()) {
31332 // If this is a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
31333 // can use movq to do the load. If we have X87 we can load into an 80-bit
31334 // X87 register and store it to a stack temporary.
31335 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31336 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31337 return AtomicExpansionKind::None;
31338
31339 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31340 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31341 Subtarget.hasAVX())
31342 return AtomicExpansionKind::None;
31343 }
31344
31345 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31346 : AtomicExpansionKind::None;
31347}
31348
31349enum BitTestKind : unsigned {
31350 UndefBit,
31351 ConstantBit,
31352 NotConstantBit,
31353 ShiftBit,
31354 NotShiftBit
31355};
31356
31357static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31358 using namespace llvm::PatternMatch;
31359 BitTestKind BTK = UndefBit;
31360 if (auto *C = dyn_cast<ConstantInt>(V)) {
31361 // Check if V is a power of 2 or NOT power of 2.
31362 if (isPowerOf2_64(C->getZExtValue()))
31363 BTK = ConstantBit;
31364 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31365 BTK = NotConstantBit;
31366 return {V, BTK};
31367 }
31368
31369 // Check if V is some power of 2 pattern known to be non-zero
31370 if (auto *I = dyn_cast<Instruction>(V)) {
31371 bool Not = false;
31372 // Check if we have a NOT
31373 Value *PeekI;
31374 if (match(I, m_Not(m_Value(PeekI))) ||
31375 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31376 Not = true;
31377 I = dyn_cast<Instruction>(PeekI);
31378
31379 // If I is constant, it will fold and we can evaluate later. If it's an
31380 // argument or something of that nature, we can't analyze.
31381 if (I == nullptr)
31382 return {nullptr, UndefBit};
31383 }
31384 // We can only use 1 << X without more sophisticated analysis. C << X where
31385 // C is a power of 2 but not 1 can result in zero which cannot be translated
31386 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
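// E.g. for i32, (2 << 31) == 0 and (8 >> 4) == 0, so only 1 << X is
// guaranteed to have exactly one bit set for any in-range X.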
31387 if (I->getOpcode() == Instruction::Shl) {
31388 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31389 // -X` and some other provable power of 2 patterns that we can use CTZ on
31390 // may be profitable.
31391 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31392 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31393 // be provably a non-zero power of 2.
31394 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31395 // transformable to bittest.
31396 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31397 if (!ShiftVal)
31398 return {nullptr, UndefBit};
31399 if (ShiftVal->equalsInt(1))
31400 BTK = Not ? NotShiftBit : ShiftBit;
31401
31402 if (BTK == UndefBit)
31403 return {nullptr, UndefBit};
31404
31405 Value *BitV = I->getOperand(1);
31406
31407 // Read past a shiftmask instruction to find count
31408 Value *AndOp;
31409 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31410 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31411 BitV = AndOp;
31412
31413 return {BitV, BTK};
31414 }
31415 }
31416 return {nullptr, UndefBit};
31417}
31418
31419TargetLowering::AtomicExpansionKind
31420X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31421 using namespace llvm::PatternMatch;
31422 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31423 // prefix to a normal instruction for these operations.
31424 if (AI->use_empty())
31425 return AtomicExpansionKind::None;
31426
31427 if (AI->getOperation() == AtomicRMWInst::Xor) {
31428 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31429 // preferable to both `cmpxchg` and `btc`.
31430 if (match(AI->getOperand(1), m_SignMask()))
31431 return AtomicExpansionKind::None;
31432 }
31433
31434 // If the atomicrmw's result is used by a single bit AND, we may use
31435 // bts/btr/btc instructions for these operations.
31436 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31437 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31438 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31439 // detect it.
31440 Instruction *I = AI->user_back();
31441 auto BitChange = FindSingleBitChange(AI->getValOperand());
31442 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31443 I->getOpcode() != Instruction::And ||
31444 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31445 AI->getParent() != I->getParent())
31446 return AtomicExpansionKind::CmpXChg;
31447
31448 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31449
31450 // This is a redundant AND, it should get cleaned up elsewhere.
31451 if (AI == I->getOperand(OtherIdx))
31452 return AtomicExpansionKind::CmpXChg;
31453
31454 // The following instruction must be an AND with a single bit.
31455 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31456 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31457 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31458 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31459 return AtomicExpansionKind::CmpXChg;
31460 }
31461 if (AI->getOperation() == AtomicRMWInst::And) {
31462 return ~C1->getValue() == C2->getValue()
31463 ? AtomicExpansionKind::BitTestIntrinsic
31464 : AtomicExpansionKind::CmpXChg;
31465 }
31466 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31467 : AtomicExpansionKind::CmpXChg;
31468 }
31469
31470 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31471
31472 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31473 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31474 return AtomicExpansionKind::CmpXChg;
31475
31476 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31477
31478 // If shift amounts are not the same we can't use BitTestIntrinsic.
31479 if (BitChange.first != BitTested.first)
31480 return AtomicExpansionKind::CmpXChg;
31481
31482 // For an atomic AND, the mask must have all but one bit set, and we must be
31483 // testing the single bit that is unset in the mask.
31484 if (AI->getOperation() == AtomicRMWInst::And)
31485 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31486 ? AtomicExpansionKind::BitTestIntrinsic
31487 : AtomicExpansionKind::CmpXChg;
31488
31489 // For an atomic XOR/OR, we must be setting and testing the same bit.
31490 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31491 ? AtomicExpansionKind::BitTestIntrinsic
31492 : AtomicExpansionKind::CmpXChg;
31493}
31494
31495void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31496 IRBuilder<> Builder(AI);
31497 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31498 Intrinsic::ID IID_C;
31499 Intrinsic::ID IID_I;
31500 switch (AI->getOperation()) {
31501 default:
31502 llvm_unreachable("Unknown atomic operation");
31503 case AtomicRMWInst::Or:
31504 IID_C = Intrinsic::x86_atomic_bts;
31505 IID_I = Intrinsic::x86_atomic_bts_rm;
31506 break;
31507 case AtomicRMWInst::Xor:
31508 IID_C = Intrinsic::x86_atomic_btc;
31509 IID_I = Intrinsic::x86_atomic_btc_rm;
31510 break;
31511 case AtomicRMWInst::And:
31512 IID_C = Intrinsic::x86_atomic_btr;
31513 IID_I = Intrinsic::x86_atomic_btr_rm;
31514 break;
31515 }
31516 Instruction *I = AI->user_back();
31517 LLVMContext &Ctx = AI->getContext();
31518 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31519 PointerType::getUnqual(Ctx));
31520 Value *Result = nullptr;
31521 auto BitTested = FindSingleBitChange(AI->getValOperand());
31522 assert(BitTested.first != nullptr);
31523
31524 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31525 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31526
31527 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31528 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31529 {Addr, Builder.getInt8(Imm)});
31530 } else {
31531 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31532
31533 Value *SI = BitTested.first;
31534 assert(SI != nullptr);
31535
31536 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
31537 // to mask it.
31538 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31539 Value *BitPos =
31540 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31541 // Todo(1): In many cases it may be provable that SI is less than
31542 // ShiftBits in which case this mask is unnecessary
31543 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31544 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31545 // favor of just a raw BT{S|R|C}.
31546
31547 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31548 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31549
31550 // If the result is only used for zero/non-zero status then we don't need to
31551 // shift the value back. Otherwise do so.
31552 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31553 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31554 if (ICmp->isEquality()) {
31555 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31556 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31557 if (C0 || C1) {
31558 assert(C0 == nullptr || C1 == nullptr);
31559 if ((C0 ? C0 : C1)->isZero())
31560 continue;
31561 }
31562 }
31563 }
31564 Result = Builder.CreateShl(Result, BitPos);
31565 break;
31566 }
31567 }
31568
31569 I->replaceAllUsesWith(Result);
31570 I->eraseFromParent();
31571 AI->eraseFromParent();
31572}
31573
31574static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31575 using namespace llvm::PatternMatch;
31576 if (!AI->hasOneUse())
31577 return false;
31578
31579 Value *Op = AI->getOperand(1);
31580 CmpPredicate Pred;
31581 Instruction *I = AI->user_back();
31582 AtomicRMWInst::BinOp Opc = AI->getOperation();
31583 if (Opc == AtomicRMWInst::Add) {
31584 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31585 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31586 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31587 if (match(I->user_back(),
31589 return true;
31590 if (match(I->user_back(),
31592 return true;
31593 }
31594 return false;
31595 }
31596 if (Opc == AtomicRMWInst::Sub) {
31597 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31598 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31599 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
31600 if (match(I->user_back(),
31602 return true;
31603 if (match(I->user_back(),
31605 return true;
31606 }
31607 return false;
31608 }
31609 if ((Opc == AtomicRMWInst::Or &&
31611 (Opc == AtomicRMWInst::And &&
31613 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31614 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
31615 Pred == CmpInst::ICMP_SLT;
31616 if (match(I->user_back(),
31618 return true;
31619 return false;
31620 }
31621 if (Opc == AtomicRMWInst::Xor) {
31622 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31623 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31624 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
31625 if (match(I->user_back(),
31627 return true;
31628 if (match(I->user_back(),
31630 return true;
31631 }
31632 return false;
31633 }
31634
31635 return false;
31636}
31637
31638void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
31639 AtomicRMWInst *AI) const {
31640 IRBuilder<> Builder(AI);
31641 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31642 Instruction *TempI = nullptr;
31643 LLVMContext &Ctx = AI->getContext();
31644 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
31645 if (!ICI) {
31646 TempI = AI->user_back();
31647 assert(TempI->hasOneUse() && "Must have one use");
31648 ICI = cast<ICmpInst>(TempI->user_back());
31649 }
31650 X86::CondCode CC = X86::COND_INVALID;
31651 ICmpInst::Predicate Pred = ICI->getPredicate();
31652 switch (Pred) {
31653 default:
31654 llvm_unreachable("Not supported Pred");
31655 case CmpInst::ICMP_EQ:
31656 CC = X86::COND_E;
31657 break;
31658 case CmpInst::ICMP_NE:
31659 CC = X86::COND_NE;
31660 break;
31661 case CmpInst::ICMP_SLT:
31662 CC = X86::COND_S;
31663 break;
31664 case CmpInst::ICMP_SGT:
31665 CC = X86::COND_NS;
31666 break;
31667 }
31668 Intrinsic::ID IID;
31669 switch (AI->getOperation()) {
31670 default:
31671 llvm_unreachable("Unknown atomic operation");
31672 case AtomicRMWInst::Add:
31673 IID = Intrinsic::x86_atomic_add_cc;
31674 break;
31675 case AtomicRMWInst::Sub:
31676 IID = Intrinsic::x86_atomic_sub_cc;
31677 break;
31678 case AtomicRMWInst::Or:
31679 IID = Intrinsic::x86_atomic_or_cc;
31680 break;
31681 case AtomicRMWInst::And:
31682 IID = Intrinsic::x86_atomic_and_cc;
31683 break;
31684 case AtomicRMWInst::Xor:
31685 IID = Intrinsic::x86_atomic_xor_cc;
31686 break;
31687 }
31688 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31689 PointerType::getUnqual(Ctx));
31690 Value *Call = Builder.CreateIntrinsic(
31691 IID, AI->getType(),
31692 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
31693 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
31694 ICI->replaceAllUsesWith(Result);
31695 ICI->eraseFromParent();
31696 if (TempI)
31697 TempI->eraseFromParent();
31698 AI->eraseFromParent();
31699}
31700
31701TargetLowering::AtomicExpansionKind
31702X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
31703 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31704 Type *MemType = AI->getType();
31705
31706 // If the operand is too big, we must see if cmpxchg8/16b is available
31707 // and default to library calls otherwise.
31708 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
31709 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31710 : AtomicExpansionKind::None;
31711 }
31712
31713 AtomicRMWInst::BinOp Op = AI->getOperation();
31714 switch (Op) {
31715 case AtomicRMWInst::Xchg:
31716 return AtomicExpansionKind::None;
31717 case AtomicRMWInst::Add:
31718 case AtomicRMWInst::Sub:
31719 if (shouldExpandCmpArithRMWInIR(AI))
31720 return AtomicExpansionKind::CmpArithIntrinsic;
31721 // It's better to use xadd, xsub or xchg for these in other cases.
31722 return AtomicExpansionKind::None;
31723 case AtomicRMWInst::Or:
31724 case AtomicRMWInst::And:
31725 case AtomicRMWInst::Xor:
31726 if (shouldExpandCmpArithRMWInIR(AI))
31727 return AtomicExpansionKind::CmpArithIntrinsic;
31728 return shouldExpandLogicAtomicRMWInIR(AI);
31729 case AtomicRMWInst::Nand:
31730 case AtomicRMWInst::Max:
31731 case AtomicRMWInst::Min:
31742 default:
31743 // These always require a non-trivial set of data operations on x86. We must
31744 // use a cmpxchg loop.
31745 return AtomicExpansionKind::CmpXChg;
31746 }
31747}
31748
31749LoadInst *
31750X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
31751 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31752 Type *MemType = AI->getType();
31753 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
31754 // there is no benefit in turning such RMWs into loads, and it is actually
31755 // harmful as it introduces a mfence.
31756 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
31757 return nullptr;
31758
31759 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
31760 // lowering available in lowerAtomicArith.
31761 // TODO: push more cases through this path.
31762 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
31763 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
31764 AI->use_empty())
31765 return nullptr;
31766
31767 IRBuilder<> Builder(AI);
31768 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31769 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
31770 auto SSID = AI->getSyncScopeID();
31771 // We must restrict the ordering to avoid generating loads with Release or
31772 // ReleaseAcquire orderings.
31774
31775 // Before the load we need a fence. Here is an example lifted from
31776 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
31777 // is required:
31778 // Thread 0:
31779 // x.store(1, relaxed);
31780 // r1 = y.fetch_add(0, release);
31781 // Thread 1:
31782 // y.fetch_add(42, acquire);
31783 // r2 = x.load(relaxed);
31784 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
31785 // lowered to just a load without a fence. An mfence flushes the store buffer,
31786 // making the optimization clearly correct.
31787 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
31788 // otherwise, we might be able to be more aggressive on relaxed idempotent
31789 // rmw. In practice, they do not look useful, so we don't try to be
31790 // especially clever.
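// As an illustration (assuming MFENCE is available and the scope is not
// SingleThread), a used, canonical idempotent RMW such as
// %old = atomicrmw or ptr %p, i32 0 seq_cst
// is lowered here to an MFENCE followed by an atomic load of %p, which on x86
// is just an ordinary mov.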
31791 if (SSID == SyncScope::SingleThread)
31792 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
31793 // the IR level, so we must wrap it in an intrinsic.
31794 return nullptr;
31795
31796 if (!Subtarget.hasMFence())
31797 // FIXME: it might make sense to use a locked operation here but on a
31798 // different cache-line to prevent cache-line bouncing. In practice it
31799 // is probably a small win, and x86 processors without mfence are rare
31800 // enough that we do not bother.
31801 return nullptr;
31802
31803 Function *MFence =
31804 llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence);
31805 Builder.CreateCall(MFence, {});
31806
31807 // Finally we can emit the atomic load.
31808 LoadInst *Loaded = Builder.CreateAlignedLoad(
31809 AI->getType(), AI->getPointerOperand(), AI->getAlign());
31810 Loaded->setAtomic(Order, SSID);
31811 AI->replaceAllUsesWith(Loaded);
31812 AI->eraseFromParent();
31813 return Loaded;
31814}
31815
31816/// Emit a locked operation on a stack location which does not change any
31817/// memory location, but does involve a lock prefix. Location is chosen to be
31818/// a) very likely accessed only by a single thread to minimize cache traffic,
31819/// and b) definitely dereferenceable. Returns the new Chain result.
31820static SDValue emitLockedStackOp(SelectionDAG &DAG,
31821 const X86Subtarget &Subtarget, SDValue Chain,
31822 const SDLoc &DL) {
31823 // Implementation notes:
31824 // 1) LOCK prefix creates a full read/write reordering barrier for memory
31825 // operations issued by the current processor. As such, the location
31826 // referenced is not relevant for the ordering properties of the instruction.
31827 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
31828 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
31829 // 2) Using an immediate operand appears to be the best encoding choice
31830 // here since it doesn't require an extra register.
31831 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
31832 // is small enough it might just be measurement noise.)
31833 // 4) When choosing offsets, there are several contributing factors:
31834 // a) If there's no redzone, we default to TOS. (We could allocate a cache
31835 // line aligned stack object to improve this case.)
31836 // b) To minimize our chances of introducing a false dependence, we prefer
31837 // to offset the stack usage from TOS slightly.
31838 // c) To minimize concerns about cross thread stack usage - in particular,
31839 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
31840 // captures state in the TOS frame and accesses it from many threads -
31841 // we want to use an offset such that the offset is in a distinct cache
31842 // line from the TOS frame.
31843 //
31844 // For a general discussion of the tradeoffs and benchmark results, see:
31845 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
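// Concretely, the node built below typically assembles to something like
// lock orl $0, -64(%rsp) (64-bit with a 128-byte red zone) or
// lock orl $0, (%esp) (32-bit, no red zone): a store that changes no bits but
// whose LOCK prefix provides the required fencing.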
31846
31847 auto &MF = DAG.getMachineFunction();
31848 auto &TFL = *Subtarget.getFrameLowering();
31849 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31850
31851 if (Subtarget.is64Bit()) {
31852 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31853 SDValue Ops[] = {
31854 DAG.getRegister(X86::RSP, MVT::i64), // Base
31855 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31856 DAG.getRegister(0, MVT::i64), // Index
31857 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31858 DAG.getRegister(0, MVT::i16), // Segment.
31859 Zero,
31860 Chain};
31861 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31862 MVT::Other, Ops);
31863 return SDValue(Res, 1);
31864 }
31865
31866 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31867 SDValue Ops[] = {
31868 DAG.getRegister(X86::ESP, MVT::i32), // Base
31869 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31870 DAG.getRegister(0, MVT::i32), // Index
31871 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31872 DAG.getRegister(0, MVT::i16), // Segment.
31873 Zero,
31874 Chain
31875 };
31876 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31877 MVT::Other, Ops);
31878 return SDValue(Res, 1);
31879}
31880
31881static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
31882 SelectionDAG &DAG) {
31883 SDLoc dl(Op);
31884 AtomicOrdering FenceOrdering =
31885 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
31886 SyncScope::ID FenceSSID =
31887 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
31888
31889 // The only fence that needs an instruction is a sequentially-consistent
31890 // cross-thread fence.
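// For example, 'fence seq_cst' at system scope becomes MFENCE (or the locked
// stack operation below when MFENCE is unavailable), while weaker fences such
// as 'fence acquire' only constrain the compiler and emit no instruction.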
31891 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
31892 FenceSSID == SyncScope::System) {
31893 if (Subtarget.hasMFence())
31894 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
31895
31896 SDValue Chain = Op.getOperand(0);
31897 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
31898 }
31899
31900 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31901 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
31902}
31903
31905 SelectionDAG &DAG) {
31906 MVT T = Op.getSimpleValueType();
31907 SDLoc DL(Op);
31908 unsigned Reg = 0;
31909 unsigned size = 0;
31910 switch(T.SimpleTy) {
31911 default: llvm_unreachable("Invalid value type!");
31912 case MVT::i8: Reg = X86::AL; size = 1; break;
31913 case MVT::i16: Reg = X86::AX; size = 2; break;
31914 case MVT::i32: Reg = X86::EAX; size = 4; break;
31915 case MVT::i64:
31916 assert(Subtarget.is64Bit() && "Node not type legal!");
31917 Reg = X86::RAX; size = 8;
31918 break;
31919 }
31920 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
31921 Op.getOperand(2), SDValue());
31922 SDValue Ops[] = { cpIn.getValue(0),
31923 Op.getOperand(1),
31924 Op.getOperand(3),
31925 DAG.getTargetConstant(size, DL, MVT::i8),
31926 cpIn.getValue(1) };
31927 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31928 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
31930 Ops, T, MMO);
31931
31932 SDValue cpOut =
31933 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
31934 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
31935 MVT::i32, cpOut.getValue(2));
31936 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
31937
31938 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
31939 cpOut, Success, EFLAGS.getValue(1));
31940}
31941
31942// Create MOVMSKB, taking into account whether we need to split for AVX1.
31944 const X86Subtarget &Subtarget) {
31945 MVT InVT = V.getSimpleValueType();
31946
31947 if (InVT == MVT::v64i8) {
31948 SDValue Lo, Hi;
31949 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31950 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
31951 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
31952 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
31953 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
31954 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
31955 DAG.getConstant(32, DL, MVT::i8));
31956 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
31957 }
31958 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
31959 SDValue Lo, Hi;
31960 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31961 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
31962 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
31963 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
31964 DAG.getConstant(16, DL, MVT::i8));
31965 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
31966 }
31967
31968 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31969}
31970
31971static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
31972 SelectionDAG &DAG) {
31973 SDValue Src = Op.getOperand(0);
31974 MVT SrcVT = Src.getSimpleValueType();
31975 MVT DstVT = Op.getSimpleValueType();
31976
31977 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
31978 // half to v32i1 and concatenating the result.
31979 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
31980 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31981 assert(Subtarget.hasBWI() && "Expected BWI target");
31982 SDLoc dl(Op);
31983 SDValue Lo, Hi;
31984 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
31985 Lo = DAG.getBitcast(MVT::v32i1, Lo);
31986 Hi = DAG.getBitcast(MVT::v32i1, Hi);
31987 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
31988 }
31989
31990 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
31991 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
31992 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
31993 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
31994 SDLoc DL(Op);
31995 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
31996 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31997 return DAG.getZExtOrTrunc(V, DL, DstVT);
31998 }
31999
32000 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32001 SrcVT == MVT::i64) && "Unexpected VT!");
32002
32003 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32004 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32005 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32006 // This conversion needs to be expanded.
32007 return SDValue();
32008
32009 SDLoc dl(Op);
32010 if (SrcVT.isVector()) {
32011 // Widen the input vector in the case of MVT::v2i32.
32012 // Example: from MVT::v2i32 to MVT::v4i32.
32014 SrcVT.getVectorNumElements() * 2);
32015 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32016 DAG.getUNDEF(SrcVT));
32017 } else {
32018 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32019 "Unexpected source type in LowerBITCAST");
32020 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32021 }
32022
32023 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32024 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32025
32026 if (DstVT == MVT::x86mmx)
32027 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32028
32029 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32030 DAG.getVectorIdxConstant(0, dl));
32031}
32032
32033/// Compute the horizontal sum of bytes in V for the elements of VT.
32034///
32035/// Requires V to be a byte vector and VT to be an integer vector type with
32036/// wider elements than V's type. The width of the elements of VT determines
32037/// how many bytes of V are summed horizontally to produce each element of the
32038/// result.
32040 const X86Subtarget &Subtarget,
32041 SelectionDAG &DAG) {
32042 SDLoc DL(V);
32043 MVT ByteVecVT = V.getSimpleValueType();
32044 MVT EltVT = VT.getVectorElementType();
32045 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32046 "Expected value to have byte element type.");
32047 assert(EltVT != MVT::i8 &&
32048 "Horizontal byte sum only makes sense for wider elements!");
32049 unsigned VecSize = VT.getSizeInBits();
32050 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32051
32052 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32053 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32054 if (EltVT == MVT::i64) {
32055 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32056 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32057 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32058 return DAG.getBitcast(VT, V);
32059 }
32060
32061 if (EltVT == MVT::i32) {
32062 // We unpack the low half and high half into i32s interleaved with zeros so
32063 // that we can use PSADBW to horizontally sum them. The most useful part of
32064 // this is that it lines up the results of two PSADBW instructions to be
32065 // two v2i64 vectors which concatenated are the 4 population counts. We can
32066 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32067 SDValue Zeros = DAG.getConstant(0, DL, VT);
32068 SDValue V32 = DAG.getBitcast(VT, V);
32069 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32070 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32071
32072 // Do the horizontal sums into two v2i64s.
32073 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32074 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32075 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32076 DAG.getBitcast(ByteVecVT, Low), Zeros);
32077 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32078 DAG.getBitcast(ByteVecVT, High), Zeros);
32079
32080 // Merge them together.
32081 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32082 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32083 DAG.getBitcast(ShortVecVT, Low),
32084 DAG.getBitcast(ShortVecVT, High));
32085
32086 return DAG.getBitcast(VT, V);
32087 }
32088
32089 // The only element type left is i16.
32090 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32091
32092 // To obtain the pop count for each i16 element starting from the pop count
32093 // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift as
32094 // i16s right by 8. It is important to shift as i16s since an i8 vector shift
32095 // isn't directly supported.
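// A worked example for one i16 lane holding per-byte counts 0x0203 (high byte
// 2, low byte 3): shl 8 gives 0x0300, the i8 add with the original value gives
// 0x0503, and srl 8 leaves 0x0005, i.e. the lane's pop count of 5.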
32096 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32097 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32098 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32099 DAG.getBitcast(ByteVecVT, V));
32100 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32101}
32102
32104 const X86Subtarget &Subtarget,
32105 SelectionDAG &DAG) {
32106 MVT VT = Op.getSimpleValueType();
32107 MVT EltVT = VT.getVectorElementType();
32108 int NumElts = VT.getVectorNumElements();
32109 (void)EltVT;
32110 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32111
32112 // Implement a lookup table in register by using an algorithm based on:
32113 // http://wm.ite.pl/articles/sse-popcount.html
32114 //
32115 // The general idea is that every lower byte nibble in the input vector is an
32116 // index into an in-register pre-computed pop count table. We then split up the
32117 // input vector into two new ones: (1) a vector with only the shifted-right
32118 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32119 // masked out higher ones) for each byte. PSHUFB is used separately with both
32120 // to index the in-register table. Next, both are added and the result is an
32121 // i8 vector where each element contains the pop count for its input byte.
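// For example, for the input byte 0xB7 (0b10110111), LUT[0x7] = 3 covers the
// low nibble and LUT[0xB] = 3 covers the high nibble, so the summed result is
// 6, the pop count of 0xB7. A rough scalar equivalent of the per-byte step is
// count = LUT[b & 0xF] + LUT[b >> 4].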
32122 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32123 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32124 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32125 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32126
32128 for (int i = 0; i < NumElts; ++i)
32129 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32130 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32131 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32132
32133 // High nibbles
32134 SDValue FourV = DAG.getConstant(4, DL, VT);
32135 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32136
32137 // Low nibbles
32138 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32139
32140 // The input vector is used as the shuffle mask that indexes elements of the
32141 // LUT. After counting low and high nibbles, add the two results to obtain the
32142 // final pop count per i8 element.
32143 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32144 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32145 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32146}
32147
32148// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32149// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32151 const X86Subtarget &Subtarget,
32152 SelectionDAG &DAG) {
32153 MVT VT = Op.getSimpleValueType();
32154 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32155 "Unknown CTPOP type to handle");
32156 SDValue Op0 = Op.getOperand(0);
32157
32158 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32159 if (Subtarget.hasVPOPCNTDQ()) {
32160 unsigned NumElems = VT.getVectorNumElements();
32161 assert((VT.getVectorElementType() == MVT::i8 ||
32162 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32163 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32164 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32165 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32166 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32167 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32168 }
32169 }
32170
32171 // Decompose 256-bit ops into smaller 128-bit ops.
32172 if (VT.is256BitVector() && !Subtarget.hasInt256())
32173 return splitVectorIntUnary(Op, DAG, DL);
32174
32175 // Decompose 512-bit ops into smaller 256-bit ops.
32176 if (VT.is512BitVector() && !Subtarget.hasBWI())
32177 return splitVectorIntUnary(Op, DAG, DL);
32178
32179 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32180 if (VT.getScalarType() != MVT::i8) {
32181 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32182 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32183 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32184 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32185 }
32186
32187 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32188 if (!Subtarget.hasSSSE3())
32189 return SDValue();
32190
32191 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32192}
32193
32194static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32195 SelectionDAG &DAG) {
32196 MVT VT = N.getSimpleValueType();
32197 SDValue Op = N.getOperand(0);
32198 SDLoc DL(N);
32199
32200 if (VT.isScalarInteger()) {
32201 // Compute the lower/upper bounds of the active bits of the value,
32202 // allowing us to shift the active bits down if necessary to fit into the
32203 // special cases below.
32204 KnownBits Known = DAG.computeKnownBits(Op);
32205 if (Known.isConstant())
32206 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32207 unsigned LZ = Known.countMinLeadingZeros();
32208 unsigned TZ = Known.countMinTrailingZeros();
32209 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32210 unsigned ActiveBits = Known.getBitWidth() - LZ;
32211 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32212
32213 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32214 if (ShiftedActiveBits <= 2) {
32215 if (ActiveBits > 2)
32216 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32217 DAG.getShiftAmountConstant(TZ, VT, DL));
32218 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32219 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32220 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32221 DAG.getShiftAmountConstant(1, VT, DL)));
32222 return DAG.getZExtOrTrunc(Op, DL, VT);
32223 }
32224
32225 // i3 CTPOP - perform LUT into i32 integer.
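// The 16-bit constant below packs popcount(x) for x = 0..7 into bits
// [2*x+1 : 2*x]; e.g. for x = 5 (0b101) the computed shift amount is 10 and
// (0b1110100110010100 >> 10) & 0x3 == 2 == popcount(5).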
32226 if (ShiftedActiveBits <= 3) {
32227 if (ActiveBits > 3)
32228 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32229 DAG.getShiftAmountConstant(TZ, VT, DL));
32230 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32231 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32232 DAG.getShiftAmountConstant(1, VT, DL));
32233 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32234 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32235 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32236 DAG.getConstant(0x3, DL, MVT::i32));
32237 return DAG.getZExtOrTrunc(Op, DL, VT);
32238 }
32239
32240 // i4 CTPOP - perform LUT into i64 integer.
32241 if (ShiftedActiveBits <= 4 &&
32242 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32243 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32244 if (ActiveBits > 4)
32245 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32246 DAG.getShiftAmountConstant(TZ, VT, DL));
32247 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32248 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32249 DAG.getConstant(4, DL, MVT::i32));
32250 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32251 DAG.getShiftAmountOperand(MVT::i64, Op));
32252 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32253 DAG.getConstant(0x7, DL, MVT::i64));
32254 return DAG.getZExtOrTrunc(Op, DL, VT);
32255 }
32256
32257 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
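// Roughly: multiplying x by 0x08040201 places copies of x at bit offsets 0,
// 9, 18 and 27; after the shift right by 3 and the 0x11111111 mask, each of
// the eight input bits sits alone in its own nibble, so the second multiply by
// 0x11111111 sums all of them into the top nibble (at most 8, so no nibble
// overflow), which the final shift right by 28 extracts.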
32258 if (ShiftedActiveBits <= 8) {
32259 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32260 if (ActiveBits > 8)
32261 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32262 DAG.getShiftAmountConstant(TZ, VT, DL));
32263 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32264 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32265 DAG.getConstant(0x08040201U, DL, MVT::i32));
32266 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32267 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32268 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32269 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32270 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32271 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32272 return DAG.getZExtOrTrunc(Op, DL, VT);
32273 }
32274
32275 return SDValue(); // fallback to generic expansion.
32276 }
32277
32278 assert(VT.isVector() &&
32279 "We only do custom lowering for vector population count.");
32280 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32281}
32282
32284 MVT VT = Op.getSimpleValueType();
32285 SDValue In = Op.getOperand(0);
32286 SDLoc DL(Op);
32287
32288 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32289 // perform the BITREVERSE.
32290 if (!VT.isVector()) {
32291 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32292 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32293 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32294 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32295 DAG.getVectorIdxConstant(0, DL));
32296 }
32297
32298 int NumElts = VT.getVectorNumElements();
32299 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32300
32301 // Decompose 256-bit ops into smaller 128-bit ops.
32302 if (VT.is256BitVector())
32303 return splitVectorIntUnary(Op, DAG, DL);
32304
32305 assert(VT.is128BitVector() &&
32306 "Only 128-bit vector bitreverse lowering supported.");
32307
32308 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32309 // perform the BSWAP in the shuffle.
32310 // It's best to shuffle using the second operand as this will implicitly allow
32311 // memory folding for multiple vectors.
32312 SmallVector<SDValue, 16> MaskElts;
32313 for (int i = 0; i != NumElts; ++i) {
32314 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32315 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32316 int PermuteByte = SourceByte | (2 << 5);
32317 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32318 }
32319 }
32320
32321 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32322 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32323 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32324 Res, Mask);
32325 return DAG.getBitcast(VT, Res);
32326}
32327
32329 SelectionDAG &DAG) {
32330 MVT VT = Op.getSimpleValueType();
32331
32332 if (Subtarget.hasXOP() && !VT.is512BitVector())
32333 return LowerBITREVERSE_XOP(Op, DAG);
32334
32335 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
32336
32337 SDValue In = Op.getOperand(0);
32338 SDLoc DL(Op);
32339
32340 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32341 if (VT.is512BitVector() && !Subtarget.hasBWI())
32342 return splitVectorIntUnary(Op, DAG, DL);
32343
32344 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32345 if (VT.is256BitVector() && !Subtarget.hasInt256())
32346 return splitVectorIntUnary(Op, DAG, DL);
32347
32348 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32349 if (!VT.isVector()) {
32350 assert(
32351 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32352 "Only tested for i8/i16/i32/i64");
32353 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32354 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32355 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32356 DAG.getBitcast(MVT::v16i8, Res));
32357 Res =
32358 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32359 DAG.getVectorIdxConstant(0, DL));
32360 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32361 }
32362
32363 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32364
32365 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32366 if (VT.getScalarType() != MVT::i8) {
32367 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32368 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32369 Res = DAG.getBitcast(ByteVT, Res);
32370 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32371 return DAG.getBitcast(VT, Res);
32372 }
32373 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32374 "Only byte vector BITREVERSE supported");
32375
32376 unsigned NumElts = VT.getVectorNumElements();
32377
32378 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32379 if (Subtarget.hasGFNI()) {
32381 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32382 DAG.getTargetConstant(0, DL, MVT::i8));
32383 }
32384
32385 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32386 // two nibbles, and a PSHUFB lookup is used to find the bitreverse of each
32387 // 0-15 value (moved to the other nibble).
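// For example, the byte 0x2C (0b00101100) reverses to 0x34 (0b00110100):
// LoLUT[0xC] = 0x30 supplies the reversed low nibble in the high position,
// HiLUT[0x2] = 0x04 supplies the reversed high nibble in the low position, and
// the final OR combines them.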
32388 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32389 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32390 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32391
32392 const int LoLUT[16] = {
32393 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32394 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32395 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32396 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32397 const int HiLUT[16] = {
32398 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32399 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32400 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32401 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32402
32403 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32404 for (unsigned i = 0; i < NumElts; ++i) {
32405 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32406 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32407 }
32408
32409 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32410 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32411 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32412 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32413 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32414}
32415
32416static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32417 SelectionDAG &DAG) {
32418 SDLoc DL(Op);
32419 SDValue X = Op.getOperand(0);
32420 MVT VT = Op.getSimpleValueType();
32421
32422 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32423 if (VT == MVT::i8 ||
32424 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32425 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32426 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32427 DAG.getConstant(0, DL, MVT::i8));
32428 // Copy the inverse of the parity flag into a register with setcc.
32429 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32430 // Extend to the original type.
32431 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32432 }
32433
32434 // If we have POPCNT, use the default expansion.
32435 if (Subtarget.hasPOPCNT())
32436 return SDValue();
32437
32438 if (VT == MVT::i64) {
32439 // Xor the high and low 32-bit halves together using a 32-bit operation.
32440 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32441 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32442 DAG.getConstant(32, DL, MVT::i8)));
32443 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32444 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32445 }
32446
32447 if (VT != MVT::i16) {
32448 // Xor the high and low 16-bits together using a 32-bit operation.
32449 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32450 DAG.getConstant(16, DL, MVT::i8));
32451 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32452 } else {
32453 // If the input is 16-bits, we need to extend to use an i32 shift below.
32454 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32455 }
32456
32457 // Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
32458 // This should allow an h-reg to be used to save a shift.
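// Note that PF only reflects the low 8 bits of a result, which is why the
// value is folded down to a single byte first; SETNP then yields 1 exactly
// when that byte (and hence the original value) has an odd number of set bits.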
32459 SDValue Hi = DAG.getNode(
32460 ISD::TRUNCATE, DL, MVT::i8,
32461 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32462 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32463 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32464 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32465
32466 // Copy the inverse of the parity flag into a register with setcc.
32467 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32468 // Extend to the original type.
32469 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32470}
32471
32473 const X86Subtarget &Subtarget) {
32474 unsigned NewOpc = 0;
32475 switch (N->getOpcode()) {
32476 case ISD::ATOMIC_LOAD_ADD:
32477 NewOpc = X86ISD::LADD;
32478 break;
32479 case ISD::ATOMIC_LOAD_SUB:
32480 NewOpc = X86ISD::LSUB;
32481 break;
32482 case ISD::ATOMIC_LOAD_OR:
32483 NewOpc = X86ISD::LOR;
32484 break;
32485 case ISD::ATOMIC_LOAD_XOR:
32486 NewOpc = X86ISD::LXOR;
32487 break;
32488 case ISD::ATOMIC_LOAD_AND:
32489 NewOpc = X86ISD::LAND;
32490 break;
32491 default:
32492 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32493 }
32494
32495 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32496
32497 return DAG.getMemIntrinsicNode(
32498 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32499 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32500 /*MemVT=*/N->getSimpleValueType(0), MMO);
32501}
32502
32503/// Lower atomic_load_ops into LOCK-prefixed operations.
32505 const X86Subtarget &Subtarget) {
32506 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32507 SDValue Chain = N->getOperand(0);
32508 SDValue LHS = N->getOperand(1);
32509 SDValue RHS = N->getOperand(2);
32510 unsigned Opc = N->getOpcode();
32511 MVT VT = N->getSimpleValueType(0);
32512 SDLoc DL(N);
32513
32514 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32515 // can only be lowered when the result is unused. They should have already
32516 // been transformed into a cmpxchg loop in AtomicExpand.
32517 if (N->hasAnyUseOfValue(0)) {
32518 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32519 // select LXADD if LOCK_SUB can't be selected.
32520 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32521 // can use LXADD as opposed to cmpxchg.
32522 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32523 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32524 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32525 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32526
32528 "Used AtomicRMW ops other than Add should have been expanded!");
32529 return N;
32530 }
32531
32532 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32533 // The core idea here is that since the memory location isn't actually
32534 // changing, all we need is a lowering for the *ordering* impacts of the
32535 // atomicrmw. As such, we can choose a different operation and memory
32536 // location to minimize impact on other code.
32537 // The above holds unless the node is marked volatile in which
32538 // case it needs to be preserved according to the langref.
32539 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32540 // On X86, the only ordering which actually requires an instruction is
32541 // seq_cst which isn't SingleThread; everything else just needs to be
32542 // preserved during codegen and then dropped. Note that we expect (but don't
32543 // assume) that orderings other than seq_cst and acq_rel have been
32544 // canonicalized to a store or load.
32547 // Prefer a locked operation against a stack location to minimize cache
32548 // traffic. This assumes that stack locations are very likely to be
32549 // accessed only by the owning thread.
32550 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32551 assert(!N->hasAnyUseOfValue(0));
32552 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32553 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32554 DAG.getUNDEF(VT), NewChain);
32555 }
32556 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32557 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32558 assert(!N->hasAnyUseOfValue(0));
32559 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32560 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32561 DAG.getUNDEF(VT), NewChain);
32562 }
32563
32564 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32565 // RAUW the chain, but don't worry about the result, as it's unused.
32566 assert(!N->hasAnyUseOfValue(0));
32567 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32568 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32569 DAG.getUNDEF(VT), LockOp.getValue(1));
32570}
32571
32573 const X86Subtarget &Subtarget) {
32574 auto *Node = cast<AtomicSDNode>(Op.getNode());
32575 SDLoc dl(Node);
32576 EVT VT = Node->getMemoryVT();
32577
32578 bool IsSeqCst =
32579 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32580 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32581
32582 // If this store is not sequentially consistent and the type is legal
32583 // we can just keep it.
32584 if (!IsSeqCst && IsTypeLegal)
32585 return Op;
32586
32587 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32589 Attribute::NoImplicitFloat)) {
32590 SDValue Chain;
32591 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32592 // vector store.
32593 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
32594 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32595 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
32596 Node->getMemOperand());
32597 }
32598
32599 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32600 // is enabled.
32601 if (VT == MVT::i64) {
32602 if (Subtarget.hasSSE1()) {
32603 SDValue SclToVec =
32604 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
32605 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32606 SclToVec = DAG.getBitcast(StVT, SclToVec);
32607 SDVTList Tys = DAG.getVTList(MVT::Other);
32608 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
32609 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
32610 MVT::i64, Node->getMemOperand());
32611 } else if (Subtarget.hasX87()) {
32612 // First load this into an 80-bit X87 register using a stack temporary.
32613 // This will put the whole integer into the significand.
32614 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32615 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32616 MachinePointerInfo MPI =
32618 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
32620 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32621 SDValue LdOps[] = {Chain, StackPtr};
32623 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
32624 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
32625 Chain = Value.getValue(1);
32626
32627 // Now use an FIST to do the atomic store.
32628 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
32629 Chain =
32630 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
32631 StoreOps, MVT::i64, Node->getMemOperand());
32632 }
32633 }
32634
32635 if (Chain) {
32636 // If this is a sequentially consistent store, also emit an appropriate
32637 // barrier.
32638 if (IsSeqCst)
32639 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
32640
32641 return Chain;
32642 }
32643 }
32644
32645 // Convert seq_cst store -> xchg
32646 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
32647 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
32648 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
32649 Node->getOperand(0), Node->getOperand(2),
32650 Node->getOperand(1), Node->getMemOperand());
32651 return Swap.getValue(1);
32652}
32653
32655 SDNode *N = Op.getNode();
32656 MVT VT = N->getSimpleValueType(0);
32657 unsigned Opc = Op.getOpcode();
32658
32659 // Let legalize expand this if it isn't a legal type yet.
32660 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32661 return SDValue();
32662
32663 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
32664 SDLoc DL(N);
32665
32666 // Set the carry flag.
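// Adding all-ones to the incoming carry value sets the hardware carry flag
// exactly when that value is non-zero, which is what the ADC/SBB below
// consumes.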
32667 SDValue Carry = Op.getOperand(2);
32668 EVT CarryVT = Carry.getValueType();
32669 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
32670 Carry, DAG.getAllOnesConstant(DL, CarryVT));
32671
32672 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
32673 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
32674 Op.getOperand(0), Op.getOperand(1),
32675 Carry.getValue(1));
32676
32677 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
32678 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
32679 Sum.getValue(1), DL, DAG);
32680 if (N->getValueType(1) == MVT::i1)
32681 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
32682
32683 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
32684}
32685
32686static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
32687 SelectionDAG &DAG) {
32688 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
32689
32690 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
32691 // which returns the values as { float, float } (in XMM0) or
32692 // { double, double } (which is returned in XMM0, XMM1).
32693 SDLoc dl(Op);
32694 SDValue Arg = Op.getOperand(0);
32695 EVT ArgVT = Arg.getValueType();
32696 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
32697
32700
32701 Entry.Node = Arg;
32702 Entry.Ty = ArgTy;
32703 Entry.IsSExt = false;
32704 Entry.IsZExt = false;
32705 Args.push_back(Entry);
32706
32707 bool isF64 = ArgVT == MVT::f64;
32708 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
32709 // the small struct {f32, f32} is returned in (eax, edx). For f64,
32710 // the results are returned via SRet in memory.
32711 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32712 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
32713 const char *LibcallName = TLI.getLibcallName(LC);
32714 SDValue Callee =
32715 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
32716
32717 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
32718 : (Type *)FixedVectorType::get(ArgTy, 4);
32719
32721 CLI.setDebugLoc(dl)
32722 .setChain(DAG.getEntryNode())
32723 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
32724
32725 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
32726
32727 if (isF64)
32728 // Returned in xmm0 and xmm1.
32729 return CallResult.first;
32730
32731 // Returned in bits 0:31 and 32:64 xmm0.
32732 SDValue SinVal =
32733 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
32734 DAG.getVectorIdxConstant(0, dl));
32735 SDValue CosVal =
32736 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
32737 DAG.getVectorIdxConstant(1, dl));
32738 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
32739 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
32740}
32741
32742/// Widen a vector input to a vector of NVT. The
32743/// input vector must have the same element type as NVT.
32745 bool FillWithZeroes = false) {
32746 // Check if InOp already has the right width.
32747 MVT InVT = InOp.getSimpleValueType();
32748 if (InVT == NVT)
32749 return InOp;
32750
32751 if (InOp.isUndef())
32752 return DAG.getUNDEF(NVT);
32753
32755 "input and widen element type must match");
32756
32757 unsigned InNumElts = InVT.getVectorNumElements();
32758 unsigned WidenNumElts = NVT.getVectorNumElements();
32759 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
32760 "Unexpected request for vector widening");
32761
32762 SDLoc dl(InOp);
32763 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
32764 SDValue N1 = InOp.getOperand(1);
32765 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
32766 N1.isUndef()) {
32767 InOp = InOp.getOperand(0);
32768 InVT = InOp.getSimpleValueType();
32769 InNumElts = InVT.getVectorNumElements();
32770 }
32771 }
32774 EVT EltVT = InOp.getOperand(0).getValueType();
32775 SDValue FillVal =
32776 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
32777 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
32778 Ops.append(WidenNumElts - InNumElts, FillVal);
32779 return DAG.getBuildVector(NVT, dl, Ops);
32780 }
32781 SDValue FillVal =
32782 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
32783 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
32784 DAG.getVectorIdxConstant(0, dl));
32785}
32786
32788 SelectionDAG &DAG) {
32789 assert(Subtarget.hasAVX512() &&
32790 "MGATHER/MSCATTER are supported on AVX-512 arch only");
32791
32792 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
32793 SDValue Src = N->getValue();
32794 MVT VT = Src.getSimpleValueType();
32795 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
32796 SDLoc dl(Op);
32797
32798 SDValue Scale = N->getScale();
32799 SDValue Index = N->getIndex();
32800 SDValue Mask = N->getMask();
32801 SDValue Chain = N->getChain();
32802 SDValue BasePtr = N->getBasePtr();
32803
32804 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
32805 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32806 // If the index is v2i64 and we have VLX we can use xmm for data and index.
32807 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
32808 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32809 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
32810 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
32811 SDVTList VTs = DAG.getVTList(MVT::Other);
32812 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32813 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32814 N->getMemoryVT(), N->getMemOperand());
32815 }
32816 return SDValue();
32817 }
32818
32819 MVT IndexVT = Index.getSimpleValueType();
32820
32821 // If the index is v2i32, we're being called by type legalization and we
32822 // should just let the default handling take care of it.
32823 if (IndexVT == MVT::v2i32)
32824 return SDValue();
32825
32826 // If we don't have VLX and neither the source data nor the index is
32827 // 512 bits, we need to widen until one is.
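// For example (illustrative sizes): a v8i32 scatter with a v8i32 index and no
// VLX gives Factor = min(512/256, 512/256) = 2, so the data and index are
// widened to v16i32 and the mask to v16i1, with the extra mask lanes zeroed.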
32828 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
32829 !Index.getSimpleValueType().is512BitVector()) {
32830 // Determine how much we need to widen by to get a 512-bit type.
32831 unsigned Factor = std::min(512/VT.getSizeInBits(),
32832 512/IndexVT.getSizeInBits());
32833 unsigned NumElts = VT.getVectorNumElements() * Factor;
32834
32835 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32836 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32837 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32838
32839 Src = ExtendToType(Src, VT, DAG);
32840 Index = ExtendToType(Index, IndexVT, DAG);
32841 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32842 }
32843
32844 SDVTList VTs = DAG.getVTList(MVT::Other);
32845 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32846 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32847 N->getMemoryVT(), N->getMemOperand());
32848}
32849
32850static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
32851 SelectionDAG &DAG) {
32852
32853 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
32854 MVT VT = Op.getSimpleValueType();
32855 MVT ScalarVT = VT.getScalarType();
32856 SDValue Mask = N->getMask();
32857 MVT MaskVT = Mask.getSimpleValueType();
32858 SDValue PassThru = N->getPassThru();
32859 SDLoc dl(Op);
32860
32861 // Handle AVX masked loads which don't support passthru other than 0.
32862 if (MaskVT.getVectorElementType() != MVT::i1) {
32863 // We also allow undef in the isel pattern.
32864 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
32865 return Op;
32866
32867 SDValue NewLoad = DAG.getMaskedLoad(
32868 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32869 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32870 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32871 N->isExpandingLoad());
32872 // Emit a blend.
32873 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
32874 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
32875 }
32876
32877 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32878 "Expanding masked load is supported on AVX-512 target only!");
32879
32880 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32881 "Expanding masked load is supported for 32 and 64-bit types only!");
32882
32883 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32884 "Cannot lower masked load op.");
32885
32886 assert((ScalarVT.getSizeInBits() >= 32 ||
32887 (Subtarget.hasBWI() &&
32888 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32889 "Unsupported masked load op.");
32890
32891 // This operation is legal for targets with VLX, but without
32892 // VLX the vector should be widened to 512 bits.
32893 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
32894 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32895 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
32896
32897 // Mask element has to be i1.
32898 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32899 "Unexpected mask type");
32900
32901 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32902
32903 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32904 SDValue NewLoad = DAG.getMaskedLoad(
32905 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32906 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32907 N->getExtensionType(), N->isExpandingLoad());
32908
32909 SDValue Extract =
32910 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
32911 DAG.getVectorIdxConstant(0, dl));
32912 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
32913 return DAG.getMergeValues(RetOps, dl);
32914}
32915
32916static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
32917 SelectionDAG &DAG) {
32918 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
32919 SDValue DataToStore = N->getValue();
32920 MVT VT = DataToStore.getSimpleValueType();
32921 MVT ScalarVT = VT.getScalarType();
32922 SDValue Mask = N->getMask();
32923 SDLoc dl(Op);
32924
32925 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32926 "Expanding masked load is supported on AVX-512 target only!");
32927
32928 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32929 "Expanding masked load is supported for 32 and 64-bit types only!");
32930
32931 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32932 "Cannot lower masked store op.");
32933
32934 assert((ScalarVT.getSizeInBits() >= 32 ||
32935 (Subtarget.hasBWI() &&
32936 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32937 "Unsupported masked store op.");
32938
32939 // This operation is legal for targets with VLX, but without
32940 // VLX the vector should be widened to 512 bits.
32941 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
32942 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32943
32944 // Mask element has to be i1.
32945 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32946 "Unexpected mask type");
32947
32948 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32949
32950 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
32951 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32952 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32953 N->getOffset(), Mask, N->getMemoryVT(),
32954 N->getMemOperand(), N->getAddressingMode(),
32955 N->isTruncatingStore(), N->isCompressingStore());
32956}
32957
32958static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
32959 SelectionDAG &DAG) {
32960 assert(Subtarget.hasAVX2() &&
32961 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32962
32963 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
32964 SDLoc dl(Op);
32965 MVT VT = Op.getSimpleValueType();
32966 SDValue Index = N->getIndex();
32967 SDValue Mask = N->getMask();
32968 SDValue PassThru = N->getPassThru();
32969 MVT IndexVT = Index.getSimpleValueType();
32970
32971 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
32972
32973 // If the index is v2i32, we're being called by type legalization.
32974 if (IndexVT == MVT::v2i32)
32975 return SDValue();
32976
32977 // If we don't have VLX and neither the passthru nor the index is 512 bits,
32978 // we need to widen until one is.
32979 MVT OrigVT = VT;
32980 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32981 !IndexVT.is512BitVector()) {
32982 // Determine how much we need to widen by to get a 512-bit type.
32983 unsigned Factor = std::min(512/VT.getSizeInBits(),
32984 512/IndexVT.getSizeInBits());
32985
32986 unsigned NumElts = VT.getVectorNumElements() * Factor;
32987
32988 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32989 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32990 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32991
32992 PassThru = ExtendToType(PassThru, VT, DAG);
32993 Index = ExtendToType(Index, IndexVT, DAG);
32994 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32995 }
32996
32997 // Break dependency on the data register.
32998 if (PassThru.isUndef())
32999 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33000
33001 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33002 N->getScale() };
33003 SDValue NewGather = DAG.getMemIntrinsicNode(
33004 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33005 N->getMemOperand());
33006 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33007 DAG.getVectorIdxConstant(0, dl));
33008 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33009}
33010
33012 SDLoc dl(Op);
33013 SDValue Src = Op.getOperand(0);
33014 MVT DstVT = Op.getSimpleValueType();
33015
33016 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33017 unsigned SrcAS = N->getSrcAddressSpace();
33018
33019 assert(SrcAS != N->getDestAddressSpace() &&
33020 "addrspacecast must be between different address spaces");
33021
33022 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33023 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33024 } else if (DstVT == MVT::i64) {
33025 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33026 } else if (DstVT == MVT::i32) {
33027 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33028 } else {
33029 report_fatal_error("Bad address space in addrspacecast");
33030 }
33031 return Op;
33032}
33033
33034SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33035 SelectionDAG &DAG) const {
33036 // TODO: Eventually, the lowering of these nodes should be informed by or
33037 // deferred to the GC strategy for the function in which they appear. For
33038 // now, however, they must be lowered to something. Since they are logically
33039 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33040 // require special handling for these nodes), lower them as literal NOOPs for
33041 // the time being.
33043 Ops.push_back(Op.getOperand(0));
33044 if (Op->getGluedNode())
33045 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33046
33047 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33048 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33049}
33050
33051// Custom split CVTPS2PH with wide types.
33053 SDLoc dl(Op);
33054 EVT VT = Op.getValueType();
33055 SDValue Lo, Hi;
33056 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33057 EVT LoVT, HiVT;
33058 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33059 SDValue RC = Op.getOperand(1);
33060 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33061 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33062 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33063}
33064
33066 SelectionDAG &DAG) {
33067 unsigned IsData = Op.getConstantOperandVal(4);
33068
33069 // We don't support non-data prefetch without PREFETCHI.
33070 // Just preserve the chain.
33071 if (!IsData && !Subtarget.hasPREFETCHI())
33072 return Op.getOperand(0);
33073
33074 return Op;
33075}
33076
33078 SDNode *N = Op.getNode();
33079 SDValue Operand = N->getOperand(0);
33080 EVT VT = Operand.getValueType();
33081 SDLoc dl(N);
33082
33083 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33084
33085 // TODO: Fix crash for bf16 when generating strict_fmul, as it
33086 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33087 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33088 // promote this operator's result!
33089 SDValue Chain = DAG.getEntryNode();
33090 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33091 {Chain, Operand, One});
33092 return StrictFmul;
33093}
33094
33096 unsigned OpNo) {
33097 const APInt Operand(32, OpNo);
33098 std::string OpNoStr = llvm::toString(Operand, 10, false);
33099 std::string Str(" $");
33100
33101 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33102 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33103
33104 auto I = StringRef::npos;
33105 for (auto &AsmStr : AsmStrs) {
33106 // Match the OpNo string. We should match exactly to avoid matching a
33107 // sub-string, e.g. "$12" contains "$1".
33108 if (AsmStr.ends_with(OpNoStr1))
33109 I = AsmStr.size() - OpNoStr1.size();
33110
33111 // Get the index of operand in AsmStr.
33112 if (I == StringRef::npos)
33113 I = AsmStr.find(OpNoStr1 + ",");
33114 if (I == StringRef::npos)
33115 I = AsmStr.find(OpNoStr2);
33116
33117 if (I == StringRef::npos)
33118 continue;
33119
33120 assert(I > 0 && "Unexpected inline asm string!");
33121 // Remove the operand string and label (if it exists).
33122 // For example:
33123 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33124 // ==>
33125 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33126 // ==>
33127 // "call dword ptr "
33128 auto TmpStr = AsmStr.substr(0, I);
33129 I = TmpStr.rfind(':');
33130 if (I != StringRef::npos)
33131 TmpStr = TmpStr.substr(I + 1);
33132 return TmpStr.take_while(llvm::isAlpha);
33133 }
33134
33135 return StringRef();
33136}
33137
33139 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33140 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33141 // changed from indirect TargetLowering::C_Memory to direct
33142 // TargetLowering::C_Address.
33143 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33144 // location.
33145 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33146 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33147}
33148
33150 SDValue Mask) {
33151 EVT Ty = MVT::i8;
33152 auto V = DAG.getBitcast(MVT::i1, Mask);
33153 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33154 auto Zero = DAG.getConstant(0, DL, Ty);
33155 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33156 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33157 return SDValue(CmpZero.getNode(), 1);
33158}
33159
33161 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33162 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33163 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33164 // ->
33165 // _, flags = SUB 0, mask
33166 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33167 // bit_cast_to_vector<res>
33168 EVT VTy = PassThru.getValueType();
33169 EVT Ty = VTy.getVectorElementType();
33170 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33171 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33172 : DAG.getBitcast(Ty, PassThru);
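  // With the flags from "SUB 0, mask", COND_NE is true exactly when the i1
  // mask is set, so the CLOAD only performs the load for an active mask.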
33173 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33174 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33175 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33176 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33177 return DAG.getBitcast(VTy, NewLoad);
33178}
33179
33180SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33181 SDValue Chain,
33182 MachineMemOperand *MMO, SDValue Ptr,
33183 SDValue Val, SDValue Mask) const {
33184 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33185 // ->
33186 // _, flags = SUB 0, mask
33187 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33188 EVT Ty = Val.getValueType().getVectorElementType();
33189 SDVTList Tys = DAG.getVTList(MVT::Other);
33190 auto ScalarVal = DAG.getBitcast(Ty, Val);
33191 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33192 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33193 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33194 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33195}
33196
33197/// Provide custom lowering hooks for some operations.
33198SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33199 switch (Op.getOpcode()) {
33200 // clang-format off
33201 default: llvm_unreachable("Should not custom lower this!");
33202 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33204 return LowerCMP_SWAP(Op, Subtarget, DAG);
33205 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33210 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33211 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33212 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33213 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33214 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33215 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33216 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33217 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33218 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33219 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33220 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33221 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33222 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33223 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33224 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33225 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33226 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33227 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33228 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33229 case ISD::SHL_PARTS:
33230 case ISD::SRA_PARTS:
33231 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33232 case ISD::FSHL:
33233 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33234 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33236 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33238 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33239 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33240 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33241 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33242 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33245 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33246 case ISD::FP_TO_SINT:
33248 case ISD::FP_TO_UINT:
33249 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33251 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33252 case ISD::FP_EXTEND:
33253 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33254 case ISD::FP_ROUND:
33255 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33256 case ISD::FP16_TO_FP:
33257 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33258 case ISD::FP_TO_FP16:
33259 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33260 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33261 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33262 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33263 case ISD::FADD:
33264 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33265 case ISD::FROUND: return LowerFROUND(Op, DAG);
33266 case ISD::FABS:
33267 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33268 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33269 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33270 case ISD::LRINT:
33271 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33272 case ISD::SETCC:
33273 case ISD::STRICT_FSETCC:
33274 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33275 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33276 case ISD::SELECT: return LowerSELECT(Op, DAG);
33277 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33278 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33279 case ISD::VASTART: return LowerVASTART(Op, DAG);
33280 case ISD::VAARG: return LowerVAARG(Op, DAG);
33281 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33282 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33284 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33285 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33286 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33287 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33289 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33290 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33291 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33292 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33293 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33295 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33296 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33298 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33299 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33300 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33301 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33302 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33303 case ISD::CTLZ:
33304 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33305 case ISD::CTTZ:
33306 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33307 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33308 case ISD::MULHS:
33309 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33310 case ISD::ROTL:
33311 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33312 case ISD::SRA:
33313 case ISD::SRL:
33314 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33315 case ISD::SADDO:
33316 case ISD::UADDO:
33317 case ISD::SSUBO:
33318 case ISD::USUBO: return LowerXALUO(Op, DAG);
33319 case ISD::SMULO:
33320 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33321 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33322 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33323 case ISD::SADDO_CARRY:
33324 case ISD::SSUBO_CARRY:
33325 case ISD::UADDO_CARRY:
33326 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33327 case ISD::ADD:
33328 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33329 case ISD::UADDSAT:
33330 case ISD::SADDSAT:
33331 case ISD::USUBSAT:
33332 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33333 case ISD::SMAX:
33334 case ISD::SMIN:
33335 case ISD::UMAX:
33336 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33337 case ISD::FMINIMUM:
33338 case ISD::FMAXIMUM:
33339 case ISD::FMINIMUMNUM:
33340 case ISD::FMAXIMUMNUM:
33341 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33342 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33343 case ISD::ABDS:
33344 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33345 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33346 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33347 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33348 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33349 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33350 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33352 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33353 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33354 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33355 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33356 // clang-format on
33357 }
33358}
33359
33360/// Replace a node with an illegal result type with a new node built out of
33361/// custom code.
33362void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33363 SmallVectorImpl<SDValue> &Results,
33364 SelectionDAG &DAG) const {
33365 SDLoc dl(N);
33366 unsigned Opc = N->getOpcode();
33367 switch (Opc) {
33368 default:
33369#ifndef NDEBUG
33370 dbgs() << "ReplaceNodeResults: ";
33371 N->dump(&DAG);
33372#endif
33373 llvm_unreachable("Do not know how to custom type legalize this operation!");
33374 case X86ISD::CVTPH2PS: {
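    // Split the too-wide CVTPH2PS into two half-width conversions and
    // concatenate the results back into the original type.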
33375 EVT VT = N->getValueType(0);
33376 SDValue Lo, Hi;
33377 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33378 EVT LoVT, HiVT;
33379 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33380 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33381 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33382 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33383 Results.push_back(Res);
33384 return;
33385 }
33386 case X86ISD::STRICT_CVTPH2PS: {
33387 EVT VT = N->getValueType(0);
33388 SDValue Lo, Hi;
33389 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33390 EVT LoVT, HiVT;
33391 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33392 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33393 {N->getOperand(0), Lo});
33394 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33395 {N->getOperand(0), Hi});
33396 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33397 Lo.getValue(1), Hi.getValue(1));
33398 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33399 Results.push_back(Res);
33400 Results.push_back(Chain);
33401 return;
33402 }
33403 case X86ISD::CVTPS2PH:
33404 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33405 return;
33406 case ISD::CTPOP: {
33407 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33408 // If we have at most 32 active bits, then perform as i32 CTPOP.
33409 // TODO: Perform this in generic legalizer?
33410 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33411 unsigned LZ = Known.countMinLeadingZeros();
33412 unsigned TZ = Known.countMinTrailingZeros();
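    // If at most 32 bits can be nonzero, shift them down, count them with a
    // 32-bit CTPOP, and zero-extend the count back to i64.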
33413 if ((LZ + TZ) >= 32) {
33414 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33415 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33416 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33417 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33418 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33419 Results.push_back(Op);
33420 return;
33421 }
33422 // Use a v2i64 if possible.
33423 bool NoImplicitFloatOps =
33424 DAG.getMachineFunction().getFunction().hasFnAttribute(
33425 Attribute::NoImplicitFloat);
33426 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33427 SDValue Wide =
33428 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33429 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33430 // The bit count fits in 32 bits, so extract it as i32 and then zero
33431 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33432 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33433 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33434 DAG.getVectorIdxConstant(0, dl));
33435 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33436 Results.push_back(Wide);
33437 }
33438 return;
33439 }
33440 case ISD::MUL: {
33441 EVT VT = N->getValueType(0);
33443 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33444 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33445 // elements are needed.
33446 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33447 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33448 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33449 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33450 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33451 unsigned NumConcats = 16 / VT.getVectorNumElements();
33452 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33453 ConcatOps[0] = Res;
33454 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33455 Results.push_back(Res);
33456 return;
33457 }
33458 case ISD::SMULO:
33459 case ISD::UMULO: {
33460 EVT VT = N->getValueType(0);
33462 VT == MVT::v2i32 && "Unexpected VT!");
33463 bool IsSigned = Opc == ISD::SMULO;
33464 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
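    // Extend the v2i32 operands to v2i64 so the full 64-bit products are
    // available for both the low result and the overflow check.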
33465 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33466 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33467 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33468 // Extract the high 32 bits from each result using PSHUFD.
33469 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33470 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33471 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33472 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33473 DAG.getVectorIdxConstant(0, dl));
33474
33475 // Truncate the low bits of the result. This will become PSHUFD.
33476 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33477
33478 SDValue HiCmp;
33479 if (IsSigned) {
33480 // SMULO overflows if the high bits don't match the sign of the low.
33481 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33482 } else {
33483 // UMULO overflows if the high bits are non-zero.
33484 HiCmp = DAG.getConstant(0, dl, VT);
33485 }
33486 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33487
33488 // Widen the result by padding with undef.
33489 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33490 DAG.getUNDEF(VT));
33491 Results.push_back(Res);
33492 Results.push_back(Ovf);
33493 return;
33494 }
33495 case X86ISD::VPMADDWD: {
33496 // Legalize types for X86ISD::VPMADDWD by widening.
33497 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33498
33499 EVT VT = N->getValueType(0);
33500 EVT InVT = N->getOperand(0).getValueType();
33501 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33502 "Expected a VT that divides into 128 bits.");
33504 "Unexpected type action!");
33505 unsigned NumConcat = 128 / InVT.getSizeInBits();
33506
33507 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33508 InVT.getVectorElementType(),
33509 NumConcat * InVT.getVectorNumElements());
33510 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33512 NumConcat * VT.getVectorNumElements());
33513
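    // Only element 0 of each widened operand carries real data; the rest is
    // padded with undef so the node merely gets a legal width.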
33514 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33515 Ops[0] = N->getOperand(0);
33516 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33517 Ops[0] = N->getOperand(1);
33518 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33519
33520 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33521 Results.push_back(Res);
33522 return;
33523 }
33524 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33525 case X86ISD::FMINC:
33526 case X86ISD::FMIN:
33527 case X86ISD::FMAXC:
33528 case X86ISD::FMAX:
33529 case X86ISD::STRICT_FMIN:
33530 case X86ISD::STRICT_FMAX: {
33531 EVT VT = N->getValueType(0);
33532 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33533 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33534 SDValue UNDEF = DAG.getUNDEF(VT);
33535 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33536 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33537 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33538 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33539 SDValue Res;
33540 if (IsStrict)
33541 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33542 {N->getOperand(0), LHS, RHS});
33543 else
33544 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33545 Results.push_back(Res);
33546 if (IsStrict)
33547 Results.push_back(Res.getValue(1));
33548 return;
33549 }
33550 case ISD::SDIV:
33551 case ISD::UDIV:
33552 case ISD::SREM:
33553 case ISD::UREM: {
33554 EVT VT = N->getValueType(0);
33555 if (VT.isVector()) {
33557 "Unexpected type action!");
33558 // If this RHS is a constant splat vector we can widen this and let
33559 // division/remainder by constant optimize it.
33560 // TODO: Can we do something for non-splat?
33561 APInt SplatVal;
33562 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33563 unsigned NumConcats = 128 / VT.getSizeInBits();
33564 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33565 Ops0[0] = N->getOperand(0);
33566 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33567 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33568 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33569 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33570 Results.push_back(Res);
33571 }
33572 return;
33573 }
33574
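    // Non-vector results are only expected here for Win64 i128 division and
    // remainder, which the dedicated LowerWin64_i128OP helper handles.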
33575 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33576 Results.push_back(V);
33577 return;
33578 }
33579 case ISD::TRUNCATE: {
33580 MVT VT = N->getSimpleValueType(0);
33581 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33582 return;
33583
33584 // The generic legalizer will try to widen the input type to the same
33585 // number of elements as the widened result type. But this isn't always
33586 // the best approach, so do some custom legalization to avoid some cases.
33587 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33588 SDValue In = N->getOperand(0);
33589 EVT InVT = In.getValueType();
33590 EVT InEltVT = InVT.getVectorElementType();
33591 EVT EltVT = VT.getVectorElementType();
33592 unsigned MinElts = VT.getVectorNumElements();
33593 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33594 unsigned InBits = InVT.getSizeInBits();
33595
33596 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33597 unsigned PackOpcode;
33598 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33599 Subtarget, N->getFlags())) {
33600 if (SDValue Res =
33601 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
33602 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
33603 Results.push_back(Res);
33604 return;
33605 }
33606 }
33607
33608 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
33609 // 128-bit and smaller inputs should avoid the truncate altogether and
33610 // use a shuffle.
33611 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
33612 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
33613 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
33614 for (unsigned I = 0; I < MinElts; ++I)
33615 TruncMask[I] = Scale * I;
33616 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
33617 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
33618 "Illegal vector type in truncation");
33619 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
33620 Results.push_back(
33621 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
33622 return;
33623 }
33624 }
33625
33626 // With AVX512 there are some cases that can use a target specific
33627 // truncate node to go from 256/512 to less than 128 with zeros in the
33628 // upper elements of the 128 bit result.
33629 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
33630 // We can use VTRUNC directly for 256-bit inputs with VLX, or for any 512-bit input.
33631 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
33632 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33633 return;
33634 }
33635 // There's one case we can widen to 512 bits and use VTRUNC.
33636 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
33637 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
33638 DAG.getUNDEF(MVT::v4i64));
33639 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33640 return;
33641 }
33642 }
33643 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
33644 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
33645 isTypeLegal(MVT::v4i64)) {
33646 // The input needs to be split and the output needs to be widened. Use two
33647 // VTRUNCs, and shuffle their results together into the wider type.
33648 SDValue Lo, Hi;
33649 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
33650
33651 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
33652 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
33653 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
33654 { 0, 1, 2, 3, 16, 17, 18, 19,
33655 -1, -1, -1, -1, -1, -1, -1, -1 });
33656 Results.push_back(Res);
33657 return;
33658 }
33659
33660 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
33661 // this via type legalization.
33662 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
33663 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
33664 (!Subtarget.hasSSSE3() ||
33665 (!isTypeLegal(InVT) &&
33666 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
33667 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
33668 InEltVT.getSizeInBits() * WidenNumElts);
33669 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
33670 return;
33671 }
33672
33673 return;
33674 }
33675 case ISD::ANY_EXTEND:
33676 // Right now, only MVT::v8i8 has Custom action for an illegal type.
33677 // It's intended to custom handle the input type.
33678 assert(N->getValueType(0) == MVT::v8i8 &&
33679 "Do not know how to legalize this Node");
33680 return;
33681 case ISD::SIGN_EXTEND:
33682 case ISD::ZERO_EXTEND: {
33683 EVT VT = N->getValueType(0);
33684 SDValue In = N->getOperand(0);
33685 EVT InVT = In.getValueType();
33686 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
33687 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
33689 "Unexpected type action!");
33690 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
33691 // Custom split this so we can extend i8/i16->i32 in-vector. This is better
33692 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
33693 // sra, followed by an extend from i32 to i64 using pcmpgt. By custom
33694 // splitting we allow the sra from the extend to i32 to be shared by the split.
33695 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
33696
33697 // Fill a vector with sign bits for each element.
33698 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
33699 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
33700
33701 // Create an unpackl and unpackh to interleave the sign bits then bitcast
33702 // to v2i64.
33703 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33704 {0, 4, 1, 5});
33705 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
33706 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33707 {2, 6, 3, 7});
33708 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
33709
33710 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33711 Results.push_back(Res);
33712 return;
33713 }
33714
33715 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
33716 if (!InVT.is128BitVector()) {
33717 // Not a 128 bit vector, but maybe type legalization will promote
33718 // it to 128 bits.
33719 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
33720 return;
33721 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
33722 if (!InVT.is128BitVector())
33723 return;
33724
33725 // Promote the input to 128 bits. Type legalization will turn this into
33726 // zext_inreg/sext_inreg.
33727 In = DAG.getNode(Opc, dl, InVT, In);
33728 }
33729
33730 // Perform custom splitting instead of the two stage extend we would get
33731 // by default.
33732 EVT LoVT, HiVT;
33733 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
33734 assert(isTypeLegal(LoVT) && "Split VT not legal?");
33735
33736 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
33737
33738 // We need to shift the input over by half the number of elements.
33739 unsigned NumElts = InVT.getVectorNumElements();
33740 unsigned HalfNumElts = NumElts / 2;
33741 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
33742 for (unsigned i = 0; i != HalfNumElts; ++i)
33743 ShufMask[i] = i + HalfNumElts;
33744
33745 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
33746 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
33747
33748 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33749 Results.push_back(Res);
33750 }
33751 return;
33752 }
33753 case ISD::FP_TO_SINT_SAT:
33754 case ISD::FP_TO_UINT_SAT: {
33755 if (!Subtarget.hasAVX10_2())
33756 return;
33757
33758 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
33759 EVT VT = N->getValueType(0);
33760 SDValue Op = N->getOperand(0);
33761 EVT OpVT = Op.getValueType();
33762 SDValue Res;
33763
33764 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
33765 if (IsSigned)
33766 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
33767 else
33768 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
33769 Results.push_back(Res);
33770 }
33771 return;
33772 }
33773 case ISD::FP_TO_SINT:
33774 case ISD::STRICT_FP_TO_SINT:
33775 case ISD::FP_TO_UINT:
33776 case ISD::STRICT_FP_TO_UINT: {
33777 bool IsStrict = N->isStrictFPOpcode();
33778 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
33779 EVT VT = N->getValueType(0);
33780 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33781 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33782 EVT SrcVT = Src.getValueType();
33783
33784 SDValue Res;
33785 if (isSoftF16(SrcVT, Subtarget)) {
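      // Soft-promoted f16 sources are extended to f32 first and the
      // conversion is done from there (strictly chained when required).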
33786 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
33787 if (IsStrict) {
33788 Res =
33789 DAG.getNode(Opc, dl, {VT, MVT::Other},
33790 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
33791 {NVT, MVT::Other}, {Chain, Src})});
33792 Chain = Res.getValue(1);
33793 } else {
33794 Res =
33795 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
33796 }
33797 Results.push_back(Res);
33798 if (IsStrict)
33799 Results.push_back(Chain);
33800
33801 return;
33802 }
33803
33804 if (VT.isVector() && Subtarget.hasFP16() &&
33805 SrcVT.getVectorElementType() == MVT::f16) {
33806 EVT EleVT = VT.getVectorElementType();
33807 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
33808
33809 if (SrcVT != MVT::v8f16) {
33810 SDValue Tmp =
33811 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
33812 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
33813 Ops[0] = Src;
33814 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
33815 }
33816
33817 if (IsStrict) {
33818 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33819 Res =
33820 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
33821 Chain = Res.getValue(1);
33822 } else {
33823 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33824 Res = DAG.getNode(Opc, dl, ResVT, Src);
33825 }
33826
33827 // TODO: Need to add exception check code for strict FP.
33828 if (EleVT.getSizeInBits() < 16) {
33829 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
33830 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
33831
33832 // Now widen to 128 bits.
33833 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
33834 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
33835 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
33836 ConcatOps[0] = Res;
33837 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33838 }
33839
33840 Results.push_back(Res);
33841 if (IsStrict)
33842 Results.push_back(Chain);
33843
33844 return;
33845 }
33846
33847 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
33849 "Unexpected type action!");
33850
33851 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
33852 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
33853 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
33854 VT.getVectorNumElements());
33855 SDValue Res;
33856 SDValue Chain;
33857 if (IsStrict) {
33858 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
33859 {N->getOperand(0), Src});
33860 Chain = Res.getValue(1);
33861 } else
33862 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
33863
33864 // Preserve what we know about the size of the original result. If the
33865 // result is v2i32, we have to manually widen the assert.
33866 if (PromoteVT == MVT::v2i32)
33867 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33868 DAG.getUNDEF(MVT::v2i32));
33869
33870 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
33871 Res.getValueType(), Res,
33872 DAG.getValueType(VT.getVectorElementType()));
33873
33874 if (PromoteVT == MVT::v2i32)
33875 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
33876 DAG.getVectorIdxConstant(0, dl));
33877
33878 // Truncate back to the original width.
33879 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33880
33881 // Now widen to 128 bits.
33882 unsigned NumConcats = 128 / VT.getSizeInBits();
33884 VT.getVectorNumElements() * NumConcats);
33885 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33886 ConcatOps[0] = Res;
33887 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33888 Results.push_back(Res);
33889 if (IsStrict)
33890 Results.push_back(Chain);
33891 return;
33892 }
33893
33894
33895 if (VT == MVT::v2i32) {
33896 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
33897 "Strict unsigned conversion requires AVX512");
33898 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33900 "Unexpected type action!");
33901 if (Src.getValueType() == MVT::v2f64) {
33902 if (!IsSigned && !Subtarget.hasAVX512()) {
33903 SDValue Res =
33904 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
33905 Results.push_back(Res);
33906 return;
33907 }
33908
33909 if (IsStrict)
33910 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33911 else
33912 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33913
33914 // If we have VLX we can emit a target specific FP_TO_UINT node.
33915 if (!IsSigned && !Subtarget.hasVLX()) {
33916 // Otherwise we can defer to the generic legalizer which will widen
33917 // the input as well. This will be further widened during op
33918 // legalization to v8i32<-v8f64.
33919 // For strict nodes we'll need to widen ourselves.
33920 // FIXME: Fix the type legalizer to safely widen strict nodes?
33921 if (!IsStrict)
33922 return;
33923 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
33924 DAG.getConstantFP(0.0, dl, MVT::v2f64));
33925 Opc = N->getOpcode();
33926 }
33927 SDValue Res;
33928 SDValue Chain;
33929 if (IsStrict) {
33930 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33931 {N->getOperand(0), Src});
33932 Chain = Res.getValue(1);
33933 } else {
33934 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
33935 }
33936 Results.push_back(Res);
33937 if (IsStrict)
33938 Results.push_back(Chain);
33939 return;
33940 }
33941
33942 // Custom widen strict v2f32->v2i32 by padding with zeros.
33943 // FIXME: Should generic type legalizer do this?
33944 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
33945 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
33946 DAG.getConstantFP(0.0, dl, MVT::v2f32));
33947 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33948 {N->getOperand(0), Src});
33949 Results.push_back(Res);
33950 Results.push_back(Res.getValue(1));
33951 return;
33952 }
33953
33954 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
33955 // so early out here.
33956 return;
33957 }
33958
33959 assert(!VT.isVector() && "Vectors should have been handled above!");
33960
33961 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
33962 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
33963 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
33964 assert(!Subtarget.is64Bit() && "i64 should be legal");
33965 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
33966 // If we use a 128-bit result we might need to use a target specific node.
33967 unsigned SrcElts =
33968 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
33969 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
33970 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
33971 if (NumElts != SrcElts) {
33972 if (IsStrict)
33973 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33974 else
33975 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33976 }
33977
33978 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
33979 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
33980 DAG.getConstantFP(0.0, dl, VecInVT), Src,
33981 ZeroIdx);
33982 SDValue Chain;
33983 if (IsStrict) {
33984 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
33985 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
33986 Chain = Res.getValue(1);
33987 } else
33988 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
33989 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
33990 Results.push_back(Res);
33991 if (IsStrict)
33992 Results.push_back(Chain);
33993 return;
33994 }
33995
33996 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
33997 SDValue Chain;
33998 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
33999 Results.push_back(V);
34000 if (IsStrict)
34001 Results.push_back(Chain);
34002 return;
34003 }
34004
34005 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34006 Results.push_back(V);
34007 if (IsStrict)
34008 Results.push_back(Chain);
34009 }
34010 return;
34011 }
34012 case ISD::LRINT:
34013 case ISD::LLRINT: {
34014 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34015 Results.push_back(V);
34016 return;
34017 }
34018
34019 case ISD::SINT_TO_FP:
34020 case ISD::STRICT_SINT_TO_FP:
34021 case ISD::UINT_TO_FP:
34022 case ISD::STRICT_UINT_TO_FP: {
34023 bool IsStrict = N->isStrictFPOpcode();
34024 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34025 EVT VT = N->getValueType(0);
34026 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34027 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34028 Subtarget.hasVLX()) {
34029 if (Src.getValueType().getVectorElementType() == MVT::i16)
34030 return;
34031
34032 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34033 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34034 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34035 : DAG.getUNDEF(MVT::v2i32));
34036 if (IsStrict) {
34037 unsigned Opc =
34038 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34039 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34040 {N->getOperand(0), Src});
34041 Results.push_back(Res);
34042 Results.push_back(Res.getValue(1));
34043 } else {
34044 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34045 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34046 }
34047 return;
34048 }
34049 if (VT != MVT::v2f32)
34050 return;
34051 EVT SrcVT = Src.getValueType();
34052 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34053 if (IsStrict) {
34054 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34055 : X86ISD::STRICT_CVTUI2P;
34056 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34057 {N->getOperand(0), Src});
34058 Results.push_back(Res);
34059 Results.push_back(Res.getValue(1));
34060 } else {
34061 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34062 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34063 }
34064 return;
34065 }
34066 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34067 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
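      // Unsigned v2i64 -> v2f32 without AVX512: values with the sign bit set
      // are halved (keeping the low bit so rounding is preserved), converted
      // as signed, then doubled with an FADD; a per-element select picks
      // between the direct and the adjusted conversion.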
34068 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34069 SDValue One = DAG.getConstant(1, dl, SrcVT);
34070 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34071 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34072 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34073 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34074 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34075 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34076 for (int i = 0; i != 2; ++i) {
34077 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34078 SignSrc, DAG.getVectorIdxConstant(i, dl));
34079 if (IsStrict)
34080 SignCvts[i] =
34081 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34082 {N->getOperand(0), Elt});
34083 else
34084 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34085 };
34086 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34087 SDValue Slow, Chain;
34088 if (IsStrict) {
34089 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34090 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34091 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34092 {Chain, SignCvt, SignCvt});
34093 Chain = Slow.getValue(1);
34094 } else {
34095 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34096 }
34097 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34098 IsNeg =
34099 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34100 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34101 Results.push_back(Cvt);
34102 if (IsStrict)
34103 Results.push_back(Chain);
34104 return;
34105 }
34106
34107 if (SrcVT != MVT::v2i32)
34108 return;
34109
34110 if (IsSigned || Subtarget.hasAVX512()) {
34111 if (!IsStrict)
34112 return;
34113
34114 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34115 // FIXME: Should generic type legalizer do this?
34116 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34117 DAG.getConstant(0, dl, MVT::v2i32));
34118 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34119 {N->getOperand(0), Src});
34120 Results.push_back(Res);
34121 Results.push_back(Res.getValue(1));
34122 return;
34123 }
34124
34125 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
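      // Classic unsigned i32 -> double trick: OR the zero-extended value into
      // the mantissa of 2^52, subtract the bias, then round down to v4f32.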
34126 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34127 SDValue VBias = DAG.getConstantFP(
34128 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34129 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34130 DAG.getBitcast(MVT::v2i64, VBias));
34131 Or = DAG.getBitcast(MVT::v2f64, Or);
34132 if (IsStrict) {
34133 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34134 {N->getOperand(0), Or, VBias});
34135 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34136 {MVT::v4f32, MVT::Other},
34137 {Sub.getValue(1), Sub});
34138 Results.push_back(Res);
34139 Results.push_back(Res.getValue(1));
34140 } else {
34141 // TODO: Are there any fast-math-flags to propagate here?
34142 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34143 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34144 }
34145 return;
34146 }
34147 case ISD::STRICT_FP_ROUND:
34148 case ISD::FP_ROUND: {
34149 bool IsStrict = N->isStrictFPOpcode();
34150 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34151 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34152 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34153 EVT SrcVT = Src.getValueType();
34154 EVT VT = N->getValueType(0);
34155 SDValue V;
34156 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34157 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34158 : DAG.getUNDEF(MVT::v2f32);
34159 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34160 }
34161 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34162 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34163 if (SrcVT.getVectorElementType() != MVT::f32)
34164 return;
34165
34166 if (IsStrict)
34167 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34168 {Chain, Src, Rnd});
34169 else
34170 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34171
34172 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34173 if (IsStrict)
34174 Results.push_back(V.getValue(1));
34175 return;
34176 }
34177 if (!isTypeLegal(Src.getValueType()))
34178 return;
34179 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34180 if (IsStrict)
34181 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34182 {Chain, Src});
34183 else
34184 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34185 Results.push_back(V);
34186 if (IsStrict)
34187 Results.push_back(V.getValue(1));
34188 return;
34189 }
34190 case ISD::FP_EXTEND:
34191 case ISD::STRICT_FP_EXTEND: {
34192 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34193 // No other ValueType for FP_EXTEND should reach this point.
34194 assert(N->getValueType(0) == MVT::v2f32 &&
34195 "Do not know how to legalize this Node");
34196 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34197 return;
34198 bool IsStrict = N->isStrictFPOpcode();
34199 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34200 if (Src.getValueType().getVectorElementType() != MVT::f16)
34201 return;
34202 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34203 : DAG.getUNDEF(MVT::v2f16);
34204 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34205 if (IsStrict)
34206 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34207 {N->getOperand(0), V});
34208 else
34209 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34210 Results.push_back(V);
34211 if (IsStrict)
34212 Results.push_back(V.getValue(1));
34213 return;
34214 }
34215 case ISD::INTRINSIC_W_CHAIN: {
34216 unsigned IntNo = N->getConstantOperandVal(1);
34217 switch (IntNo) {
34218 default : llvm_unreachable("Do not know how to custom type "
34219 "legalize this intrinsic operation!");
34220 case Intrinsic::x86_rdtsc:
34221 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34222 Results);
34223 case Intrinsic::x86_rdtscp:
34224 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34225 Results);
34226 case Intrinsic::x86_rdpmc:
34227 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34228 Results);
34229 return;
34230 case Intrinsic::x86_rdpru:
34231 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34232 Results);
34233 return;
34234 case Intrinsic::x86_xgetbv:
34235 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34236 Results);
34237 return;
34238 }
34239 }
34240 case ISD::READCYCLECOUNTER: {
34241 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34242 }
34243 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34244 EVT T = N->getValueType(0);
34245 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34246 bool Regs64bit = T == MVT::i128;
34247 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34248 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34249 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34250 SDValue cpInL, cpInH;
34251 std::tie(cpInL, cpInH) =
34252 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34253 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34254 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34255 cpInH =
34256 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34257 cpInH, cpInL.getValue(1));
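    // CMPXCHG8B/16B convention: the expected value lives in EDX:EAX (RDX:RAX
    // for i128) and the replacement value in ECX:EBX (RCX:RBX).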
34258 SDValue swapInL, swapInH;
34259 std::tie(swapInL, swapInH) =
34260 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34261 swapInH =
34262 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34263 swapInH, cpInH.getValue(1));
34264
34265 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34266 // until later. So we keep the RBX input in a vreg and use a custom
34267 // inserter.
34268 // Since RBX will be a reserved register, the register allocator will not
34269 // make sure its value is properly saved and restored around this
34270 // live-range.
34271 SDValue Result;
34272 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34273 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34274 if (Regs64bit) {
34275 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34276 swapInH.getValue(1)};
34277 Result =
34278 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34279 } else {
34280 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34281 swapInH.getValue(1));
34282 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34283 swapInL.getValue(1)};
34284 Result =
34285 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34286 }
34287
34288 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34289 Regs64bit ? X86::RAX : X86::EAX,
34290 HalfT, Result.getValue(1));
34291 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34292 Regs64bit ? X86::RDX : X86::EDX,
34293 HalfT, cpOutL.getValue(2));
34294 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34295
34296 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34297 MVT::i32, cpOutH.getValue(2));
34298 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34299 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34300
34301 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34302 Results.push_back(Success);
34303 Results.push_back(EFLAGS.getValue(1));
34304 return;
34305 }
34306 case ISD::ATOMIC_LOAD: {
34307 assert(
34308 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34309 "Unexpected VT!");
34310 bool NoImplicitFloatOps =
34311 DAG.getMachineFunction().getFunction().hasFnAttribute(
34312 Attribute::NoImplicitFloat);
34313 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34314 auto *Node = cast<AtomicSDNode>(N);
34315
34316 if (N->getValueType(0) == MVT::i128) {
34317 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34318 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34319 Node->getBasePtr(), Node->getMemOperand());
34320 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34321 DAG.getVectorIdxConstant(0, dl));
34322 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34323 DAG.getVectorIdxConstant(1, dl));
34324 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34325 {ResL, ResH}));
34326 Results.push_back(Ld.getValue(1));
34327 return;
34328 }
34329 break;
34330 }
34331 if (Subtarget.hasSSE1()) {
34332 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34333 // Then extract the lower 64-bits.
34334 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34335 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34336 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34337 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34338 MVT::i64, Node->getMemOperand());
34339 if (Subtarget.hasSSE2()) {
34340 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34341 DAG.getVectorIdxConstant(0, dl));
34342 Results.push_back(Res);
34343 Results.push_back(Ld.getValue(1));
34344 return;
34345 }
34346 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34347 // then casts to i64. This avoids a 128-bit stack temporary being
34348 // created by type legalization if we were to cast v4f32->v2i64.
34349 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34350 DAG.getVectorIdxConstant(0, dl));
34351 Res = DAG.getBitcast(MVT::i64, Res);
34352 Results.push_back(Res);
34353 Results.push_back(Ld.getValue(1));
34354 return;
34355 }
34356 if (Subtarget.hasX87()) {
34357 // First load this into an 80-bit X87 register. This will put the whole
34358 // integer into the significand.
34359 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34360 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34361 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34362 dl, Tys, Ops, MVT::i64,
34363 Node->getMemOperand());
34364 SDValue Chain = Result.getValue(1);
34365
34366 // Now store the X87 register to a stack temporary and convert to i64.
34367 // This store is not atomic and doesn't need to be.
34368 // FIXME: We don't need a stack temporary if the result of the load
34369 // is already being stored. We could just directly store there.
34370 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34371 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34372 MachinePointerInfo MPI =
34373 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34374 SDValue StoreOps[] = { Chain, Result, StackPtr };
34375 Chain = DAG.getMemIntrinsicNode(
34376 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34377 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34378
34379 // Finally load the value back from the stack temporary and return it.
34380 // This load is not atomic and doesn't need to be.
34381 // This load will be further type legalized.
34382 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34383 Results.push_back(Result);
34384 Results.push_back(Result.getValue(1));
34385 return;
34386 }
34387 }
34388 // TODO: Use MOVLPS when SSE1 is available?
34389 // Delegate to generic TypeLegalization. Situations we can really handle
34390 // should have already been dealt with by AtomicExpandPass.cpp.
34391 break;
34392 }
34393 case ISD::ATOMIC_SWAP:
34404 // Delegate to generic TypeLegalization. Situations we can really handle
34405 // should have already been dealt with by AtomicExpandPass.cpp.
34406 break;
34407
34408 case ISD::BITCAST: {
34409 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34410 EVT DstVT = N->getValueType(0);
34411 EVT SrcVT = N->getOperand(0).getValueType();
34412
34413 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
34414 // we can split using the k-register rather than memory.
34415 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34416 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34417 SDValue Lo, Hi;
34418 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34419 Lo = DAG.getBitcast(MVT::i32, Lo);
34420 Hi = DAG.getBitcast(MVT::i32, Hi);
34421 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34422 Results.push_back(Res);
34423 return;
34424 }
34425
34426 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34427 // FIXME: Use v4f32 for SSE1?
34428 assert(Subtarget.hasSSE2() && "Requires SSE2");
34429 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34430 "Unexpected type action!");
34431 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34432 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34433 N->getOperand(0));
34434 Res = DAG.getBitcast(WideVT, Res);
34435 Results.push_back(Res);
34436 return;
34437 }
34438
34439 return;
34440 }
34441 case ISD::MGATHER: {
34442 EVT VT = N->getValueType(0);
34443 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34444 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34445 auto *Gather = cast<MaskedGatherSDNode>(N);
34446 SDValue Index = Gather->getIndex();
34447 if (Index.getValueType() != MVT::v2i64)
34448 return;
34450 "Unexpected type action!");
34451 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34452 SDValue Mask = Gather->getMask();
34453 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34454 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34455 Gather->getPassThru(),
34456 DAG.getUNDEF(VT));
34457 if (!Subtarget.hasVLX()) {
34458 // We need to widen the mask, but the instruction will only use 2
34459 // of its elements. So we can use undef.
34460 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34461 DAG.getUNDEF(MVT::v2i1));
34462 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34463 }
34464 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34465 Gather->getBasePtr(), Index, Gather->getScale() };
34466 SDValue Res = DAG.getMemIntrinsicNode(
34467 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34468 Gather->getMemoryVT(), Gather->getMemOperand());
34469 Results.push_back(Res);
34470 Results.push_back(Res.getValue(1));
34471 return;
34472 }
34473 return;
34474 }
34475 case ISD::LOAD: {
34476 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34477 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34478 // cast since type legalization will try to use an i64 load.
34479 MVT VT = N->getSimpleValueType(0);
34480 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34482 "Unexpected type action!");
34483 if (!ISD::isNON_EXTLoad(N))
34484 return;
34485 auto *Ld = cast<LoadSDNode>(N);
34486 if (Subtarget.hasSSE2()) {
34487 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34488 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34489 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34490 Ld->getMemOperand()->getFlags());
34491 SDValue Chain = Res.getValue(1);
34492 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34493 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34494 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34495 Res = DAG.getBitcast(WideVT, Res);
34496 Results.push_back(Res);
34497 Results.push_back(Chain);
34498 return;
34499 }
34500 assert(Subtarget.hasSSE1() && "Expected SSE");
34501 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34502 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34503 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34504 MVT::i64, Ld->getMemOperand());
34505 Results.push_back(Res);
34506 Results.push_back(Res.getValue(1));
34507 return;
34508 }
34509 case ISD::ADDRSPACECAST: {
34510 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34511 Results.push_back(V);
34512 return;
34513 }
34514 case ISD::BITREVERSE: {
34515 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34516 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34517 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34518 // We'll need to move the scalar in two i32 pieces.
34519 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34520 return;
34521 }
34522 case ISD::EXTRACT_VECTOR_ELT: {
34523 // f16 = extract vXf16 %vec, i64 %idx
34524 assert(N->getSimpleValueType(0) == MVT::f16 &&
34525 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34526 assert(Subtarget.hasFP16() && "Expected FP16");
34527 SDValue VecOp = N->getOperand(0);
34528 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34529 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34530 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34531 N->getOperand(1));
34532 Split = DAG.getBitcast(MVT::f16, Split);
34533 Results.push_back(Split);
34534 return;
34535 }
34536 }
34537}
34538
34539const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34540 switch ((X86ISD::NodeType)Opcode) {
34541 case X86ISD::FIRST_NUMBER: break;
34542#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34543 NODE_NAME_CASE(BSF)
34544 NODE_NAME_CASE(BSR)
34545 NODE_NAME_CASE(FSHL)
34546 NODE_NAME_CASE(FSHR)
34547 NODE_NAME_CASE(FAND)
34548 NODE_NAME_CASE(FANDN)
34549 NODE_NAME_CASE(FOR)
34550 NODE_NAME_CASE(FXOR)
34551 NODE_NAME_CASE(FILD)
34552 NODE_NAME_CASE(FIST)
34553 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34554 NODE_NAME_CASE(FLD)
34555 NODE_NAME_CASE(FST)
34556 NODE_NAME_CASE(CALL)
34557 NODE_NAME_CASE(CALL_RVMARKER)
34559 NODE_NAME_CASE(CMP)
34560 NODE_NAME_CASE(FCMP)
34561 NODE_NAME_CASE(STRICT_FCMP)
34562 NODE_NAME_CASE(STRICT_FCMPS)
34564 NODE_NAME_CASE(UCOMI)
34565 NODE_NAME_CASE(COMX)
34566 NODE_NAME_CASE(UCOMX)
34567 NODE_NAME_CASE(CMPM)
34568 NODE_NAME_CASE(CMPMM)
34569 NODE_NAME_CASE(STRICT_CMPM)
34570 NODE_NAME_CASE(CMPMM_SAE)
34571 NODE_NAME_CASE(SETCC)
34572 NODE_NAME_CASE(SETCC_CARRY)
34573 NODE_NAME_CASE(FSETCC)
34574 NODE_NAME_CASE(FSETCCM)
34575 NODE_NAME_CASE(FSETCCM_SAE)
34576 NODE_NAME_CASE(CMOV)
34577 NODE_NAME_CASE(BRCOND)
34578 NODE_NAME_CASE(RET_GLUE)
34579 NODE_NAME_CASE(IRET)
34580 NODE_NAME_CASE(REP_STOS)
34581 NODE_NAME_CASE(REP_MOVS)
34582 NODE_NAME_CASE(GlobalBaseReg)
34584 NODE_NAME_CASE(WrapperRIP)
34585 NODE_NAME_CASE(MOVQ2DQ)
34586 NODE_NAME_CASE(MOVDQ2Q)
34587 NODE_NAME_CASE(MMX_MOVD2W)
34588 NODE_NAME_CASE(MMX_MOVW2D)
34589 NODE_NAME_CASE(PEXTRB)
34590 NODE_NAME_CASE(PEXTRW)
34591 NODE_NAME_CASE(INSERTPS)
34592 NODE_NAME_CASE(PINSRB)
34593 NODE_NAME_CASE(PINSRW)
34594 NODE_NAME_CASE(PSHUFB)
34595 NODE_NAME_CASE(ANDNP)
34596 NODE_NAME_CASE(BLENDI)
34598 NODE_NAME_CASE(HADD)
34599 NODE_NAME_CASE(HSUB)
34600 NODE_NAME_CASE(FHADD)
34601 NODE_NAME_CASE(FHSUB)
34602 NODE_NAME_CASE(CONFLICT)
34603 NODE_NAME_CASE(FMAX)
34604 NODE_NAME_CASE(FMAXS)
34605 NODE_NAME_CASE(FMAX_SAE)
34606 NODE_NAME_CASE(FMAXS_SAE)
34607 NODE_NAME_CASE(STRICT_FMAX)
34608 NODE_NAME_CASE(FMIN)
34609 NODE_NAME_CASE(FMINS)
34610 NODE_NAME_CASE(FMIN_SAE)
34611 NODE_NAME_CASE(FMINS_SAE)
34612 NODE_NAME_CASE(STRICT_FMIN)
34613 NODE_NAME_CASE(FMAXC)
34614 NODE_NAME_CASE(FMINC)
34615 NODE_NAME_CASE(FRSQRT)
34616 NODE_NAME_CASE(FRCP)
34617 NODE_NAME_CASE(EXTRQI)
34618 NODE_NAME_CASE(INSERTQI)
34619 NODE_NAME_CASE(TLSADDR)
34620 NODE_NAME_CASE(TLSBASEADDR)
34621 NODE_NAME_CASE(TLSCALL)
34622 NODE_NAME_CASE(TLSDESC)
34623 NODE_NAME_CASE(EH_SJLJ_SETJMP)
34624 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
34625 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
34626 NODE_NAME_CASE(EH_RETURN)
34627 NODE_NAME_CASE(TC_RETURN)
34628 NODE_NAME_CASE(FNSTCW16m)
34629 NODE_NAME_CASE(FLDCW16m)
34630 NODE_NAME_CASE(FNSTENVm)
34631 NODE_NAME_CASE(FLDENVm)
34632 NODE_NAME_CASE(LCMPXCHG_DAG)
34633 NODE_NAME_CASE(LCMPXCHG8_DAG)
34634 NODE_NAME_CASE(LCMPXCHG16_DAG)
34635 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
34636 NODE_NAME_CASE(LADD)
34637 NODE_NAME_CASE(LSUB)
34638 NODE_NAME_CASE(LOR)
34639 NODE_NAME_CASE(LXOR)
34640 NODE_NAME_CASE(LAND)
34641 NODE_NAME_CASE(LBTS)
34642 NODE_NAME_CASE(LBTC)
34643 NODE_NAME_CASE(LBTR)
34644 NODE_NAME_CASE(LBTS_RM)
34645 NODE_NAME_CASE(LBTC_RM)
34646 NODE_NAME_CASE(LBTR_RM)
34647 NODE_NAME_CASE(AADD)
34648 NODE_NAME_CASE(AOR)
34649 NODE_NAME_CASE(AXOR)
34650 NODE_NAME_CASE(AAND)
34651 NODE_NAME_CASE(VZEXT_MOVL)
34652 NODE_NAME_CASE(VZEXT_LOAD)
34653 NODE_NAME_CASE(VEXTRACT_STORE)
34654 NODE_NAME_CASE(VTRUNC)
34655 NODE_NAME_CASE(VTRUNCS)
34656 NODE_NAME_CASE(VTRUNCUS)
34657 NODE_NAME_CASE(VMTRUNC)
34658 NODE_NAME_CASE(VMTRUNCS)
34659 NODE_NAME_CASE(VMTRUNCUS)
34660 NODE_NAME_CASE(VTRUNCSTORES)
34661 NODE_NAME_CASE(VTRUNCSTOREUS)
34662 NODE_NAME_CASE(VMTRUNCSTORES)
34663 NODE_NAME_CASE(VMTRUNCSTOREUS)
34664 NODE_NAME_CASE(VFPEXT)
34665 NODE_NAME_CASE(STRICT_VFPEXT)
34666 NODE_NAME_CASE(VFPEXT_SAE)
34667 NODE_NAME_CASE(VFPEXTS)
34668 NODE_NAME_CASE(VFPEXTS_SAE)
34669 NODE_NAME_CASE(VFPROUND)
34670 NODE_NAME_CASE(VFPROUND2)
34671 NODE_NAME_CASE(VFPROUND2_RND)
34672 NODE_NAME_CASE(STRICT_VFPROUND)
34673 NODE_NAME_CASE(VMFPROUND)
34674 NODE_NAME_CASE(VFPROUND_RND)
34675 NODE_NAME_CASE(VFPROUNDS)
34676 NODE_NAME_CASE(VFPROUNDS_RND)
34677 NODE_NAME_CASE(VSHLDQ)
34678 NODE_NAME_CASE(VSRLDQ)
34679 NODE_NAME_CASE(VSHL)
34680 NODE_NAME_CASE(VSRL)
34681 NODE_NAME_CASE(VSRA)
34682 NODE_NAME_CASE(VSHLI)
34683 NODE_NAME_CASE(VSRLI)
34684 NODE_NAME_CASE(VSRAI)
34685 NODE_NAME_CASE(VSHLV)
34686 NODE_NAME_CASE(VSRLV)
34687 NODE_NAME_CASE(VSRAV)
34688 NODE_NAME_CASE(VROTLI)
34689 NODE_NAME_CASE(VROTRI)
34690 NODE_NAME_CASE(VPPERM)
34691 NODE_NAME_CASE(CMPP)
34692 NODE_NAME_CASE(STRICT_CMPP)
34693 NODE_NAME_CASE(PCMPEQ)
34694 NODE_NAME_CASE(PCMPGT)
34695 NODE_NAME_CASE(PHMINPOS)
34696 NODE_NAME_CASE(ADD)
34697 NODE_NAME_CASE(SUB)
34698 NODE_NAME_CASE(ADC)
34699 NODE_NAME_CASE(SBB)
34700 NODE_NAME_CASE(SMUL)
34701 NODE_NAME_CASE(UMUL)
34702 NODE_NAME_CASE(OR)
34703 NODE_NAME_CASE(XOR)
34704 NODE_NAME_CASE(AND)
34705 NODE_NAME_CASE(BEXTR)
34707 NODE_NAME_CASE(BZHI)
34708 NODE_NAME_CASE(PDEP)
34709 NODE_NAME_CASE(PEXT)
34710 NODE_NAME_CASE(MUL_IMM)
34711 NODE_NAME_CASE(MOVMSK)
34712 NODE_NAME_CASE(PTEST)
34713 NODE_NAME_CASE(TESTP)
34714 NODE_NAME_CASE(KORTEST)
34715 NODE_NAME_CASE(KTEST)
34716 NODE_NAME_CASE(KADD)
34717 NODE_NAME_CASE(KSHIFTL)
34718 NODE_NAME_CASE(KSHIFTR)
34719 NODE_NAME_CASE(PACKSS)
34720 NODE_NAME_CASE(PACKUS)
34721 NODE_NAME_CASE(PALIGNR)
34722 NODE_NAME_CASE(VALIGN)
34723 NODE_NAME_CASE(VSHLD)
34724 NODE_NAME_CASE(VSHRD)
34725 NODE_NAME_CASE(VSHLDV)
34726 NODE_NAME_CASE(VSHRDV)
34727 NODE_NAME_CASE(PSHUFD)
34728 NODE_NAME_CASE(PSHUFHW)
34729 NODE_NAME_CASE(PSHUFLW)
34730 NODE_NAME_CASE(SHUFP)
34731 NODE_NAME_CASE(SHUF128)
34732 NODE_NAME_CASE(MOVLHPS)
34733 NODE_NAME_CASE(MOVHLPS)
34734 NODE_NAME_CASE(MOVDDUP)
34735 NODE_NAME_CASE(MOVSHDUP)
34736 NODE_NAME_CASE(MOVSLDUP)
34737 NODE_NAME_CASE(MOVSD)
34738 NODE_NAME_CASE(MOVSS)
34739 NODE_NAME_CASE(MOVSH)
34740 NODE_NAME_CASE(UNPCKL)
34741 NODE_NAME_CASE(UNPCKH)
34742 NODE_NAME_CASE(VBROADCAST)
34743 NODE_NAME_CASE(VBROADCAST_LOAD)
34744 NODE_NAME_CASE(VBROADCASTM)
34745 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
34746 NODE_NAME_CASE(VPERMILPV)
34747 NODE_NAME_CASE(VPERMILPI)
34748 NODE_NAME_CASE(VPERM2X128)
34749 NODE_NAME_CASE(VPERMV)
34750 NODE_NAME_CASE(VPERMV3)
34751 NODE_NAME_CASE(VPERMI)
34752 NODE_NAME_CASE(VPTERNLOG)
34753 NODE_NAME_CASE(FP_TO_SINT_SAT)
34754 NODE_NAME_CASE(FP_TO_UINT_SAT)
34755 NODE_NAME_CASE(VFIXUPIMM)
34756 NODE_NAME_CASE(VFIXUPIMM_SAE)
34757 NODE_NAME_CASE(VFIXUPIMMS)
34758 NODE_NAME_CASE(VFIXUPIMMS_SAE)
34759 NODE_NAME_CASE(VRANGE)
34760 NODE_NAME_CASE(VRANGE_SAE)
34761 NODE_NAME_CASE(VRANGES)
34762 NODE_NAME_CASE(VRANGES_SAE)
34763 NODE_NAME_CASE(PMULUDQ)
34764 NODE_NAME_CASE(PMULDQ)
34765 NODE_NAME_CASE(PSADBW)
34766 NODE_NAME_CASE(DBPSADBW)
34767 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
34768 NODE_NAME_CASE(VAARG_64)
34769 NODE_NAME_CASE(VAARG_X32)
34770 NODE_NAME_CASE(DYN_ALLOCA)
34771 NODE_NAME_CASE(MFENCE)
34772 NODE_NAME_CASE(SEG_ALLOCA)
34773 NODE_NAME_CASE(PROBED_ALLOCA)
34776 NODE_NAME_CASE(RDPKRU)
34777 NODE_NAME_CASE(WRPKRU)
34778 NODE_NAME_CASE(VPMADDUBSW)
34779 NODE_NAME_CASE(VPMADDWD)
34780 NODE_NAME_CASE(VPSHA)
34781 NODE_NAME_CASE(VPSHL)
34782 NODE_NAME_CASE(VPCOM)
34783 NODE_NAME_CASE(VPCOMU)
34784 NODE_NAME_CASE(VPERMIL2)
34786 NODE_NAME_CASE(STRICT_FMSUB)
34788 NODE_NAME_CASE(STRICT_FNMADD)
34790 NODE_NAME_CASE(STRICT_FNMSUB)
34791 NODE_NAME_CASE(FMADDSUB)
34792 NODE_NAME_CASE(FMSUBADD)
34793 NODE_NAME_CASE(FMADD_RND)
34794 NODE_NAME_CASE(FNMADD_RND)
34795 NODE_NAME_CASE(FMSUB_RND)
34796 NODE_NAME_CASE(FNMSUB_RND)
34797 NODE_NAME_CASE(FMADDSUB_RND)
34798 NODE_NAME_CASE(FMSUBADD_RND)
34799 NODE_NAME_CASE(VFMADDC)
34800 NODE_NAME_CASE(VFMADDC_RND)
34801 NODE_NAME_CASE(VFCMADDC)
34802 NODE_NAME_CASE(VFCMADDC_RND)
34803 NODE_NAME_CASE(VFMULC)
34804 NODE_NAME_CASE(VFMULC_RND)
34805 NODE_NAME_CASE(VFCMULC)
34806 NODE_NAME_CASE(VFCMULC_RND)
34807 NODE_NAME_CASE(VFMULCSH)
34808 NODE_NAME_CASE(VFMULCSH_RND)
34809 NODE_NAME_CASE(VFCMULCSH)
34810 NODE_NAME_CASE(VFCMULCSH_RND)
34811 NODE_NAME_CASE(VFMADDCSH)
34812 NODE_NAME_CASE(VFMADDCSH_RND)
34813 NODE_NAME_CASE(VFCMADDCSH)
34814 NODE_NAME_CASE(VFCMADDCSH_RND)
34815 NODE_NAME_CASE(VPMADD52H)
34816 NODE_NAME_CASE(VPMADD52L)
34817 NODE_NAME_CASE(VRNDSCALE)
34818 NODE_NAME_CASE(STRICT_VRNDSCALE)
34819 NODE_NAME_CASE(VRNDSCALE_SAE)
34820 NODE_NAME_CASE(VRNDSCALES)
34821 NODE_NAME_CASE(VRNDSCALES_SAE)
34822 NODE_NAME_CASE(VREDUCE)
34823 NODE_NAME_CASE(VREDUCE_SAE)
34824 NODE_NAME_CASE(VREDUCES)
34825 NODE_NAME_CASE(VREDUCES_SAE)
34826 NODE_NAME_CASE(VGETMANT)
34827 NODE_NAME_CASE(VGETMANT_SAE)
34828 NODE_NAME_CASE(VGETMANTS)
34829 NODE_NAME_CASE(VGETMANTS_SAE)
34830 NODE_NAME_CASE(PCMPESTR)
34831 NODE_NAME_CASE(PCMPISTR)
34833 NODE_NAME_CASE(COMPRESS)
34835 NODE_NAME_CASE(SELECTS)
34836 NODE_NAME_CASE(ADDSUB)
34837 NODE_NAME_CASE(RCP14)
34838 NODE_NAME_CASE(RCP14S)
34839 NODE_NAME_CASE(RSQRT14)
34840 NODE_NAME_CASE(RSQRT14S)
34841 NODE_NAME_CASE(FADD_RND)
34842 NODE_NAME_CASE(FADDS)
34843 NODE_NAME_CASE(FADDS_RND)
34844 NODE_NAME_CASE(FSUB_RND)
34845 NODE_NAME_CASE(FSUBS)
34846 NODE_NAME_CASE(FSUBS_RND)
34847 NODE_NAME_CASE(FMUL_RND)
34848 NODE_NAME_CASE(FMULS)
34849 NODE_NAME_CASE(FMULS_RND)
34850 NODE_NAME_CASE(FDIV_RND)
34851 NODE_NAME_CASE(FDIVS)
34852 NODE_NAME_CASE(FDIVS_RND)
34853 NODE_NAME_CASE(FSQRT_RND)
34854 NODE_NAME_CASE(FSQRTS)
34855 NODE_NAME_CASE(FSQRTS_RND)
34856 NODE_NAME_CASE(FGETEXP)
34857 NODE_NAME_CASE(FGETEXP_SAE)
34858 NODE_NAME_CASE(FGETEXPS)
34859 NODE_NAME_CASE(FGETEXPS_SAE)
34860 NODE_NAME_CASE(SCALEF)
34861 NODE_NAME_CASE(SCALEF_RND)
34862 NODE_NAME_CASE(SCALEFS)
34863 NODE_NAME_CASE(SCALEFS_RND)
34864 NODE_NAME_CASE(MULHRS)
34865 NODE_NAME_CASE(SINT_TO_FP_RND)
34866 NODE_NAME_CASE(UINT_TO_FP_RND)
34867 NODE_NAME_CASE(CVTTP2SI)
34868 NODE_NAME_CASE(CVTTP2UI)
34869 NODE_NAME_CASE(STRICT_CVTTP2SI)
34870 NODE_NAME_CASE(STRICT_CVTTP2UI)
34871 NODE_NAME_CASE(MCVTTP2SI)
34872 NODE_NAME_CASE(MCVTTP2UI)
34873 NODE_NAME_CASE(CVTTP2SI_SAE)
34874 NODE_NAME_CASE(CVTTP2UI_SAE)
34875 NODE_NAME_CASE(CVTTS2SI)
34876 NODE_NAME_CASE(CVTTS2UI)
34877 NODE_NAME_CASE(CVTTS2SI_SAE)
34878 NODE_NAME_CASE(CVTTS2UI_SAE)
34879 NODE_NAME_CASE(CVTSI2P)
34880 NODE_NAME_CASE(CVTUI2P)
34881 NODE_NAME_CASE(STRICT_CVTSI2P)
34882 NODE_NAME_CASE(STRICT_CVTUI2P)
34883 NODE_NAME_CASE(MCVTSI2P)
34884 NODE_NAME_CASE(MCVTUI2P)
34885 NODE_NAME_CASE(VFPCLASS)
34886 NODE_NAME_CASE(VFPCLASSS)
34887 NODE_NAME_CASE(MULTISHIFT)
34888 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
34889 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
34890 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
34891 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
34892 NODE_NAME_CASE(CVTPS2PH)
34893 NODE_NAME_CASE(STRICT_CVTPS2PH)
34894 NODE_NAME_CASE(CVTPS2PH_SAE)
34895 NODE_NAME_CASE(MCVTPS2PH)
34896 NODE_NAME_CASE(MCVTPS2PH_SAE)
34897 NODE_NAME_CASE(CVTPH2PS)
34898 NODE_NAME_CASE(STRICT_CVTPH2PS)
34899 NODE_NAME_CASE(CVTPH2PS_SAE)
34900 NODE_NAME_CASE(CVTP2SI)
34901 NODE_NAME_CASE(CVTP2UI)
34902 NODE_NAME_CASE(MCVTP2SI)
34903 NODE_NAME_CASE(MCVTP2UI)
34904 NODE_NAME_CASE(CVTP2SI_RND)
34905 NODE_NAME_CASE(CVTP2UI_RND)
34906 NODE_NAME_CASE(CVTS2SI)
34907 NODE_NAME_CASE(CVTS2UI)
34908 NODE_NAME_CASE(CVTS2SI_RND)
34909 NODE_NAME_CASE(CVTS2UI_RND)
34910 NODE_NAME_CASE(CVTNEPS2BF16)
34911 NODE_NAME_CASE(MCVTNEPS2BF16)
34912 NODE_NAME_CASE(DPBF16PS)
34913 NODE_NAME_CASE(DPFP16PS)
34914 NODE_NAME_CASE(MPSADBW)
34915 NODE_NAME_CASE(LWPINS)
34916 NODE_NAME_CASE(MGATHER)
34917 NODE_NAME_CASE(MSCATTER)
34918 NODE_NAME_CASE(VPDPBUSD)
34919 NODE_NAME_CASE(VPDPBUSDS)
34920 NODE_NAME_CASE(VPDPWSSD)
34921 NODE_NAME_CASE(VPDPWSSDS)
34922 NODE_NAME_CASE(VPSHUFBITQMB)
34923 NODE_NAME_CASE(GF2P8MULB)
34924 NODE_NAME_CASE(GF2P8AFFINEQB)
34925 NODE_NAME_CASE(GF2P8AFFINEINVQB)
34926 NODE_NAME_CASE(NT_CALL)
34927 NODE_NAME_CASE(NT_BRIND)
34928 NODE_NAME_CASE(UMWAIT)
34929 NODE_NAME_CASE(TPAUSE)
34930 NODE_NAME_CASE(ENQCMD)
34931 NODE_NAME_CASE(ENQCMDS)
34932 NODE_NAME_CASE(VP2INTERSECT)
34933 NODE_NAME_CASE(VPDPBSUD)
34934 NODE_NAME_CASE(VPDPBSUDS)
34935 NODE_NAME_CASE(VPDPBUUD)
34936 NODE_NAME_CASE(VPDPBUUDS)
34937 NODE_NAME_CASE(VPDPBSSD)
34938 NODE_NAME_CASE(VPDPBSSDS)
34939 NODE_NAME_CASE(VPDPWSUD)
34940 NODE_NAME_CASE(VPDPWSUDS)
34941 NODE_NAME_CASE(VPDPWUSD)
34942 NODE_NAME_CASE(VPDPWUSDS)
34943 NODE_NAME_CASE(VPDPWUUD)
34944 NODE_NAME_CASE(VPDPWUUDS)
34945 NODE_NAME_CASE(VMINMAX)
34946 NODE_NAME_CASE(VMINMAX_SAE)
34947 NODE_NAME_CASE(VMINMAXS)
34948 NODE_NAME_CASE(VMINMAXS_SAE)
34949 NODE_NAME_CASE(CVTP2IBS)
34950 NODE_NAME_CASE(CVTP2IUBS)
34951 NODE_NAME_CASE(CVTP2IBS_RND)
34952 NODE_NAME_CASE(CVTP2IUBS_RND)
34953 NODE_NAME_CASE(CVTTP2IBS)
34954 NODE_NAME_CASE(CVTTP2IUBS)
34955 NODE_NAME_CASE(CVTTP2IBS_SAE)
34956 NODE_NAME_CASE(CVTTP2IUBS_SAE)
34957 NODE_NAME_CASE(VCVT2PH2BF8)
34958 NODE_NAME_CASE(VCVT2PH2BF8S)
34959 NODE_NAME_CASE(VCVT2PH2HF8)
34960 NODE_NAME_CASE(VCVT2PH2HF8S)
34961 NODE_NAME_CASE(VCVTBIASPH2BF8)
34962 NODE_NAME_CASE(VCVTBIASPH2BF8S)
34963 NODE_NAME_CASE(VCVTBIASPH2HF8)
34964 NODE_NAME_CASE(VCVTBIASPH2HF8S)
34965 NODE_NAME_CASE(VCVTPH2BF8)
34966 NODE_NAME_CASE(VCVTPH2BF8S)
34967 NODE_NAME_CASE(VCVTPH2HF8)
34968 NODE_NAME_CASE(VCVTPH2HF8S)
34969 NODE_NAME_CASE(VMCVTBIASPH2BF8)
34970 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
34971 NODE_NAME_CASE(VMCVTBIASPH2HF8)
34972 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
34973 NODE_NAME_CASE(VMCVTPH2BF8)
34974 NODE_NAME_CASE(VMCVTPH2BF8S)
34975 NODE_NAME_CASE(VMCVTPH2HF8)
34976 NODE_NAME_CASE(VMCVTPH2HF8S)
34977 NODE_NAME_CASE(VCVTHF82PH)
34978 NODE_NAME_CASE(AESENC128KL)
34979 NODE_NAME_CASE(AESDEC128KL)
34980 NODE_NAME_CASE(AESENC256KL)
34981 NODE_NAME_CASE(AESDEC256KL)
34982 NODE_NAME_CASE(AESENCWIDE128KL)
34983 NODE_NAME_CASE(AESDECWIDE128KL)
34984 NODE_NAME_CASE(AESENCWIDE256KL)
34985 NODE_NAME_CASE(AESDECWIDE256KL)
34986 NODE_NAME_CASE(CMPCCXADD)
34987 NODE_NAME_CASE(TESTUI)
34988 NODE_NAME_CASE(FP80_ADD)
34989 NODE_NAME_CASE(STRICT_FP80_ADD)
34990 NODE_NAME_CASE(CCMP)
34991 NODE_NAME_CASE(CTEST)
34992 NODE_NAME_CASE(CLOAD)
34993 NODE_NAME_CASE(CSTORE)
34994 NODE_NAME_CASE(CVTTS2SIS)
34995 NODE_NAME_CASE(CVTTS2UIS)
34996 NODE_NAME_CASE(CVTTS2SIS_SAE)
34997 NODE_NAME_CASE(CVTTS2UIS_SAE)
34998 NODE_NAME_CASE(CVTTP2SIS)
34999 NODE_NAME_CASE(MCVTTP2SIS)
35000 NODE_NAME_CASE(CVTTP2UIS_SAE)
35001 NODE_NAME_CASE(CVTTP2SIS_SAE)
35002 NODE_NAME_CASE(CVTTP2UIS)
35003 NODE_NAME_CASE(MCVTTP2UIS)
35004 }
35005 return nullptr;
35006#undef NODE_NAME_CASE
35007}
35008
35009/// Return true if the addressing mode represented by AM is legal for this
35010/// target, for a load/store of the specified type.
35011bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35012 const AddrMode &AM, Type *Ty,
35013 unsigned AS,
35014 Instruction *I) const {
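// The AddrMode being checked encodes the general x86 form
//   BaseGV + BaseReg + Scale*IndexReg + BaseOffs
// e.g. 'movl 16(%rax,%rbx,4), %ecx' corresponds to Scale = 4 and BaseOffs = 16.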
35015 // X86 supports extremely general addressing modes.
35016 CodeModel::Model M = getTargetMachine().getCodeModel();
35017
35018 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35019 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35020 return false;
35021
35022 if (AM.BaseGV) {
35023 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35024
35025 // If a reference to this global requires an extra load, we can't fold it.
35026 if (isGlobalStubReference(GVFlags))
35027 return false;
35028
35029 // If BaseGV requires a register for the PIC base, we cannot also have a
35030 // BaseReg specified.
35031 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35032 return false;
35033
35034 // If lower 4G is not available, then we must use rip-relative addressing.
35035 if ((M != CodeModel::Small || isPositionIndependent()) &&
35036 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35037 return false;
35038 }
35039
35040 switch (AM.Scale) {
35041 case 0:
35042 case 1:
35043 case 2:
35044 case 4:
35045 case 8:
35046 // These scales always work.
35047 break;
35048 case 3:
35049 case 5:
35050 case 9:
35051 // These scales are formed with basereg+scalereg. Only accept if there is
35052 // no basereg yet.
35053 if (AM.HasBaseReg)
35054 return false;
35055 break;
35056 default: // Other stuff never works.
35057 return false;
35058 }
35059
35060 return true;
35061}
35062
35063bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35064 switch (Opcode) {
35065 // These are non-commutative binops.
35066 // TODO: Add more X86ISD opcodes once we have test coverage.
35067 case X86ISD::ANDNP:
35068 case X86ISD::PCMPGT:
35069 case X86ISD::FMAX:
35070 case X86ISD::FMIN:
35071 case X86ISD::FANDN:
35072 case X86ISD::VPSHA:
35073 case X86ISD::VPSHL:
35074 case X86ISD::VSHLV:
35075 case X86ISD::VSRLV:
35076 case X86ISD::VSRAV:
35077 return true;
35078 }
35079
35080 return TargetLoweringBase::isBinOp(Opcode);
35081}
35082
35083bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35084 switch (Opcode) {
35085 // TODO: Add more X86ISD opcodes once we have test coverage.
35086 case X86ISD::PCMPEQ:
35087 case X86ISD::PMULDQ:
35088 case X86ISD::PMULUDQ:
35089 case X86ISD::FMAXC:
35090 case X86ISD::FMINC:
35091 case X86ISD::FAND:
35092 case X86ISD::FOR:
35093 case X86ISD::FXOR:
35094 return true;
35095 }
35096
35097 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35098}
35099
35100bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35101 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35102 return false;
35103 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35104 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35105 return NumBits1 > NumBits2;
35106}
35107
35108bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35109 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35110 return false;
35111
35112 if (!isTypeLegal(EVT::getEVT(Ty1)))
35113 return false;
35114
35115 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35116
35117 // Assuming the caller doesn't have a zeroext or signext return parameter,
35118 // truncation all the way down to i1 is valid.
35119 return true;
35120}
35121
35122bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35123 return isInt<32>(Imm);
35124}
35125
35126bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35127 // Can also use sub to handle negated immediates.
35128 return isInt<32>(Imm);
35129}
35130
35131bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35132 return isInt<32>(Imm);
35133}
35134
35135bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35136 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35137 return false;
35138 unsigned NumBits1 = VT1.getSizeInBits();
35139 unsigned NumBits2 = VT2.getSizeInBits();
35140 return NumBits1 > NumBits2;
35141}
35142
35143bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35144 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35145 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35146}
35147
35148bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35149 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35150 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35151}
35152
35153bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35154 EVT VT1 = Val.getValueType();
35155 if (isZExtFree(VT1, VT2))
35156 return true;
35157
35158 if (Val.getOpcode() != ISD::LOAD)
35159 return false;
35160
35161 if (!VT1.isSimple() || !VT1.isInteger() ||
35162 !VT2.isSimple() || !VT2.isInteger())
35163 return false;
35164
35165 switch (VT1.getSimpleVT().SimpleTy) {
35166 default: break;
35167 case MVT::i8:
35168 case MVT::i16:
35169 case MVT::i32:
35170 // X86 has 8, 16, and 32-bit zero-extending loads.
35171 return true;
35172 }
35173
35174 return false;
35175}
35176
35177bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35178 if (!Subtarget.is64Bit())
35179 return false;
35180 return TargetLoweringBase::shouldConvertPhiType(From, To);
35181}
35182
35183bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35184 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35185 return false;
35186
35187 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35188
35189 // There is no extending load for vXi1.
35190 if (SrcVT.getScalarType() == MVT::i1)
35191 return false;
35192
35193 return true;
35194}
35195
35196bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35197 EVT VT) const {
35198 if (Subtarget.useSoftFloat())
35199 return false;
35200
35201 if (!Subtarget.hasAnyFMA())
35202 return false;
35203
35204 VT = VT.getScalarType();
35205
35206 if (!VT.isSimple())
35207 return false;
35208
35209 switch (VT.getSimpleVT().SimpleTy) {
35210 case MVT::f16:
35211 return Subtarget.hasFP16();
35212 case MVT::f32:
35213 case MVT::f64:
35214 return true;
35215 default:
35216 break;
35217 }
35218
35219 return false;
35220}
35221
35222bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35223 EVT DestVT) const {
35224 // i16 instructions are longer (0x66 prefix) and potentially slower.
35225 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35226}
35227
35229 EVT VT) const {
35230 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35231 // benefit. The transform may also be profitable for scalar code.
35232 if (!Subtarget.hasAVX512())
35233 return false;
35234 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35235 return false;
35236 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35237 return false;
35238
35239 return true;
35240}
35241
35242/// Targets can use this to indicate that they only support *some*
35243/// VECTOR_SHUFFLE operations, those with specific masks.
35244/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35245/// are assumed to be legal.
35246bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35247 if (!VT.isSimple())
35248 return false;
35249
35250 // Not for i1 vectors
35251 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35252 return false;
35253
35254 // Very little shuffling can be done for 64-bit vectors right now.
35255 if (VT.getSimpleVT().getSizeInBits() == 64)
35256 return false;
35257
35258 // We only care that the types being shuffled are legal. The lowering can
35259 // handle any possible shuffle mask that results.
35260 return isTypeLegal(VT.getSimpleVT());
35261}
35262
35263bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35264 EVT VT) const {
35265 // Don't convert an 'and' into a shuffle that we don't directly support.
35266 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35267 if (!Subtarget.hasAVX2())
35268 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35269 return false;
35270
35271 // Just delegate to the generic legality, clear masks aren't special.
35272 return isShuffleMaskLegal(Mask, VT);
35273}
35274
35275bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35276 // If the subtarget is using thunks, we need to not generate jump tables.
35277 if (Subtarget.useIndirectThunkBranches())
35278 return false;
35279
35280 // Otherwise, fallback on the generic logic.
35281 return TargetLowering::areJTsAllowed(Fn);
35282}
35283
35284MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35285 EVT ConditionVT) const {
35286 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35287 // zero-extensions.
35288 if (ConditionVT.getSizeInBits() < 32)
35289 return MVT::i32;
35290 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35291 ConditionVT);
35292}
35293
35294//===----------------------------------------------------------------------===//
35295// X86 Scheduler Hooks
35296//===----------------------------------------------------------------------===//
35297
35298// Returns true if EFLAGS is consumed after this iterator in the rest of the
35299// basic block or any successors of the basic block.
35300static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
35301 MachineBasicBlock *BB) {
35302 // Scan forward through BB for a use/def of EFLAGS.
35303 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35304 if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
35305 return true;
35306 // If we found a def, we can stop searching.
35307 if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
35308 return false;
35309 }
35310
35311 // If we hit the end of the block, check whether EFLAGS is live into a
35312 // successor.
35313 for (MachineBasicBlock *Succ : BB->successors())
35314 if (Succ->isLiveIn(X86::EFLAGS))
35315 return true;
35316
35317 return false;
35318}
35319
35320/// Utility function to emit xbegin specifying the start of an RTM region.
35321static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35322 const TargetInstrInfo *TII) {
35323 const MIMetadata MIMD(MI);
35324
35325 const BasicBlock *BB = MBB->getBasicBlock();
35326 MachineFunction::iterator I = ++MBB->getIterator();
35327
35328 // For the v = xbegin(), we generate
35329 //
35330 // thisMBB:
35331 // xbegin sinkMBB
35332 //
35333 // mainMBB:
35334 // s0 = -1
35335 //
35336 // fallBB:
35337 // eax = # XABORT_DEF
35338 // s1 = eax
35339 //
35340 // sinkMBB:
35341 // v = phi(s0/mainBB, s1/fallBB)
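// XBEGIN falls through when the transaction starts successfully, so mainMBB
// produces -1 (the RTM _XBEGIN_STARTED value). On an abort the hardware
// resumes at the fallback label with the abort status in EAX, which fallMBB
// copies into the result.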
35342
35343 MachineBasicBlock *thisMBB = MBB;
35344 MachineFunction *MF = MBB->getParent();
35345 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35346 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35347 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35348 MF->insert(I, mainMBB);
35349 MF->insert(I, fallMBB);
35350 MF->insert(I, sinkMBB);
35351
35352 if (isEFLAGSLiveAfter(MI, MBB)) {
35353 mainMBB->addLiveIn(X86::EFLAGS);
35354 fallMBB->addLiveIn(X86::EFLAGS);
35355 sinkMBB->addLiveIn(X86::EFLAGS);
35356 }
35357
35358 // Transfer the remainder of BB and its successor edges to sinkMBB.
35359 sinkMBB->splice(sinkMBB->begin(), MBB,
35360 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35361 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35362
35363 MachineRegisterInfo &MRI = MF->getRegInfo();
35364 Register DstReg = MI.getOperand(0).getReg();
35365 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35366 Register mainDstReg = MRI.createVirtualRegister(RC);
35367 Register fallDstReg = MRI.createVirtualRegister(RC);
35368
35369 // thisMBB:
35370 // xbegin fallMBB
35371 // # fallthrough to mainMBB
35372 // # abort branches to fallMBB
35373 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35374 thisMBB->addSuccessor(mainMBB);
35375 thisMBB->addSuccessor(fallMBB);
35376
35377 // mainMBB:
35378 // mainDstReg := -1
35379 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35380 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35381 mainMBB->addSuccessor(sinkMBB);
35382
35383 // fallMBB:
35384 // ; pseudo instruction to model hardware's definition from XABORT
35385 // EAX := XABORT_DEF
35386 // fallDstReg := EAX
35387 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35388 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35389 .addReg(X86::EAX);
35390 fallMBB->addSuccessor(sinkMBB);
35391
35392 // sinkMBB:
35393 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35394 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35395 .addReg(mainDstReg).addMBB(mainMBB)
35396 .addReg(fallDstReg).addMBB(fallMBB);
35397
35398 MI.eraseFromParent();
35399 return sinkMBB;
35400}
35401
35402MachineBasicBlock *
35403X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35404 MachineBasicBlock *MBB) const {
35405 // Emit va_arg instruction on X86-64.
35406
35407 // Operands to this pseudo-instruction:
35408 // 0 ) Output : destination address (reg)
35409 // 1-5) Input : va_list address (addr, i64mem)
35410 // 6 ) ArgSize : Size (in bytes) of vararg type
35411 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35412 // 8 ) Align : Alignment of type
35413 // 9 ) EFLAGS (implicit-def)
35414
35415 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35416 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35417
35418 Register DestReg = MI.getOperand(0).getReg();
35419 MachineOperand &Base = MI.getOperand(1);
35420 MachineOperand &Scale = MI.getOperand(2);
35421 MachineOperand &Index = MI.getOperand(3);
35422 MachineOperand &Disp = MI.getOperand(4);
35423 MachineOperand &Segment = MI.getOperand(5);
35424 unsigned ArgSize = MI.getOperand(6).getImm();
35425 unsigned ArgMode = MI.getOperand(7).getImm();
35426 Align Alignment = Align(MI.getOperand(8).getImm());
35427
35428 MachineFunction *MF = MBB->getParent();
35429
35430 // Memory Reference
35431 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35432
35433 MachineMemOperand *OldMMO = MI.memoperands().front();
35434
35435 // Clone the MMO into two separate MMOs for loading and storing
35436 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35437 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35438 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35439 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35440
35441 // Machine Information
35442 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35443 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35444 const TargetRegisterClass *AddrRegClass =
35445 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35446 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35447 const MIMetadata MIMD(MI);
35448
35449 // struct va_list {
35450 // i32 gp_offset
35451 // i32 fp_offset
35452 // i64 overflow_area (address)
35453 // i64 reg_save_area (address)
35454 // }
35455 // sizeof(va_list) = 24
35456 // alignment(va_list) = 8
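// The loads/stores below address these fields relative to the va_list
// pointer: gp_offset at +0, fp_offset at +4, overflow_area at +8, and
// reg_save_area at +16 under LP64 (+12 under ILP32), matching the
// addDisp() displacements used in this function.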
35457
35458 unsigned TotalNumIntRegs = 6;
35459 unsigned TotalNumXMMRegs = 8;
35460 bool UseGPOffset = (ArgMode == 1);
35461 bool UseFPOffset = (ArgMode == 2);
35462 unsigned MaxOffset = TotalNumIntRegs * 8 +
35463 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35464
35465 /* Align ArgSize to a multiple of 8 */
35466 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35467 bool NeedsAlign = (Alignment > 8);
35468
35469 MachineBasicBlock *thisMBB = MBB;
35470 MachineBasicBlock *overflowMBB;
35471 MachineBasicBlock *offsetMBB;
35472 MachineBasicBlock *endMBB;
35473
35474 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
35475 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
35476 unsigned OffsetReg = 0;
35477
35478 if (!UseGPOffset && !UseFPOffset) {
35479 // If we only pull from the overflow region, we don't create a branch.
35480 // We don't need to alter control flow.
35481 OffsetDestReg = 0; // unused
35482 OverflowDestReg = DestReg;
35483
35484 offsetMBB = nullptr;
35485 overflowMBB = thisMBB;
35486 endMBB = thisMBB;
35487 } else {
35488 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35489 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35490 // If not, pull from overflow_area. (branch to overflowMBB)
35491 //
35492 // thisMBB
35493 // | .
35494 // | .
35495 // offsetMBB overflowMBB
35496 // | .
35497 // | .
35498 // endMBB
35499
35500 // Registers for the PHI in endMBB
35501 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35502 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35503
35504 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35505 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35506 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35507 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35508
35510
35511 // Insert the new basic blocks
35512 MF->insert(MBBIter, offsetMBB);
35513 MF->insert(MBBIter, overflowMBB);
35514 MF->insert(MBBIter, endMBB);
35515
35516 // Transfer the remainder of MBB and its successor edges to endMBB.
35517 endMBB->splice(endMBB->begin(), thisMBB,
35518 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35519 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35520
35521 // Make offsetMBB and overflowMBB successors of thisMBB
35522 thisMBB->addSuccessor(offsetMBB);
35523 thisMBB->addSuccessor(overflowMBB);
35524
35525 // endMBB is a successor of both offsetMBB and overflowMBB
35526 offsetMBB->addSuccessor(endMBB);
35527 overflowMBB->addSuccessor(endMBB);
35528
35529 // Load the offset value into a register
35530 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35531 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35532 .add(Base)
35533 .add(Scale)
35534 .add(Index)
35535 .addDisp(Disp, UseFPOffset ? 4 : 0)
35536 .add(Segment)
35537 .setMemRefs(LoadOnlyMMO);
35538
35539 // Check if there is enough room left to pull this argument.
35540 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35541 .addReg(OffsetReg)
35542 .addImm(MaxOffset + 8 - ArgSizeA8);
35543
35544 // Branch to "overflowMBB" if offset >= max
35545 // Fall through to "offsetMBB" otherwise
35546 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35547 .addMBB(overflowMBB).addImm(X86::COND_AE);
35548 }
35549
35550 // In offsetMBB, emit code to use the reg_save_area.
35551 if (offsetMBB) {
35552 assert(OffsetReg != 0);
35553
35554 // Read the reg_save_area address.
35555 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35556 BuildMI(
35557 offsetMBB, MIMD,
35558 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35559 RegSaveReg)
35560 .add(Base)
35561 .add(Scale)
35562 .add(Index)
35563 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35564 .add(Segment)
35565 .setMemRefs(LoadOnlyMMO);
35566
35567 if (Subtarget.isTarget64BitLP64()) {
35568 // Zero-extend the offset
35569 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35570 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35571 .addImm(0)
35572 .addReg(OffsetReg)
35573 .addImm(X86::sub_32bit);
35574
35575 // Add the offset to the reg_save_area to get the final address.
35576 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35577 .addReg(OffsetReg64)
35578 .addReg(RegSaveReg);
35579 } else {
35580 // Add the offset to the reg_save_area to get the final address.
35581 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35582 .addReg(OffsetReg)
35583 .addReg(RegSaveReg);
35584 }
35585
35586 // Compute the offset for the next argument
35587 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35588 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
35589 .addReg(OffsetReg)
35590 .addImm(UseFPOffset ? 16 : 8);
35591
35592 // Store it back into the va_list.
35593 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
35594 .add(Base)
35595 .add(Scale)
35596 .add(Index)
35597 .addDisp(Disp, UseFPOffset ? 4 : 0)
35598 .add(Segment)
35599 .addReg(NextOffsetReg)
35600 .setMemRefs(StoreOnlyMMO);
35601
35602 // Jump to endMBB
35603 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
35604 .addMBB(endMBB);
35605 }
35606
35607 //
35608 // Emit code to use overflow area
35609 //
35610
35611 // Load the overflow_area address into a register.
35612 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
35613 BuildMI(overflowMBB, MIMD,
35614 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35615 OverflowAddrReg)
35616 .add(Base)
35617 .add(Scale)
35618 .add(Index)
35619 .addDisp(Disp, 8)
35620 .add(Segment)
35621 .setMemRefs(LoadOnlyMMO);
35622
35623 // If we need to align it, do so. Otherwise, just copy the address
35624 // to OverflowDestReg.
35625 if (NeedsAlign) {
35626 // Align the overflow address
35627 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
35628
35629 // aligned_addr = (addr + (align-1)) & ~(align-1)
35630 BuildMI(
35631 overflowMBB, MIMD,
35632 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35633 TmpReg)
35634 .addReg(OverflowAddrReg)
35635 .addImm(Alignment.value() - 1);
35636
35637 BuildMI(
35638 overflowMBB, MIMD,
35639 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
35640 OverflowDestReg)
35641 .addReg(TmpReg)
35642 .addImm(~(uint64_t)(Alignment.value() - 1));
35643 } else {
35644 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
35645 .addReg(OverflowAddrReg);
35646 }
35647
35648 // Compute the next overflow address after this argument.
35649 // (the overflow address should be kept 8-byte aligned)
35650 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
35651 BuildMI(
35652 overflowMBB, MIMD,
35653 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35654 NextAddrReg)
35655 .addReg(OverflowDestReg)
35656 .addImm(ArgSizeA8);
35657
35658 // Store the new overflow address.
35659 BuildMI(overflowMBB, MIMD,
35660 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
35661 .add(Base)
35662 .add(Scale)
35663 .add(Index)
35664 .addDisp(Disp, 8)
35665 .add(Segment)
35666 .addReg(NextAddrReg)
35667 .setMemRefs(StoreOnlyMMO);
35668
35669 // If we branched, emit the PHI to the front of endMBB.
35670 if (offsetMBB) {
35671 BuildMI(*endMBB, endMBB->begin(), MIMD,
35672 TII->get(X86::PHI), DestReg)
35673 .addReg(OffsetDestReg).addMBB(offsetMBB)
35674 .addReg(OverflowDestReg).addMBB(overflowMBB);
35675 }
35676
35677 // Erase the pseudo instruction
35678 MI.eraseFromParent();
35679
35680 return endMBB;
35681}
35682
35683// The EFLAGS operand of SelectItr might be missing a kill marker
35684// because there were multiple uses of EFLAGS, and ISel didn't know
35685// which to mark. Figure out whether SelectItr should have had a
35686// kill marker, and set it if it should. Returns the correct kill
35687// marker value.
35688static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
35689 MachineBasicBlock* BB,
35690 const TargetRegisterInfo* TRI) {
35691 if (isEFLAGSLiveAfter(SelectItr, BB))
35692 return false;
35693
35694 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
35695 // out. SelectMI should have a kill flag on EFLAGS.
35696 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
35697 return true;
35698}
35699
35700// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
35701// together with other CMOV pseudo-opcodes into a single basic-block with
35702// conditional jump around it.
35703static bool isCMOVPseudo(MachineInstr &MI) {
35704 switch (MI.getOpcode()) {
35705 case X86::CMOV_FR16:
35706 case X86::CMOV_FR16X:
35707 case X86::CMOV_FR32:
35708 case X86::CMOV_FR32X:
35709 case X86::CMOV_FR64:
35710 case X86::CMOV_FR64X:
35711 case X86::CMOV_GR8:
35712 case X86::CMOV_GR16:
35713 case X86::CMOV_GR32:
35714 case X86::CMOV_RFP32:
35715 case X86::CMOV_RFP64:
35716 case X86::CMOV_RFP80:
35717 case X86::CMOV_VR64:
35718 case X86::CMOV_VR128:
35719 case X86::CMOV_VR128X:
35720 case X86::CMOV_VR256:
35721 case X86::CMOV_VR256X:
35722 case X86::CMOV_VR512:
35723 case X86::CMOV_VK1:
35724 case X86::CMOV_VK2:
35725 case X86::CMOV_VK4:
35726 case X86::CMOV_VK8:
35727 case X86::CMOV_VK16:
35728 case X86::CMOV_VK32:
35729 case X86::CMOV_VK64:
35730 return true;
35731
35732 default:
35733 return false;
35734 }
35735}
35736
35737// Helper function, which inserts PHI functions into SinkMBB:
35738// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
35739// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
35740// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
35741// the last PHI function inserted.
35742static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
35743 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
35744 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
35745 MachineBasicBlock *SinkMBB) {
35746 MachineFunction *MF = TrueMBB->getParent();
35747 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
35748 const MIMetadata MIMD(*MIItBegin);
35749
35750 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
35751 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35752
35753 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
35754
35755 // As we are creating the PHIs, we have to be careful if there is more than
35756 // one. Later CMOVs may reference the results of earlier CMOVs, but later
35757 // PHIs have to reference the individual true/false inputs from earlier PHIs.
35758 // That also means that PHI construction must work forward from earlier to
35759 // later, and that the code must maintain a mapping from earlier PHI's
35760 // destination registers, and the registers that went into the PHI.
35761 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
35763
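// RegRewriteTable maps each PHI's destination register to the pair of
// (false-path, true-path) source registers, so a later CMOV that consumed an
// earlier CMOV's result can be rewritten to use the value live on each path.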
35764 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
35765 Register DestReg = MIIt->getOperand(0).getReg();
35766 Register Op1Reg = MIIt->getOperand(1).getReg();
35767 Register Op2Reg = MIIt->getOperand(2).getReg();
35768
35769 // If this CMOV we are generating is the opposite condition from
35770 // the jump we generated, then we have to swap the operands for the
35771 // PHI that is going to be generated.
35772 if (MIIt->getOperand(3).getImm() == OppCC)
35773 std::swap(Op1Reg, Op2Reg);
35774
35775 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
35776 Op1Reg = It->second.first;
35777
35778 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
35779 Op2Reg = It->second.second;
35780
35781 MIB =
35782 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
35783 .addReg(Op1Reg)
35784 .addMBB(FalseMBB)
35785 .addReg(Op2Reg)
35786 .addMBB(TrueMBB);
35787
35788 // Add this PHI to the rewrite table.
35789 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
35790 }
35791
35792 return MIB;
35793}
35794
35795// Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
35796MachineBasicBlock *
35797X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
35798 MachineInstr &SecondCascadedCMOV,
35799 MachineBasicBlock *ThisMBB) const {
35800 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35801 const MIMetadata MIMD(FirstCMOV);
35802
35803 // We lower cascaded CMOVs such as
35804 //
35805 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
35806 //
35807 // to two successive branches.
35808 //
35809 // Without this, we would add a PHI between the two jumps, which ends up
35810 // creating a few copies all around. For instance, for
35811 //
35812 // (sitofp (zext (fcmp une)))
35813 //
35814 // we would generate:
35815 //
35816 // ucomiss %xmm1, %xmm0
35817 // movss <1.0f>, %xmm0
35818 // movaps %xmm0, %xmm1
35819 // jne .LBB5_2
35820 // xorps %xmm1, %xmm1
35821 // .LBB5_2:
35822 // jp .LBB5_4
35823 // movaps %xmm1, %xmm0
35824 // .LBB5_4:
35825 // retq
35826 //
35827 // because this custom-inserter would have generated:
35828 //
35829 // A
35830 // | \
35831 // | B
35832 // | /
35833 // C
35834 // | \
35835 // | D
35836 // | /
35837 // E
35838 //
35839 // A: X = ...; Y = ...
35840 // B: empty
35841 // C: Z = PHI [X, A], [Y, B]
35842 // D: empty
35843 // E: PHI [X, C], [Z, D]
35844 //
35845 // If we lower both CMOVs in a single step, we can instead generate:
35846 //
35847 // A
35848 // | \
35849 // | C
35850 // | /|
35851 // |/ |
35852 // | |
35853 // | D
35854 // | /
35855 // E
35856 //
35857 // A: X = ...; Y = ...
35858 // D: empty
35859 // E: PHI [X, A], [X, C], [Y, D]
35860 //
35861 // Which, in our sitofp/fcmp example, gives us something like:
35862 //
35863 // ucomiss %xmm1, %xmm0
35864 // movss <1.0f>, %xmm0
35865 // jne .LBB5_4
35866 // jp .LBB5_4
35867 // xorps %xmm0, %xmm0
35868 // .LBB5_4:
35869 // retq
35870 //
35871
35872 // We lower cascaded CMOV into two successive branches to the same block.
35873 // EFLAGS is used by both, so mark it as live in the second.
35874 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35875 MachineFunction *F = ThisMBB->getParent();
35876 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35877 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35878 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35879
35880 MachineFunction::iterator It = ++ThisMBB->getIterator();
35881 F->insert(It, FirstInsertedMBB);
35882 F->insert(It, SecondInsertedMBB);
35883 F->insert(It, SinkMBB);
35884
35885 // For a cascaded CMOV, we lower it to two successive branches to
35886 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
35887 // the FirstInsertedMBB.
35888 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35889
35890 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35891 // live into the sink and copy blocks.
35892 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35893 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
35894 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
35895 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35896 SinkMBB->addLiveIn(X86::EFLAGS);
35897 }
35898
35899 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35900 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35901 std::next(MachineBasicBlock::iterator(FirstCMOV)),
35902 ThisMBB->end());
35903 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35904
35905 // Fallthrough block for ThisMBB.
35906 ThisMBB->addSuccessor(FirstInsertedMBB);
35907 // The true block target of the first branch is always SinkMBB.
35908 ThisMBB->addSuccessor(SinkMBB);
35909 // Fallthrough block for FirstInsertedMBB.
35910 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35911 // The true block for the branch of FirstInsertedMBB.
35912 FirstInsertedMBB->addSuccessor(SinkMBB);
35913 // This is fallthrough.
35914 SecondInsertedMBB->addSuccessor(SinkMBB);
35915
35916 // Create the conditional branch instructions.
35917 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
35918 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35919
35920 X86::CondCode SecondCC =
35921 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
35922 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
35923 .addMBB(SinkMBB)
35924 .addImm(SecondCC);
35925
35926 // SinkMBB:
35927 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
35928 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
35929 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
35930 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
35931 MachineInstrBuilder MIB =
35932 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
35933 .addReg(Op1Reg)
35934 .addMBB(SecondInsertedMBB)
35935 .addReg(Op2Reg)
35936 .addMBB(ThisMBB);
35937
35938 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
35939 // (the True operand of the SELECT_CC/CMOV nodes).
35940 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
35941
35942 // Now remove the CMOVs.
35943 FirstCMOV.eraseFromParent();
35944 SecondCascadedCMOV.eraseFromParent();
35945
35946 return SinkMBB;
35947}
35948
35949MachineBasicBlock *
35950X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
35951 MachineBasicBlock *ThisMBB) const {
35952 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35953 const MIMetadata MIMD(MI);
35954
35955 // To "insert" a SELECT_CC instruction, we actually have to insert the
35956 // diamond control-flow pattern. The incoming instruction knows the
35957 // destination vreg to set, the condition code register to branch on, the
35958 // true/false values to select between and a branch opcode to use.
35959
35960 // ThisMBB:
35961 // ...
35962 // TrueVal = ...
35963 // cmpTY ccX, r1, r2
35964 // bCC copy1MBB
35965 // fallthrough --> FalseMBB
35966
35967 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
35968 // as described above, by inserting a BB, and then making a PHI at the join
35969 // point to select the true and false operands of the CMOV in the PHI.
35970 //
35971 // The code also handles two different cases of multiple CMOV opcodes
35972 // in a row.
35973 //
35974 // Case 1:
35975 // In this case, there are multiple CMOVs in a row, all of which are based on
35976 // the same condition setting (or the exact opposite condition setting).
35977 // In this case we can lower all the CMOVs using a single inserted BB, and
35978 // then make a number of PHIs at the join point to model the CMOVs. The only
35979 // trickiness here is that in a case like:
35980 //
35981 // t2 = CMOV cond1 t1, f1
35982 // t3 = CMOV cond1 t2, f2
35983 //
35984 // when rewriting this into PHIs, we have to perform some renaming on the
35985 // temps since you cannot have a PHI operand refer to a PHI result earlier
35986 // in the same block. The "simple" but wrong lowering would be:
35987 //
35988 // t2 = PHI t1(BB1), f1(BB2)
35989 // t3 = PHI t2(BB1), f2(BB2)
35990 //
35991 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
35992 // renaming is to note that on the path through BB1, t2 is really just a
35993 // copy of t1, and do that renaming, properly generating:
35994 //
35995 // t2 = PHI t1(BB1), f1(BB2)
35996 // t3 = PHI t1(BB1), f2(BB2)
35997 //
35998 // Case 2:
35999 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36000 // function - EmitLoweredCascadedSelect.
36001
36002 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36003 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36004 MachineInstr *LastCMOV = &MI;
36006
36007 // Check for case 1, where there are multiple CMOVs with the same condition
36008 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36009 // number of jumps the most.
36010
36011 if (isCMOVPseudo(MI)) {
36012 // See if we have a string of CMOVS with the same condition. Skip over
36013 // intervening debug insts.
36014 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36015 (NextMIIt->getOperand(3).getImm() == CC ||
36016 NextMIIt->getOperand(3).getImm() == OppCC)) {
36017 LastCMOV = &*NextMIIt;
36018 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36019 }
36020 }
36021
36022 // This checks for case 2, but only if we didn't already find case 1,
36023 // as indicated by LastCMOV == &MI.
36024 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36025 NextMIIt->getOpcode() == MI.getOpcode() &&
36026 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36027 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36028 NextMIIt->getOperand(1).isKill()) {
36029 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36030 }
36031
36032 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36033 MachineFunction *F = ThisMBB->getParent();
36034 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36035 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36036
36037 MachineFunction::iterator It = ++ThisMBB->getIterator();
36038 F->insert(It, FalseMBB);
36039 F->insert(It, SinkMBB);
36040
36041 // Set the call frame size on entry to the new basic blocks.
36042 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36043 FalseMBB->setCallFrameSize(CallFrameSize);
36044 SinkMBB->setCallFrameSize(CallFrameSize);
36045
36046 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36047 // live into the sink and copy blocks.
36048 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36049 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36050 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36051 FalseMBB->addLiveIn(X86::EFLAGS);
36052 SinkMBB->addLiveIn(X86::EFLAGS);
36053 }
36054
36055 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36056 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36057 MachineBasicBlock::iterator(LastCMOV));
36058 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36059 if (MI.isDebugInstr())
36060 SinkMBB->push_back(MI.removeFromParent());
36061
36062 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36063 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36064 std::next(MachineBasicBlock::iterator(LastCMOV)),
36065 ThisMBB->end());
36066 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36067
36068 // Fallthrough block for ThisMBB.
36069 ThisMBB->addSuccessor(FalseMBB);
36070 // The true block target of the first (or only) branch is always SinkMBB.
36071 ThisMBB->addSuccessor(SinkMBB);
36072 // Fallthrough block for FalseMBB.
36073 FalseMBB->addSuccessor(SinkMBB);
36074
36075 // Create the conditional branch instruction.
36076 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36077
36078 // SinkMBB:
36079 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36080 // ...
36081 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36082 MachineBasicBlock::iterator MIItEnd =
36083 std::next(MachineBasicBlock::iterator(LastCMOV));
36084 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36085
36086 // Now remove the CMOV(s).
36087 ThisMBB->erase(MIItBegin, MIItEnd);
36088
36089 return SinkMBB;
36090}
36091
36092static unsigned getSUBriOpcode(bool IsLP64) {
36093 if (IsLP64)
36094 return X86::SUB64ri32;
36095 else
36096 return X86::SUB32ri;
36097}
36098
36099MachineBasicBlock *
36100X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36101 MachineBasicBlock *MBB) const {
36102 MachineFunction *MF = MBB->getParent();
36103 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36104 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36105 const MIMetadata MIMD(MI);
36106 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36107
36108 const unsigned ProbeSize = getStackProbeSize(*MF);
36110 MachineRegisterInfo &MRI = MF->getRegInfo();
36109
36111 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36112 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36113 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36114
36116 MF->insert(MBBIter, testMBB);
36117 MF->insert(MBBIter, blockMBB);
36118 MF->insert(MBBIter, tailMBB);
36119
36120 Register sizeVReg = MI.getOperand(1).getReg();
36121
36122 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36123
36124 Register TmpStackPtr = MRI.createVirtualRegister(
36125 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36126 Register FinalStackPtr = MRI.createVirtualRegister(
36127 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36128
36129 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36130 .addReg(physSPReg);
36131 {
36132 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36133 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36134 .addReg(TmpStackPtr)
36135 .addReg(sizeVReg);
36136 }
36137
36138 // test rsp size
36139
36140 BuildMI(testMBB, MIMD,
36141 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36142 .addReg(FinalStackPtr)
36143 .addReg(physSPReg);
36144
36145 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36146 .addMBB(tailMBB)
36148 testMBB->addSuccessor(blockMBB);
36149 testMBB->addSuccessor(tailMBB);
36150
36151 // Touch the current page, then extend the allocation. This is the opposite
36152 // order from static probing, where we allocate and then touch; it avoids the
36153 // need to probe the tail of the static alloca. Possible scenarios are:
36154 //
36155 // + ---- <- ------------ <- ------------- <- ------------ +
36156 // | |
36157 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36158 // | |
36159 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36160 //
36161 // The property we want to enforce is to never have more than [page alloc] between two probes.
36162
36163 const unsigned XORMIOpc =
36164 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36165 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36166 .addImm(0);
36167
36168 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36169 physSPReg)
36170 .addReg(physSPReg)
36171 .addImm(ProbeSize);
36172
36173 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36174 blockMBB->addSuccessor(testMBB);
36175
36176 // Replace original instruction by the expected stack ptr
36177 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36178 MI.getOperand(0).getReg())
36179 .addReg(FinalStackPtr);
36180
36181 tailMBB->splice(tailMBB->end(), MBB,
36182 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36183 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36184 MBB->addSuccessor(testMBB);
36185
36186 // Delete the original pseudo instruction.
36187 MI.eraseFromParent();
36188
36189 // And we're done.
36190 return tailMBB;
36191}
36192
36193MachineBasicBlock *
36194X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36195 MachineBasicBlock *BB) const {
36196 MachineFunction *MF = BB->getParent();
36197 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36198 const MIMetadata MIMD(MI);
36199 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36200
36201 assert(MF->shouldSplitStack());
36202
36203 const bool Is64Bit = Subtarget.is64Bit();
36204 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36205
36206 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36207 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36208
36209 // BB:
36210 // ... [Till the alloca]
36211 // If stacklet is not large enough, jump to mallocMBB
36212 //
36213 // bumpMBB:
36214 // Allocate by subtracting from RSP
36215 // Jump to continueMBB
36216 //
36217 // mallocMBB:
36218 // Allocate by call to runtime
36219 //
36220 // continueMBB:
36221 // ...
36222 // [rest of original BB]
36223 //
36224
36225 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36226 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36227 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36228
36230 const TargetRegisterClass *AddrRegClass =
36231 getRegClassFor(getPointerTy(MF->getDataLayout()));
36232
36233 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36234 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36235 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36236 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36237 sizeVReg = MI.getOperand(1).getReg(),
36238 physSPReg =
36239 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36240
36241 MachineFunction::iterator MBBIter = ++BB->getIterator();
36242
36243 MF->insert(MBBIter, bumpMBB);
36244 MF->insert(MBBIter, mallocMBB);
36245 MF->insert(MBBIter, continueMBB);
36246
36247 continueMBB->splice(continueMBB->begin(), BB,
36248 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36249 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36250
36251 // Add code to the main basic block to check if the stack limit has been hit,
36252 // and if so, jump to mallocMBB otherwise to bumpMBB.
36253 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36254 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36255 .addReg(tmpSPVReg).addReg(sizeVReg);
36256 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36257 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36258 .addReg(SPLimitVReg);
36259 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36260
36261 // bumpMBB simply decreases the stack pointer, since we know the current
36262 // stacklet has enough space.
36263 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36264 .addReg(SPLimitVReg);
36265 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36266 .addReg(SPLimitVReg);
36267 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36268
36269 // Calls into a routine in libgcc to allocate more space from the heap.
36270 const uint32_t *RegMask =
36271 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36272 if (IsLP64) {
36273 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36274 .addReg(sizeVReg);
36275 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36276 .addExternalSymbol("__morestack_allocate_stack_space")
36277 .addRegMask(RegMask)
36278 .addReg(X86::RDI, RegState::Implicit)
36279 .addReg(X86::RAX, RegState::ImplicitDefine);
36280 } else if (Is64Bit) {
36281 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36282 .addReg(sizeVReg);
36283 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36284 .addExternalSymbol("__morestack_allocate_stack_space")
36285 .addRegMask(RegMask)
36286 .addReg(X86::EDI, RegState::Implicit)
36287 .addReg(X86::EAX, RegState::ImplicitDefine);
36288 } else {
36289 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36290 .addImm(12);
36291 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36292 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36293 .addExternalSymbol("__morestack_allocate_stack_space")
36294 .addRegMask(RegMask)
36295 .addReg(X86::EAX, RegState::ImplicitDefine);
36296 }
36297
36298 if (!Is64Bit)
36299 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36300 .addImm(16);
36301
36302 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36303 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36304 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36305
36306 // Set up the CFG correctly.
36307 BB->addSuccessor(bumpMBB);
36308 BB->addSuccessor(mallocMBB);
36309 mallocMBB->addSuccessor(continueMBB);
36310 bumpMBB->addSuccessor(continueMBB);
36311
36312 // Take care of the PHI nodes.
36313 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36314 MI.getOperand(0).getReg())
36315 .addReg(mallocPtrVReg)
36316 .addMBB(mallocMBB)
36317 .addReg(bumpSPPtrVReg)
36318 .addMBB(bumpMBB);
36319
36320 // Delete the original pseudo instruction.
36321 MI.eraseFromParent();
36322
36323 // And we're done.
36324 return continueMBB;
36325}
36326
36327MachineBasicBlock *
36328X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36329 MachineBasicBlock *BB) const {
36330 MachineFunction *MF = BB->getParent();
36331 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36332 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36333 const MIMetadata MIMD(MI);
36334
36337 "SEH does not use catchret!");
36338
36339 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36340 if (!Subtarget.is32Bit())
36341 return BB;
36342
36343 // C++ EH creates a new target block to hold the restore code, and wires up
36344 // the new block to the return destination with a normal JMP_4.
36345 MachineBasicBlock *RestoreMBB =
36347 assert(BB->succ_size() == 1);
36348 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36349 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36350 BB->addSuccessor(RestoreMBB);
36351 MI.getOperand(0).setMBB(RestoreMBB);
36352
36353 // Marking this as an EH pad but not a funclet entry block causes PEI to
36354 // restore stack pointers in the block.
36355 RestoreMBB->setIsEHPad(true);
36356
36357 auto RestoreMBBI = RestoreMBB->begin();
36358 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36359 return BB;
36360}
36361
36363X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36364 MachineBasicBlock *BB) const {
36365  // This is pretty easy. We take the value we loaded from the relocation,
36366  // stick it in either RDI (x86-64) or EAX (x86-32), and make an indirect
36367  // call through it. The return value will then be in the normal return
36368  // register.
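  // Sketch of the 64-bit sequence this emits (assuming the usual Darwin TLV
  // pattern; the exact relocation comes from the operand's target flags, and
  // _var is just a placeholder name):
  //   movq  _var@TLVP(%rip), %rdi
  //   callq *(%rdi)
  //   ## the variable's address comes back in %rax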
36369 MachineFunction *F = BB->getParent();
36370 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36371 const MIMetadata MIMD(MI);
36372
36373 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36374 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36375
36376 // Get a register mask for the lowered call.
36377 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36378 // proper register mask.
36379 const uint32_t *RegMask =
36380 Subtarget.is64Bit() ?
36383 if (Subtarget.is64Bit()) {
36385 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36386 .addReg(X86::RIP)
36387 .addImm(0)
36388 .addReg(0)
36389 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36390 MI.getOperand(3).getTargetFlags())
36391 .addReg(0);
36392 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36393 addDirectMem(MIB, X86::RDI);
36394 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36395 } else if (!isPositionIndependent()) {
36397 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36398 .addReg(0)
36399 .addImm(0)
36400 .addReg(0)
36401 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36402 MI.getOperand(3).getTargetFlags())
36403 .addReg(0);
36404 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36405 addDirectMem(MIB, X86::EAX);
36406 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36407 } else {
36409 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36410 .addReg(TII->getGlobalBaseReg(F))
36411 .addImm(0)
36412 .addReg(0)
36413 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36414 MI.getOperand(3).getTargetFlags())
36415 .addReg(0);
36416 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36417 addDirectMem(MIB, X86::EAX);
36418 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36419 }
36420
36421 MI.eraseFromParent(); // The pseudo instruction is gone now.
36422 return BB;
36423}
36424
36425static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36426 switch (RPOpc) {
36427 case X86::INDIRECT_THUNK_CALL32:
36428 return X86::CALLpcrel32;
36429 case X86::INDIRECT_THUNK_CALL64:
36430 return X86::CALL64pcrel32;
36431 case X86::INDIRECT_THUNK_TCRETURN32:
36432 return X86::TCRETURNdi;
36433 case X86::INDIRECT_THUNK_TCRETURN64:
36434 return X86::TCRETURNdi64;
36435 }
36436 llvm_unreachable("not indirect thunk opcode");
36437}
36438
36439static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36440 unsigned Reg) {
36441 if (Subtarget.useRetpolineExternalThunk()) {
36442 // When using an external thunk for retpolines, we pick names that match the
36443 // names GCC happens to use as well. This helps simplify the implementation
36444 // of the thunks for kernels where they have no easy ability to create
36445 // aliases and are doing non-trivial configuration of the thunk's body. For
36446 // example, the Linux kernel will do boot-time hot patching of the thunk
36447 // bodies and cannot easily export aliases of these to loaded modules.
36448 //
36449 // Note that at any point in the future, we may need to change the semantics
36450 // of how we implement retpolines and at that time will likely change the
36451 // name of the called thunk. Essentially, there is no hard guarantee that
36452 // LLVM will generate calls to specific thunks; we merely make a best-effort
36453 // attempt to help out kernels and other systems where duplicating the
36454 // thunks is costly.
36455 switch (Reg) {
36456 case X86::EAX:
36457 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36458 return "__x86_indirect_thunk_eax";
36459 case X86::ECX:
36460 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36461 return "__x86_indirect_thunk_ecx";
36462 case X86::EDX:
36463 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36464 return "__x86_indirect_thunk_edx";
36465 case X86::EDI:
36466 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36467 return "__x86_indirect_thunk_edi";
36468 case X86::R11:
36469 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36470 return "__x86_indirect_thunk_r11";
36471 }
36472 llvm_unreachable("unexpected reg for external indirect thunk");
36473 }
36474
36475 if (Subtarget.useRetpolineIndirectCalls() ||
36476 Subtarget.useRetpolineIndirectBranches()) {
36477 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36478 switch (Reg) {
36479 case X86::EAX:
36480 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36481 return "__llvm_retpoline_eax";
36482 case X86::ECX:
36483 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36484 return "__llvm_retpoline_ecx";
36485 case X86::EDX:
36486 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36487 return "__llvm_retpoline_edx";
36488 case X86::EDI:
36489 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36490 return "__llvm_retpoline_edi";
36491 case X86::R11:
36492 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36493 return "__llvm_retpoline_r11";
36494 }
36495 llvm_unreachable("unexpected reg for retpoline");
36496 }
36497
36498 if (Subtarget.useLVIControlFlowIntegrity()) {
36499 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36500 return "__llvm_lvi_thunk_r11";
36501 }
36502 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36503}
36504
36506X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36507 MachineBasicBlock *BB) const {
36508 // Copy the virtual register into the R11 physical register and
36509 // call the retpoline thunk.
36510 const MIMetadata MIMD(MI);
36511 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36512 Register CalleeVReg = MI.getOperand(0).getReg();
36513 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36514
36515 // Find an available scratch register to hold the callee. On 64-bit, we can
36516 // just use R11, but we scan for uses anyway to ensure we don't generate
36517 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36518 // already a register use operand to the call to hold the callee. If none
36519 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36520 // register and ESI is the base pointer to realigned stack frames with VLAs.
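  // For example (sketch, an INDIRECT_THUNK_CALL64 with retpolines enabled):
  //   movq  %<callee vreg>, %r11
  //   callq __llvm_retpoline_r11    ; or __x86_indirect_thunk_r11 /
  //                                 ; __llvm_lvi_thunk_r11, per the feature in use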
36521 SmallVector<unsigned, 3> AvailableRegs;
36522 if (Subtarget.is64Bit())
36523 AvailableRegs.push_back(X86::R11);
36524 else
36525 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36526
36527 // Zero out any registers that are already used.
36528 for (const auto &MO : MI.operands()) {
36529 if (MO.isReg() && MO.isUse())
36530 llvm::replace(AvailableRegs, static_cast<unsigned>(MO.getReg()), 0U);
36531 }
36532
36533 // Choose the first remaining non-zero available register.
36534 unsigned AvailableReg = 0;
36535 for (unsigned MaybeReg : AvailableRegs) {
36536 if (MaybeReg) {
36537 AvailableReg = MaybeReg;
36538 break;
36539 }
36540 }
36541 if (!AvailableReg)
36542 report_fatal_error("calling convention incompatible with retpoline, no "
36543 "available registers");
36544
36545 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36546
36547 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36548 .addReg(CalleeVReg);
36549 MI.getOperand(0).ChangeToES(Symbol);
36550 MI.setDesc(TII->get(Opc));
36552 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36553 return BB;
36554}
36555
36556/// SetJmp implies future control flow change upon calling the corresponding
36557/// LongJmp.
36558/// Instead of using the 'return' instruction, the long jump fixes the stack and
36559/// performs an indirect branch. To do so, it uses the registers that were stored
36560/// in the jump buffer (when calling SetJmp).
36561/// If the shadow stack is enabled, we need to fix it as well, because some
36562/// return addresses will be skipped.
36563/// The function will save the SSP for future fixing in the function
36564/// emitLongJmpShadowStackFix.
36565/// \sa emitLongJmpShadowStackFix
36566/// \param [in] MI The temporary Machine Instruction for the builtin.
36567/// \param [in] MBB The Machine Basic Block that will be modified.
36568void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36569 MachineBasicBlock *MBB) const {
36570 const MIMetadata MIMD(MI);
36571 MachineFunction *MF = MBB->getParent();
36572 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36575
36576 // Memory Reference.
36577 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36578
36579 // Initialize a register with zero.
36580 MVT PVT = getPointerTy(MF->getDataLayout());
36581 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36582 Register ZReg = MRI.createVirtualRegister(PtrRC);
36583 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36584 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
36585 .addDef(ZReg)
36586 .addReg(ZReg, RegState::Undef)
36587 .addReg(ZReg, RegState::Undef);
36588
36589 // Read the current SSP Register value to the zeroed register.
36590 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36591 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36592 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36593
36594  // Write the SSP register value to slot 3 of the input memory buffer.
36595 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36596 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
36597 const int64_t SSPOffset = 3 * PVT.getStoreSize();
36598 const unsigned MemOpndSlot = 1;
36599 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36600 if (i == X86::AddrDisp)
36601 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
36602 else
36603 MIB.add(MI.getOperand(MemOpndSlot + i));
36604 }
36605 MIB.addReg(SSPCopyReg);
36606 MIB.setMemRefs(MMOs);
36607}
36608
36610X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
36611 MachineBasicBlock *MBB) const {
36612 const MIMetadata MIMD(MI);
36613 MachineFunction *MF = MBB->getParent();
36614 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36615 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36617
36618 const BasicBlock *BB = MBB->getBasicBlock();
36620
36621 // Memory Reference
36622 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36623
36624 unsigned DstReg;
36625 unsigned MemOpndSlot = 0;
36626
36627 unsigned CurOp = 0;
36628
36629 DstReg = MI.getOperand(CurOp++).getReg();
36630 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36631 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
36632 (void)TRI;
36633 Register mainDstReg = MRI.createVirtualRegister(RC);
36634 Register restoreDstReg = MRI.createVirtualRegister(RC);
36635
36636 MemOpndSlot = CurOp;
36637
36638 MVT PVT = getPointerTy(MF->getDataLayout());
36639 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36640 "Invalid Pointer Size!");
36641
36642 // For v = setjmp(buf), we generate
36643 //
36644 // thisMBB:
36645 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
36646 // SjLjSetup restoreMBB
36647 //
36648 // mainMBB:
36649 // v_main = 0
36650 //
36651 // sinkMBB:
36652 // v = phi(main, restore)
36653 //
36654 // restoreMBB:
36655 // if base pointer being used, load it from frame
36656 // v_restore = 1
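  // The address of restoreMBB is what gets stored at buf[LabelOffset] below, so
  // a later longjmp can branch back into it indirectly; that is also why the
  // block is marked address-taken.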
36657
36658 MachineBasicBlock *thisMBB = MBB;
36659 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36660 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36661 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
36662 MF->insert(I, mainMBB);
36663 MF->insert(I, sinkMBB);
36664 MF->push_back(restoreMBB);
36665 restoreMBB->setMachineBlockAddressTaken();
36666
36668
36669 // Transfer the remainder of BB and its successor edges to sinkMBB.
36670 sinkMBB->splice(sinkMBB->begin(), MBB,
36671 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36673
36674 // thisMBB:
36675 unsigned PtrStoreOpc = 0;
36676 unsigned LabelReg = 0;
36677 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36678 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36680
36681 // Prepare IP either in reg or imm.
36682 if (!UseImmLabel) {
36683 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36684 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36685 LabelReg = MRI.createVirtualRegister(PtrRC);
36686 if (Subtarget.is64Bit()) {
36687 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
36688 .addReg(X86::RIP)
36689 .addImm(0)
36690 .addReg(0)
36691 .addMBB(restoreMBB)
36692 .addReg(0);
36693 } else {
36694 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
36695 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
36696 .addReg(XII->getGlobalBaseReg(MF))
36697 .addImm(0)
36698 .addReg(0)
36699 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
36700 .addReg(0);
36701 }
36702 } else
36703 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36704 // Store IP
36705 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
36706 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36707 if (i == X86::AddrDisp)
36708 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
36709 else
36710 MIB.add(MI.getOperand(MemOpndSlot + i));
36711 }
36712 if (!UseImmLabel)
36713 MIB.addReg(LabelReg);
36714 else
36715 MIB.addMBB(restoreMBB);
36716 MIB.setMemRefs(MMOs);
36717
36718 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36719 emitSetJmpShadowStackFix(MI, thisMBB);
36720 }
36721
36722 // Setup
36723 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
36724 .addMBB(restoreMBB);
36725
36726 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36727 MIB.addRegMask(RegInfo->getNoPreservedMask());
36728 thisMBB->addSuccessor(mainMBB);
36729 thisMBB->addSuccessor(restoreMBB);
36730
36731 // mainMBB:
36732 // EAX = 0
36733 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
36734 mainMBB->addSuccessor(sinkMBB);
36735
36736 // sinkMBB:
36737 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
36738 .addReg(mainDstReg)
36739 .addMBB(mainMBB)
36740 .addReg(restoreDstReg)
36741 .addMBB(restoreMBB);
36742
36743 // restoreMBB:
36744 if (RegInfo->hasBasePointer(*MF)) {
36745 const bool Uses64BitFramePtr =
36746 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36748 X86FI->setRestoreBasePointer(MF);
36749 Register FramePtr = RegInfo->getFrameRegister(*MF);
36750 Register BasePtr = RegInfo->getBaseRegister();
36751 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
36752 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
36753 FramePtr, true, X86FI->getRestoreBasePointerOffset())
36755 }
36756 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
36757 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36758 restoreMBB->addSuccessor(sinkMBB);
36759
36760 MI.eraseFromParent();
36761 return sinkMBB;
36762}
36763
36764/// Fix the shadow stack using the previously saved SSP pointer.
36765/// \sa emitSetJmpShadowStackFix
36766/// \param [in] MI The temporary Machine Instruction for the builtin.
36767/// \param [in] MBB The Machine Basic Block that will be modified.
36768/// \return The sink MBB that will perform the future indirect branch.
36770X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
36771 MachineBasicBlock *MBB) const {
36772 const MIMetadata MIMD(MI);
36773 MachineFunction *MF = MBB->getParent();
36774 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36776
36777 // Memory Reference
36778 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36779
36780 MVT PVT = getPointerTy(MF->getDataLayout());
36781 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36782
36783 // checkSspMBB:
36784 // xor vreg1, vreg1
36785 // rdssp vreg1
36786 // test vreg1, vreg1
36787 // je sinkMBB # Jump if Shadow Stack is not supported
36788 // fallMBB:
36789 // mov buf+24/12(%rip), vreg2
36790 // sub vreg1, vreg2
36791 // jbe sinkMBB # No need to fix the Shadow Stack
36792 // fixShadowMBB:
36793 // shr 3/2, vreg2
36794 // incssp vreg2 # fix the SSP according to the lower 8 bits
36795 // shr 8, vreg2
36796 // je sinkMBB
36797 // fixShadowLoopPrepareMBB:
36798 // shl vreg2
36799 // mov 128, vreg3
36800 // fixShadowLoopMBB:
36801 // incssp vreg3
36802 // dec vreg2
36803 // jne fixShadowLoopMBB # Iterate until you finish fixing
36804 // # the Shadow Stack
36805 // sinkMBB:
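  // Worked example of the arithmetic above (64-bit, 8-byte shadow-stack
  // entries; numbers chosen purely for illustration): a delta of 0x1508 bytes
  // is 0x2A1 entries after the shr by 3. The first incssp consumes the low
  // 8 bits (0xA1 entries); 0x2A1 >> 8 == 2 is then doubled to 4, and the loop
  // runs 4 iterations of incssp 128, covering the remaining 0x200 entries.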
36806
36808 const BasicBlock *BB = MBB->getBasicBlock();
36809
36810 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
36811 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36812 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
36813 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
36814 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
36815 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36816 MF->insert(I, checkSspMBB);
36817 MF->insert(I, fallMBB);
36818 MF->insert(I, fixShadowMBB);
36819 MF->insert(I, fixShadowLoopPrepareMBB);
36820 MF->insert(I, fixShadowLoopMBB);
36821 MF->insert(I, sinkMBB);
36822
36823 // Transfer the remainder of BB and its successor edges to sinkMBB.
36824 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
36825 MBB->end());
36827
36828 MBB->addSuccessor(checkSspMBB);
36829
36830 // Initialize a register with zero.
36831 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
36832 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
36833
36834 if (PVT == MVT::i64) {
36835 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
36836 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36837 .addImm(0)
36838 .addReg(ZReg)
36839 .addImm(X86::sub_32bit);
36840 ZReg = TmpZReg;
36841 }
36842
36843 // Read the current SSP Register value to the zeroed register.
36844 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36845 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36846 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36847
36848  // Check whether the SSP value read above is zero and, if so, jump directly
36849  // to the sink (the shadow stack is not supported).
36850 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
36851 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
36852 .addReg(SSPCopyReg)
36853 .addReg(SSPCopyReg);
36854 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
36855 .addMBB(sinkMBB)
36857 checkSspMBB->addSuccessor(sinkMBB);
36858 checkSspMBB->addSuccessor(fallMBB);
36859
36860 // Reload the previously saved SSP register value.
36861 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
36862 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36863 const int64_t SPPOffset = 3 * PVT.getStoreSize();
36865 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
36866 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36867 const MachineOperand &MO = MI.getOperand(i);
36868 if (i == X86::AddrDisp)
36869 MIB.addDisp(MO, SPPOffset);
36870 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36871 // preserve kill flags.
36872 MIB.addReg(MO.getReg());
36873 else
36874 MIB.add(MO);
36875 }
36876 MIB.setMemRefs(MMOs);
36877
36878 // Subtract the current SSP from the previous SSP.
36879 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
36880 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
36881 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
36882 .addReg(PrevSSPReg)
36883 .addReg(SSPCopyReg);
36884
36885  // Jump to the sink in case PrevSSPReg <= SSPCopyReg; there is nothing to fix.
36886 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
36887 .addMBB(sinkMBB)
36889 fallMBB->addSuccessor(sinkMBB);
36890 fallMBB->addSuccessor(fixShadowMBB);
36891
36892  // Shift right by 2 (32-bit) or 3 (64-bit), because incssp multiplies its argument by 4 or 8.
36893 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
36894 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
36895 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
36896 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
36897 .addReg(SspSubReg)
36898 .addImm(Offset);
36899
36900  // Increase the SSP using only the lower 8 bits of the delta.
36901 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
36902 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36903
36904 // Reset the lower 8 bits.
36905 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
36906 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
36907 .addReg(SspFirstShrReg)
36908 .addImm(8);
36909
36910 // Jump if the result of the shift is zero.
36911 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
36912 .addMBB(sinkMBB)
36914 fixShadowMBB->addSuccessor(sinkMBB);
36915 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36916
36917 // Do a single shift left.
36918 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
36919 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
36920 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
36921 .addReg(SspSecondShrReg)
36922 .addImm(1);
36923
36924 // Save the value 128 to a register (will be used next with incssp).
36925 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
36926 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
36927 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
36928 .addImm(128);
36929 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36930
36931 // Since incssp only looks at the lower 8 bits, we might need to do several
36932 // iterations of incssp until we finish fixing the shadow stack.
36933 Register DecReg = MRI.createVirtualRegister(PtrRC);
36934 Register CounterReg = MRI.createVirtualRegister(PtrRC);
36935 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
36936 .addReg(SspAfterShlReg)
36937 .addMBB(fixShadowLoopPrepareMBB)
36938 .addReg(DecReg)
36939 .addMBB(fixShadowLoopMBB);
36940
36941 // Every iteration we increase the SSP by 128.
36942 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
36943
36944 // Every iteration we decrement the counter by 1.
36945 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
36946 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
36947
36948 // Jump if the counter is not zero yet.
36949 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
36950 .addMBB(fixShadowLoopMBB)
36952 fixShadowLoopMBB->addSuccessor(sinkMBB);
36953 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36954
36955 return sinkMBB;
36956}
36957
36959X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
36960 MachineBasicBlock *MBB) const {
36961 const MIMetadata MIMD(MI);
36962 MachineFunction *MF = MBB->getParent();
36963 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36965
36966 // Memory Reference
36967 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36968
36969 MVT PVT = getPointerTy(MF->getDataLayout());
36970 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36971 "Invalid Pointer Size!");
36972
36973 const TargetRegisterClass *RC =
36974 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36975 Register Tmp = MRI.createVirtualRegister(RC);
36976 // Since FP is only updated here but NOT referenced, it's treated as GPR.
36977 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36978 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
36979 Register SP = RegInfo->getStackRegister();
36980
36982
36983 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36984 const int64_t SPOffset = 2 * PVT.getStoreSize();
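  // Layout of the jump buffer as used by this lowering, in pointer-sized
  // slots: slot 0 = frame pointer, slot 1 = resume IP (LabelOffset),
  // slot 2 = stack pointer (SPOffset), slot 3 = shadow stack pointer
  // (only touched when "cf-protection-return" is set).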
36985
36986 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36987 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
36988
36989 MachineBasicBlock *thisMBB = MBB;
36990
36991 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
36992 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36993 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
36994 }
36995
36996 // Reload FP
36997 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
36998 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36999 const MachineOperand &MO = MI.getOperand(i);
37000 if (MO.isReg()) // Don't add the whole operand, we don't want to
37001 // preserve kill flags.
37002 MIB.addReg(MO.getReg());
37003 else
37004 MIB.add(MO);
37005 }
37006 MIB.setMemRefs(MMOs);
37008
37009 // Reload IP
37010 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37011 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37012 const MachineOperand &MO = MI.getOperand(i);
37013 if (i == X86::AddrDisp)
37014 MIB.addDisp(MO, LabelOffset);
37015 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37016 // preserve kill flags.
37017 MIB.addReg(MO.getReg());
37018 else
37019 MIB.add(MO);
37020 }
37021 MIB.setMemRefs(MMOs);
37022
37023 // Reload SP
37024 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37025 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37026 if (i == X86::AddrDisp)
37027 MIB.addDisp(MI.getOperand(i), SPOffset);
37028 else
37029 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37030 // the last instruction of the expansion.
37031 }
37032 MIB.setMemRefs(MMOs);
37034
37035 // Jump
37036 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37037
37038 MI.eraseFromParent();
37039 return thisMBB;
37040}
37041
37042void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37044 MachineBasicBlock *DispatchBB,
37045 int FI) const {
37046 const MIMetadata MIMD(MI);
37047 MachineFunction *MF = MBB->getParent();
37049 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37050
37051 MVT PVT = getPointerTy(MF->getDataLayout());
37052 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37053
37054 unsigned Op = 0;
37055 unsigned VR = 0;
37056
37057 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37059
37060 if (UseImmLabel) {
37061 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37062 } else {
37063 const TargetRegisterClass *TRC =
37064 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37065 VR = MRI->createVirtualRegister(TRC);
37066 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37067
37068 if (Subtarget.is64Bit())
37069 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37070 .addReg(X86::RIP)
37071 .addImm(1)
37072 .addReg(0)
37073 .addMBB(DispatchBB)
37074 .addReg(0);
37075 else
37076 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37077 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37078 .addImm(1)
37079 .addReg(0)
37080 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37081 .addReg(0);
37082 }
37083
37084 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37085 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37086 if (UseImmLabel)
37087 MIB.addMBB(DispatchBB);
37088 else
37089 MIB.addReg(VR);
37090}
37091
37093X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37094 MachineBasicBlock *BB) const {
37095 const MIMetadata MIMD(MI);
37096 MachineFunction *MF = BB->getParent();
37098 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37099 int FI = MF->getFrameInfo().getFunctionContextIndex();
37100
37101 // Get a mapping of the call site numbers to all of the landing pads they're
37102 // associated with.
37104 unsigned MaxCSNum = 0;
37105 for (auto &MBB : *MF) {
37106 if (!MBB.isEHPad())
37107 continue;
37108
37109 MCSymbol *Sym = nullptr;
37110 for (const auto &MI : MBB) {
37111 if (MI.isDebugInstr())
37112 continue;
37113
37114 assert(MI.isEHLabel() && "expected EH_LABEL");
37115 Sym = MI.getOperand(0).getMCSymbol();
37116 break;
37117 }
37118
37119 if (!MF->hasCallSiteLandingPad(Sym))
37120 continue;
37121
37122 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37123 CallSiteNumToLPad[CSI].push_back(&MBB);
37124 MaxCSNum = std::max(MaxCSNum, CSI);
37125 }
37126 }
37127
37128 // Get an ordered list of the machine basic blocks for the jump table.
37129 std::vector<MachineBasicBlock *> LPadList;
37131 LPadList.reserve(CallSiteNumToLPad.size());
37132
37133 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37134 for (auto &LP : CallSiteNumToLPad[CSI]) {
37135 LPadList.push_back(LP);
37136 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37137 }
37138 }
37139
37140 assert(!LPadList.empty() &&
37141 "No landing pad destinations for the dispatch jump table!");
37142
37143 // Create the MBBs for the dispatch code.
37144
37145 // Shove the dispatch's address into the return slot in the function context.
37146 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37147 DispatchBB->setIsEHPad(true);
37148
37149 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37150 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37151 DispatchBB->addSuccessor(TrapBB);
37152
37153 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37154 DispatchBB->addSuccessor(DispContBB);
37155
37156 // Insert MBBs.
37157 MF->push_back(DispatchBB);
37158 MF->push_back(DispContBB);
37159 MF->push_back(TrapBB);
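  // Roughly, the dispatch code built below does (sketch):
  //   DispatchBB:  idx = call-site index loaded from the function context
  //                if idx is out of range for the jump table, goto TrapBB
  //   DispContBB:  indirect jump through jump-table entry idx to the landing pad
  //   TrapBB:      trap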
37160
37161 // Insert code into the entry block that creates and registers the function
37162 // context.
37163 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37164
37165 // Create the jump table and associated information
37166 unsigned JTE = getJumpTableEncoding();
37167 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37168 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37169
37170 const X86RegisterInfo &RI = TII->getRegisterInfo();
37171 // Add a register mask with no preserved registers. This results in all
37172 // registers being marked as clobbered.
37173 if (RI.hasBasePointer(*MF)) {
37174 const bool FPIs64Bit =
37175 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37176 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37177 MFI->setRestoreBasePointer(MF);
37178
37179 Register FP = RI.getFrameRegister(*MF);
37180 Register BP = RI.getBaseRegister();
37181 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37182 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37185 } else {
37186 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37188 }
37189
37190 // IReg is used as an index in a memory operand and therefore can't be SP
37191 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37192 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37193 Subtarget.is64Bit() ? 8 : 4);
37194 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37195 .addReg(IReg)
37196 .addImm(LPadList.size());
37197 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37198 .addMBB(TrapBB)
37200
37201 if (Subtarget.is64Bit()) {
37202 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37203 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37204
37205 // leaq .LJTI0_0(%rip), BReg
37206 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37207 .addReg(X86::RIP)
37208 .addImm(1)
37209 .addReg(0)
37210 .addJumpTableIndex(MJTI)
37211 .addReg(0);
37212 // movzx IReg64, IReg
37213 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37214 .addImm(0)
37215 .addReg(IReg)
37216 .addImm(X86::sub_32bit);
37217
37218 switch (JTE) {
37220 // jmpq *(BReg,IReg64,8)
37221 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37222 .addReg(BReg)
37223 .addImm(8)
37224 .addReg(IReg64)
37225 .addImm(0)
37226 .addReg(0);
37227 break;
37229 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37230 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37231 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37232
37233 // movl (BReg,IReg64,4), OReg
37234 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37235 .addReg(BReg)
37236 .addImm(4)
37237 .addReg(IReg64)
37238 .addImm(0)
37239 .addReg(0);
37240 // movsx OReg64, OReg
37241 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37242 .addReg(OReg);
37243 // addq BReg, OReg64, TReg
37244 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37245 .addReg(OReg64)
37246 .addReg(BReg);
37247 // jmpq *TReg
37248 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37249 break;
37250 }
37251 default:
37252 llvm_unreachable("Unexpected jump table encoding");
37253 }
37254 } else {
37255 // jmpl *.LJTI0_0(,IReg,4)
37256 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37257 .addReg(0)
37258 .addImm(4)
37259 .addReg(IReg)
37260 .addJumpTableIndex(MJTI)
37261 .addReg(0);
37262 }
37263
37264 // Add the jump table entries as successors to the MBB.
37266 for (auto &LP : LPadList)
37267 if (SeenMBBs.insert(LP).second)
37268 DispContBB->addSuccessor(LP);
37269
37270 // N.B. the order the invoke BBs are processed in doesn't matter here.
37272 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37273 for (MachineBasicBlock *MBB : InvokeBBs) {
37274 // Remove the landing pad successor from the invoke block and replace it
37275 // with the new dispatch block.
37276 // Keep a copy of Successors since it's modified inside the loop.
37278 MBB->succ_rend());
37279 // FIXME: Avoid quadratic complexity.
37280 for (auto *MBBS : Successors) {
37281 if (MBBS->isEHPad()) {
37282 MBB->removeSuccessor(MBBS);
37283 MBBLPads.push_back(MBBS);
37284 }
37285 }
37286
37287 MBB->addSuccessor(DispatchBB);
37288
37289 // Find the invoke call and mark all of the callee-saved registers as
37290 // 'implicit defined' so that they're spilled. This prevents code from
37291 // moving instructions to before the EH block, where they will never be
37292 // executed.
37293 for (auto &II : reverse(*MBB)) {
37294 if (!II.isCall())
37295 continue;
37296
37298 for (auto &MOp : II.operands())
37299 if (MOp.isReg())
37300 DefRegs[MOp.getReg()] = true;
37301
37302 MachineInstrBuilder MIB(*MF, &II);
37303 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37304 unsigned Reg = SavedRegs[RegIdx];
37305 if (!DefRegs[Reg])
37307 }
37308
37309 break;
37310 }
37311 }
37312
37313 // Mark all former landing pads as non-landing pads. The dispatch is the only
37314 // landing pad now.
37315 for (auto &LP : MBBLPads)
37316 LP->setIsEHPad(false);
37317
37318 // The instruction is gone now.
37319 MI.eraseFromParent();
37320 return BB;
37321}
37322
37324X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37325 MachineBasicBlock *BB) const {
37326 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37327 // calls may require proper stack alignment.
37328 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37329 const MIMetadata MIMD(MI);
37330 MachineFunction &MF = *BB->getParent();
37331
37332 // Emit CALLSEQ_START right before the instruction.
37333 MF.getFrameInfo().setAdjustsStack(true);
37334 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37335 MachineInstrBuilder CallseqStart =
37336 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37337 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37338
37339 // Emit CALLSEQ_END right after the instruction.
37340 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37341 MachineInstrBuilder CallseqEnd =
37342 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37343 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37344
37345 return BB;
37346}
37347
37350 MachineBasicBlock *BB) const {
37351 MachineFunction *MF = BB->getParent();
37352 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37353 const MIMetadata MIMD(MI);
37354
37355 auto TMMImmToTMMReg = [](unsigned Imm) {
37356 assert (Imm < 8 && "Illegal tmm index");
37357 return X86::TMM0 + Imm;
37358 };
37359 auto TMMImmToTMMPair = [](unsigned Imm) {
37360 assert(Imm < 8 && "Illegal tmm pair index.");
37361 return X86::TMM0_TMM1 + Imm / 2;
37362 };
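  // E.g. TMMImmToTMMReg(3) yields X86::TMM3, and TMMImmToTMMPair(4) yields the
  // X86::TMM4_TMM5 pair (assuming the tile and tile-pair registers are
  // enumerated consecutively, which is what the arithmetic above relies on).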
37363 switch (MI.getOpcode()) {
37364 default:
37365 llvm_unreachable("Unexpected instr type to insert");
37366 case X86::INDIRECT_THUNK_CALL32:
37367 case X86::INDIRECT_THUNK_CALL64:
37368 case X86::INDIRECT_THUNK_TCRETURN32:
37369 case X86::INDIRECT_THUNK_TCRETURN64:
37370 return EmitLoweredIndirectThunk(MI, BB);
37371 case X86::CATCHRET:
37372 return EmitLoweredCatchRet(MI, BB);
37373 case X86::SEG_ALLOCA_32:
37374 case X86::SEG_ALLOCA_64:
37375 return EmitLoweredSegAlloca(MI, BB);
37376 case X86::PROBED_ALLOCA_32:
37377 case X86::PROBED_ALLOCA_64:
37378 return EmitLoweredProbedAlloca(MI, BB);
37379 case X86::TLSCall_32:
37380 case X86::TLSCall_64:
37381 return EmitLoweredTLSCall(MI, BB);
37382 case X86::CMOV_FR16:
37383 case X86::CMOV_FR16X:
37384 case X86::CMOV_FR32:
37385 case X86::CMOV_FR32X:
37386 case X86::CMOV_FR64:
37387 case X86::CMOV_FR64X:
37388 case X86::CMOV_GR8:
37389 case X86::CMOV_GR16:
37390 case X86::CMOV_GR32:
37391 case X86::CMOV_RFP32:
37392 case X86::CMOV_RFP64:
37393 case X86::CMOV_RFP80:
37394 case X86::CMOV_VR64:
37395 case X86::CMOV_VR128:
37396 case X86::CMOV_VR128X:
37397 case X86::CMOV_VR256:
37398 case X86::CMOV_VR256X:
37399 case X86::CMOV_VR512:
37400 case X86::CMOV_VK1:
37401 case X86::CMOV_VK2:
37402 case X86::CMOV_VK4:
37403 case X86::CMOV_VK8:
37404 case X86::CMOV_VK16:
37405 case X86::CMOV_VK32:
37406 case X86::CMOV_VK64:
37407 return EmitLoweredSelect(MI, BB);
37408
37409 case X86::FP80_ADDr:
37410 case X86::FP80_ADDm32: {
37411 // Change the floating point control register to use double extended
37412 // precision when performing the addition.
37413 int OrigCWFrameIdx =
37414 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37415 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37416 OrigCWFrameIdx);
37417
37418 // Load the old value of the control word...
37419 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37420 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37421 OrigCWFrameIdx);
37422
37423    // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37424    // precision.
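    // Bits 8-9 of the x87 control word are the precision-control (PC) field,
    // so OR-ing in 0x300 selects 64-bit (double extended) precision while
    // leaving the rounding-control bits untouched.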
37425 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37426 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37427 .addReg(OldCW, RegState::Kill)
37428 .addImm(0x300);
37429
37430 // Extract to 16 bits.
37431 Register NewCW16 =
37432 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37433 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37434 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37435
37436 // Prepare memory for FLDCW.
37437 int NewCWFrameIdx =
37438 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37439 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37440 NewCWFrameIdx)
37441 .addReg(NewCW16, RegState::Kill);
37442
37443 // Reload the modified control word now...
37444 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37445 NewCWFrameIdx);
37446
37447 // Do the addition.
37448 if (MI.getOpcode() == X86::FP80_ADDr) {
37449 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37450 .add(MI.getOperand(0))
37451 .add(MI.getOperand(1))
37452 .add(MI.getOperand(2));
37453 } else {
37454 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37455 .add(MI.getOperand(0))
37456 .add(MI.getOperand(1))
37457 .add(MI.getOperand(2))
37458 .add(MI.getOperand(3))
37459 .add(MI.getOperand(4))
37460 .add(MI.getOperand(5))
37461 .add(MI.getOperand(6));
37462 }
37463
37464 // Reload the original control word now.
37465 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37466 OrigCWFrameIdx);
37467
37468 MI.eraseFromParent(); // The pseudo instruction is gone now.
37469 return BB;
37470 }
37471
37472 case X86::FP32_TO_INT16_IN_MEM:
37473 case X86::FP32_TO_INT32_IN_MEM:
37474 case X86::FP32_TO_INT64_IN_MEM:
37475 case X86::FP64_TO_INT16_IN_MEM:
37476 case X86::FP64_TO_INT32_IN_MEM:
37477 case X86::FP64_TO_INT64_IN_MEM:
37478 case X86::FP80_TO_INT16_IN_MEM:
37479 case X86::FP80_TO_INT32_IN_MEM:
37480 case X86::FP80_TO_INT64_IN_MEM: {
37481 // Change the floating point control register to use "round towards zero"
37482 // mode when truncating to an integer value.
37483 int OrigCWFrameIdx =
37484 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37485 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37486 OrigCWFrameIdx);
37487
37488 // Load the old value of the control word...
37489 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37490 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37491 OrigCWFrameIdx);
37492
37493    // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
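    // Bits 10-11 of the x87 control word are the rounding-control (RC) field,
    // so OR-ing in 0xC00 selects truncation, which is what the FIST* store
    // emitted below relies on.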
37494 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37495 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37496 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37497
37498 // Extract to 16 bits.
37499 Register NewCW16 =
37500 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37501 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37502 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37503
37504 // Prepare memory for FLDCW.
37505 int NewCWFrameIdx =
37506 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37507 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37508 NewCWFrameIdx)
37509 .addReg(NewCW16, RegState::Kill);
37510
37511 // Reload the modified control word now...
37512 addFrameReference(BuildMI(*BB, MI, MIMD,
37513 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37514
37515 // Get the X86 opcode to use.
37516 unsigned Opc;
37517 switch (MI.getOpcode()) {
37518 // clang-format off
37519 default: llvm_unreachable("illegal opcode!");
37520 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37521 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37522 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37523 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37524 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37525 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37526 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37527 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37528 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37529 // clang-format on
37530 }
37531
37533 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37534 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37535
37536 // Reload the original control word now.
37537 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37538 OrigCWFrameIdx);
37539
37540 MI.eraseFromParent(); // The pseudo instruction is gone now.
37541 return BB;
37542 }
37543
37544 // xbegin
37545 case X86::XBEGIN:
37546 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37547
37548 case X86::VAARG_64:
37549 case X86::VAARG_X32:
37550 return EmitVAARGWithCustomInserter(MI, BB);
37551
37552 case X86::EH_SjLj_SetJmp32:
37553 case X86::EH_SjLj_SetJmp64:
37554 return emitEHSjLjSetJmp(MI, BB);
37555
37556 case X86::EH_SjLj_LongJmp32:
37557 case X86::EH_SjLj_LongJmp64:
37558 return emitEHSjLjLongJmp(MI, BB);
37559
37560 case X86::Int_eh_sjlj_setup_dispatch:
37561 return EmitSjLjDispatchBlock(MI, BB);
37562
37563 case TargetOpcode::STATEPOINT:
37564 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37565 // this point in the process. We diverge later.
37566 return emitPatchPoint(MI, BB);
37567
37568 case TargetOpcode::STACKMAP:
37569 case TargetOpcode::PATCHPOINT:
37570 return emitPatchPoint(MI, BB);
37571
37572 case TargetOpcode::PATCHABLE_EVENT_CALL:
37573 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37574 return emitPatchableEventCall(MI, BB);
37575
37576 case X86::LCMPXCHG8B: {
37577 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37578    // In addition to the four E[ABCD] registers implied by the encoding, CMPXCHG8B
37579    // requires a memory operand. If the current architecture happens to be
37580    // i686 and the current function needs a base pointer
37581    // - which is ESI on i686 - the register allocator would not be able to
37582    // allocate registers for an address of the form X(%reg, %reg, Y):
37583    // there would never be enough unreserved registers during regalloc
37584    // (without the need for a base pointer the only option would be X(%edi, %esi, Y)).
37585    // We give the register allocator a hand by precomputing the address in
37586    // a new vreg using LEA.
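    // Sketch: rather than leaving regalloc to materialize
    //   cmpxchg8b disp(%base, %index, scale)
    // with EAX/EBX/ECX/EDX already tied up, the code below emits
    //   leal disp(%base, %index, scale), %vreg
    //   cmpxchg8b (%vreg)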
37587
37588 // If it is not i686 or there is no base pointer - nothing to do here.
37589 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37590 return BB;
37591
37592    // Even though this code does not necessarily need the base pointer to
37593    // be ESI, we check for that. The reason: if this assert fails, something
37594    // has changed in the compiler's base pointer handling, which most
37595    // probably has to be addressed here.
37596 assert(TRI->getBaseRegister() == X86::ESI &&
37597 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
37598 "base pointer in mind");
37599
37601 MVT SPTy = getPointerTy(MF->getDataLayout());
37602 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
37603 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
37604
37606 // Regalloc does not need any help when the memory operand of CMPXCHG8B
37607    // does not use an index register.
37608 if (AM.IndexReg == X86::NoRegister)
37609 return BB;
37610
37611 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
37612 // four operand definitions that are E[ABCD] registers. We skip them and
37613 // then insert the LEA.
37614 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
37615 while (RMBBI != BB->rend() &&
37616 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
37617 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
37618 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
37619 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
37620 ++RMBBI;
37621 }
37624 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
37625
37626 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
37627
37628 return BB;
37629 }
37630 case X86::LCMPXCHG16B_NO_RBX: {
37631 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37632 Register BasePtr = TRI->getBaseRegister();
37633 if (TRI->hasBasePointer(*MF) &&
37634 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
37635 if (!BB->isLiveIn(BasePtr))
37636 BB->addLiveIn(BasePtr);
37637 // Save RBX into a virtual register.
37638 Register SaveRBX =
37639 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37640 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37641 .addReg(X86::RBX);
37642 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37644 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
37645 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37646 MIB.add(MI.getOperand(Idx));
37647 MIB.add(MI.getOperand(X86::AddrNumOperands));
37648 MIB.addReg(SaveRBX);
37649 } else {
37650 // Simple case, just copy the virtual register to RBX.
37651 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
37652 .add(MI.getOperand(X86::AddrNumOperands));
37654 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
37655 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37656 MIB.add(MI.getOperand(Idx));
37657 }
37658 MI.eraseFromParent();
37659 return BB;
37660 }
37661 case X86::MWAITX: {
37662 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37663 Register BasePtr = TRI->getBaseRegister();
37664 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
37665    // If there is no need to save the base pointer, we generate MWAITXrrr;
37666    // otherwise we generate the pseudo MWAITX_SAVE_RBX.
37667 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
37668 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37669 .addReg(MI.getOperand(0).getReg());
37670 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37671 .addReg(MI.getOperand(1).getReg());
37672 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
37673 .addReg(MI.getOperand(2).getReg());
37674 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
37675 MI.eraseFromParent();
37676 } else {
37677 if (!BB->isLiveIn(BasePtr)) {
37678 BB->addLiveIn(BasePtr);
37679 }
37680 // Parameters can be copied into ECX and EAX but not EBX yet.
37681 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37682 .addReg(MI.getOperand(0).getReg());
37683 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37684 .addReg(MI.getOperand(1).getReg());
37685 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
37686 // Save RBX into a virtual register.
37687 Register SaveRBX =
37688 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37689 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37690 .addReg(X86::RBX);
37691 // Generate mwaitx pseudo.
37692 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37693 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
37694 .addDef(Dst) // Destination tied in with SaveRBX.
37695 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
37696 .addUse(SaveRBX); // Save of base pointer.
37697 MI.eraseFromParent();
37698 }
37699 return BB;
37700 }
37701 case TargetOpcode::PREALLOCATED_SETUP: {
37702 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
37703 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37704 MFI->setHasPreallocatedCall(true);
37705 int64_t PreallocatedId = MI.getOperand(0).getImm();
37706 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
37707 assert(StackAdjustment != 0 && "0 stack adjustment");
37708 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
37709 << StackAdjustment << "\n");
37710 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
37711 .addReg(X86::ESP)
37712 .addImm(StackAdjustment);
37713 MI.eraseFromParent();
37714 return BB;
37715 }
37716 case TargetOpcode::PREALLOCATED_ARG: {
37717 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
37718 int64_t PreallocatedId = MI.getOperand(1).getImm();
37719 int64_t ArgIdx = MI.getOperand(2).getImm();
37720 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37721 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
37722 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
37723 << ", arg offset " << ArgOffset << "\n");
37724 // stack pointer + offset
37725 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
37726 MI.getOperand(0).getReg()),
37727 X86::ESP, false, ArgOffset);
37728 MI.eraseFromParent();
37729 return BB;
37730 }
37731 case X86::PTDPBSSD:
37732 case X86::PTDPBSUD:
37733 case X86::PTDPBUSD:
37734 case X86::PTDPBUUD:
37735 case X86::PTDPBF16PS:
37736 case X86::PTDPFP16PS:
37737 case X86::PTCMMIMFP16PS:
37738 case X86::PTCMMRLFP16PS:
37739 case X86::PTDPBF8PS:
37740 case X86::PTDPBHF8PS:
37741 case X86::PTDPHBF8PS:
37742 case X86::PTDPHF8PS:
37743 case X86::PTTDPBF16PS:
37744 case X86::PTTDPFP16PS:
37745 case X86::PTTCMMIMFP16PS:
37746 case X86::PTTCMMRLFP16PS:
37747 case X86::PTCONJTCMMIMFP16PS:
37748 case X86::PTMMULTF32PS:
37749 case X86::PTTMMULTF32PS: {
37750 unsigned Opc;
37751 switch (MI.getOpcode()) {
37752 default: llvm_unreachable("illegal opcode!");
37753 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
37754 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
37755 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
37756 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
37757 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
37758 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
37759 case X86::PTCMMIMFP16PS:
37760 Opc = X86::TCMMIMFP16PS;
37761 break;
37762 case X86::PTCMMRLFP16PS:
37763 Opc = X86::TCMMRLFP16PS;
37764 break;
37765 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
37766 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
37767 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
37768 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
37769 case X86::PTTDPBF16PS:
37770 Opc = X86::TTDPBF16PS;
37771 break;
37772 case X86::PTTDPFP16PS:
37773 Opc = X86::TTDPFP16PS;
37774 break;
37775 case X86::PTTCMMIMFP16PS:
37776 Opc = X86::TTCMMIMFP16PS;
37777 break;
37778 case X86::PTTCMMRLFP16PS:
37779 Opc = X86::TTCMMRLFP16PS;
37780 break;
37781 case X86::PTCONJTCMMIMFP16PS:
37782 Opc = X86::TCONJTCMMIMFP16PS;
37783 break;
37784 case X86::PTMMULTF32PS:
37785 Opc = X86::TMMULTF32PS;
37786 break;
37787 case X86::PTTMMULTF32PS:
37788 Opc = X86::TTMMULTF32PS;
37789 break;
37790 }
37791
37792 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37793 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37794 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
37795 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37796 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
37797
37798 MI.eraseFromParent(); // The pseudo is gone now.
37799 return BB;
37800 }
37801 case X86::PTILEZERO: {
37802 unsigned Imm = MI.getOperand(0).getImm();
37803 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
37804 MI.eraseFromParent(); // The pseudo is gone now.
37805 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37807 return BB;
37808 }
37809 case X86::PTILEZEROV: {
37810 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37812 return BB;
37813 }
37814 case X86::PTILELOADDRS:
37815 case X86::PTILELOADDRST1:
37816 case X86::PTILELOADD:
37817 case X86::PTILELOADDT1:
37818 case X86::PTILESTORED: {
37819 unsigned Opc;
37820 switch (MI.getOpcode()) {
37821 default: llvm_unreachable("illegal opcode!");
37822#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
37823 case X86::PTILELOADD:
37824 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
37825 break;
37826 case X86::PTILELOADDT1:
37827 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
37828 break;
37829 case X86::PTILESTORED:
37830 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
37831 break;
37832 case X86::PTILELOADDRS:
37833 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
37834 break;
37835 case X86::PTILELOADDRST1:
37836 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
37837 break;
37838 }
37839#undef GET_EGPR_IF_ENABLED
37840
37841 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37842 unsigned CurOp = 0;
37843 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
37844 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37846
37847 MIB.add(MI.getOperand(CurOp++)); // base
37848 MIB.add(MI.getOperand(CurOp++)); // scale
37849 MIB.add(MI.getOperand(CurOp++)); // index -- stride
37850 MIB.add(MI.getOperand(CurOp++)); // displacement
37851 MIB.add(MI.getOperand(CurOp++)); // segment
37852
37853 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
37854 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37856
37857 MI.eraseFromParent(); // The pseudo is gone now.
37858 return BB;
37859 }
37860 case X86::PT2RPNTLVWZ0:
37861 case X86::PT2RPNTLVWZ0T1:
37862 case X86::PT2RPNTLVWZ1:
37863 case X86::PT2RPNTLVWZ1T1:
37864 case X86::PT2RPNTLVWZ0RS:
37865 case X86::PT2RPNTLVWZ0RST1:
37866 case X86::PT2RPNTLVWZ1RS:
37867 case X86::PT2RPNTLVWZ1RST1: {
37868 const DebugLoc &DL = MI.getDebugLoc();
37869 unsigned Opc;
37870#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
37871 switch (MI.getOpcode()) {
37872 default:
37873 llvm_unreachable("Unexpected instruction!");
37874 case X86::PT2RPNTLVWZ0:
37875 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
37876 break;
37877 case X86::PT2RPNTLVWZ0T1:
37878 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
37879 break;
37880 case X86::PT2RPNTLVWZ1:
37881 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
37882 break;
37883 case X86::PT2RPNTLVWZ1T1:
37884 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
37885 break;
37886 case X86::PT2RPNTLVWZ0RS:
37887 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
37888 break;
37889 case X86::PT2RPNTLVWZ0RST1:
37890 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
37891 break;
37892 case X86::PT2RPNTLVWZ1RS:
37893 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
37894 break;
37895 case X86::PT2RPNTLVWZ1RST1:
37896 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
37897 break;
37898 }
37899#undef GET_EGPR_IF_ENABLED
37900 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37901 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
37902
37903 MIB.add(MI.getOperand(1)); // base
37904 MIB.add(MI.getOperand(2)); // scale
37905 MIB.add(MI.getOperand(3)); // index
37906 MIB.add(MI.getOperand(4)); // displacement
37907 MIB.add(MI.getOperand(5)); // segment
37908 MI.eraseFromParent(); // The pseudo is gone now.
37909 return BB;
37910 }
37911 case X86::PTTRANSPOSED:
37912 case X86::PTCONJTFP16: {
37913 const DebugLoc &DL = MI.getDebugLoc();
37914 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
37915 : X86::TCONJTFP16;
37916
37917 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37918 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37919 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37920
37921 MI.eraseFromParent(); // The pseudo is gone now.
37922 return BB;
37923 }
37924 case X86::PTCVTROWPS2BF16Hrri:
37925 case X86::PTCVTROWPS2BF16Lrri:
37926 case X86::PTCVTROWPS2PHHrri:
37927 case X86::PTCVTROWPS2PHLrri:
37928 case X86::PTCVTROWD2PSrri:
37929 case X86::PTILEMOVROWrri: {
37930 const DebugLoc &DL = MI.getDebugLoc();
37931 unsigned Opc;
37932 switch (MI.getOpcode()) {
37933 default:
37934 llvm_unreachable("Unexpected instruction!");
37935 case X86::PTCVTROWD2PSrri:
37936 Opc = X86::TCVTROWD2PSrri;
37937 break;
37938 case X86::PTCVTROWPS2BF16Hrri:
37939 Opc = X86::TCVTROWPS2BF16Hrri;
37940 break;
37941 case X86::PTCVTROWPS2PHHrri:
37942 Opc = X86::TCVTROWPS2PHHrri;
37943 break;
37944 case X86::PTCVTROWPS2BF16Lrri:
37945 Opc = X86::TCVTROWPS2BF16Lrri;
37946 break;
37947 case X86::PTCVTROWPS2PHLrri:
37948 Opc = X86::TCVTROWPS2PHLrri;
37949 break;
37950 case X86::PTILEMOVROWrri:
37951 Opc = X86::TILEMOVROWrri;
37952 break;
37953 }
37954 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37955 MIB.add(MI.getOperand(0));
37956 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37957 MIB.addImm(MI.getOperand(2).getImm());
37958
37959 MI.eraseFromParent(); // The pseudo is gone now.
37960 return BB;
37961 }
37962 case X86::PTCVTROWPS2BF16Hrre:
37963 case X86::PTCVTROWPS2BF16Lrre:
37964 case X86::PTCVTROWPS2PHHrre:
37965 case X86::PTCVTROWPS2PHLrre:
37966 case X86::PTCVTROWD2PSrre:
37967 case X86::PTILEMOVROWrre: {
37968 const DebugLoc &DL = MI.getDebugLoc();
37969 unsigned Opc;
37970 switch (MI.getOpcode()) {
37971 default:
37972 llvm_unreachable("Unexpected instruction!");
37973 case X86::PTCVTROWD2PSrre:
37974 Opc = X86::TCVTROWD2PSrre;
37975 break;
37976 case X86::PTCVTROWPS2BF16Hrre:
37977 Opc = X86::TCVTROWPS2BF16Hrre;
37978 break;
37979 case X86::PTCVTROWPS2BF16Lrre:
37980 Opc = X86::TCVTROWPS2BF16Lrre;
37981 break;
37982 case X86::PTCVTROWPS2PHHrre:
37983 Opc = X86::TCVTROWPS2PHHrre;
37984 break;
37985 case X86::PTCVTROWPS2PHLrre:
37986 Opc = X86::TCVTROWPS2PHLrre;
37987 break;
37988 case X86::PTILEMOVROWrre:
37989 Opc = X86::TILEMOVROWrre;
37990 break;
37991 }
37992 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37993 MIB.add(MI.getOperand(0));
37994 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37995 MIB.add(MI.getOperand(2));
37996
37997 MI.eraseFromParent(); // The pseudo is gone now.
37998 return BB;
37999 }
38000 }
38001}
38002
38003//===----------------------------------------------------------------------===//
38004// X86 Optimization Hooks
38005//===----------------------------------------------------------------------===//
38006
38007bool
38008 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38009 const APInt &DemandedBits,
38010 const APInt &DemandedElts,
38011 TargetLoweringOpt &TLO) const {
38012 EVT VT = Op.getValueType();
38013 unsigned Opcode = Op.getOpcode();
38014 unsigned EltSize = VT.getScalarSizeInBits();
38015
38016 if (VT.isVector()) {
38017 // If the constant is only all signbits in the active bits, then we should
38018 // extend it to the entire constant to allow it to act as a boolean constant
38019 // vector.
38020 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38021 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38022 return false;
38023 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38024 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38025 continue;
38026 const APInt &Val = V.getConstantOperandAPInt(i);
38027 if (Val.getBitWidth() > Val.getNumSignBits() &&
38028 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38029 return true;
38030 }
38031 return false;
38032 };
38033 // For vectors - if we have a constant, then try to sign extend.
38034 // TODO: Handle AND cases.
38035 unsigned ActiveBits = DemandedBits.getActiveBits();
38036 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38037 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38038 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38039 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38040 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38041 VT.getVectorNumElements());
38042 SDValue NewC =
38043 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38044 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38045 SDValue NewOp =
38046 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38047 return TLO.CombineTo(Op, NewOp);
38048 }
38049 return false;
38050 }
38051
38052 // Only optimize Ands to prevent shrinking a constant that could be
38053 // matched by movzx.
38054 if (Opcode != ISD::AND)
38055 return false;
38056
38057 // Make sure the RHS really is a constant.
38058 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38059 if (!C)
38060 return false;
38061
38062 const APInt &Mask = C->getAPIntValue();
38063
38064 // Clear all non-demanded bits initially.
38065 APInt ShrunkMask = Mask & DemandedBits;
38066
38067 // Find the width of the shrunk mask.
38068 unsigned Width = ShrunkMask.getActiveBits();
38069
38070 // If the mask is all 0s there's nothing to do here.
38071 if (Width == 0)
38072 return false;
38073
38074 // Find the next power of 2 width, rounding up to a byte.
38075 Width = llvm::bit_ceil(std::max(Width, 8U));
38076 // Truncate the width to size to handle illegal types.
38077 Width = std::min(Width, EltSize);
38078
38079 // Calculate a possible zero extend mask for this constant.
38080 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
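// e.g. with Mask = 0xF0 and only bits 4-7 demanded, ShrunkMask = 0xF0 rounds up
// to an 8-bit width, so ZeroExtendMask = 0xFF; the extra low bits are not
// demanded, and an AND with 0xFF can later be matched by a movzx-style zero
// extension instead of an awkward immediate.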
38081
38082 // If we aren't changing the mask, just return true to keep it and prevent
38083 // the caller from optimizing.
38084 if (ZeroExtendMask == Mask)
38085 return true;
38086
38087 // Make sure the new mask can be represented by a combination of mask bits
38088 // and non-demanded bits.
38089 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38090 return false;
38091
38092 // Replace the constant with the zero extend mask.
38093 SDLoc DL(Op);
38094 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38095 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38096 return TLO.CombineTo(Op, NewOp);
38097}
38098
38099 static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38100 KnownBits &Known,
38101 const APInt &DemandedElts,
38102 const SelectionDAG &DAG, unsigned Depth) {
38103 KnownBits Known2;
38104 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38105 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38106 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38107 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38108 Known = KnownBits::abdu(Known, Known2).zext(16);
38109 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
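// Each absolute byte difference fits in 8 bits, so the full sum is at most
// 8 * 255 = 2040 and always fits in the zero-extended 16-bit value; the three
// NSW/NUW doublings below model that balanced reduction tree without overflow.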
38110 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38111 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38112 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38113 Known = Known.zext(64);
38114}
38115
38116 static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38117 KnownBits &Known,
38118 const APInt &DemandedElts,
38119 const SelectionDAG &DAG,
38120 unsigned Depth) {
38121 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38122
38123 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38124 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38125 APInt DemandedLoElts =
38126 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38127 APInt DemandedHiElts =
38128 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38129 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38130 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38131 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38132 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38133 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38134 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
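// Each i16*i16 product fits in i32, but the pairwise sum can still wrap
// (e.g. (-32768 * -32768) + (-32768 * -32768) == 2^31), so neither NSW nor
// NUW can be assumed for the final add.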
38135 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38136}
38137
38138 static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38139 KnownBits &Known,
38140 const APInt &DemandedElts,
38141 const SelectionDAG &DAG,
38142 unsigned Depth) {
38143 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38144
38145 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38146 // pairs.
38147 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38148 APInt DemandedLoElts =
38149 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38150 APInt DemandedHiElts =
38151 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38152 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38153 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38154 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38155 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38156 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38157 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
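// Each u8*s8 product lies in [-32640, 32385] and fits in i16; the Lo/Hi pair
// is then combined with signed saturation, matching the instruction's
// add-with-saturation behaviour (hence sadd_sat below).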
38158 Known = KnownBits::sadd_sat(Lo, Hi);
38159}
38160
38161 static KnownBits computeKnownBitsForHorizontalOperation(
38162 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38163 const SelectionDAG &DAG,
38164 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38165 KnownBitsFunc) {
38166 APInt DemandedEltsLHS, DemandedEltsRHS;
38167 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38168 DemandedElts, DemandedEltsLHS,
38169 DemandedEltsRHS);
38170
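// DemandedEltsLHS/RHS select the even source element of each demanded output
// pair; the same mask shifted left by one selects its odd partner, so the
// single-operand helper below feeds both halves of each pair to KnownBitsFunc.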
38171 const auto ComputeForSingleOpFunc =
38172 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38173 return KnownBitsFunc(
38174 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38175 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38176 };
38177
38178 if (DemandedEltsRHS.isZero())
38179 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38180 if (DemandedEltsLHS.isZero())
38181 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38182
38183 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38184 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38185}
38186
38187 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38188 KnownBits &Known,
38189 const APInt &DemandedElts,
38190 const SelectionDAG &DAG,
38191 unsigned Depth) const {
38192 unsigned BitWidth = Known.getBitWidth();
38193 unsigned NumElts = DemandedElts.getBitWidth();
38194 unsigned Opc = Op.getOpcode();
38195 EVT VT = Op.getValueType();
38196 assert((Opc >= ISD::BUILTIN_OP_END ||
38197 Opc == ISD::INTRINSIC_WO_CHAIN ||
38198 Opc == ISD::INTRINSIC_W_CHAIN ||
38199 Opc == ISD::INTRINSIC_VOID) &&
38200 "Should use MaskedValueIsZero if you don't know whether Op"
38201 " is a target node!");
38202
38203 Known.resetAll();
38204 switch (Opc) {
38205 default: break;
38206 case X86ISD::MUL_IMM: {
38207 KnownBits Known2;
38208 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38209 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38210 Known = KnownBits::mul(Known, Known2);
38211 break;
38212 }
38213 case X86ISD::BSF: {
38214 Known.Zero.setBitsFrom(Log2_32(BitWidth));
38215
38216 KnownBits Known2;
38217 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38218 if (Known2.isNonZero()) {
38219 // If we have a known 1, its position is our upper bound.
38220 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38221 unsigned LowBits = llvm::bit_width(PossibleTZ);
38222 Known.Zero.setBitsFrom(LowBits);
38223 } else if (!Op.getOperand(0).isUndef()) {
38224 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38225 Known = Known.intersectWith(Known2);
38226 }
38227 break;
38228 }
38229 case X86ISD::BSR: {
38230 // TODO: Bound with input known bits?
38231 Known.Zero.setBitsFrom(Log2_32(BitWidth));
38232
38233 if (!Op.getOperand(0).isUndef() &&
38234 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38235 KnownBits Known2;
38236 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38237 Known = Known.intersectWith(Known2);
38238 }
38239 break;
38240 }
38241 case X86ISD::SETCC:
38242 Known.Zero.setBitsFrom(1);
38243 break;
38244 case X86ISD::MOVMSK: {
38245 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38246 Known.Zero.setBitsFrom(NumLoBits);
38247 break;
38248 }
38249 case X86ISD::PEXTRB:
38250 case X86ISD::PEXTRW: {
38251 SDValue Src = Op.getOperand(0);
38252 EVT SrcVT = Src.getValueType();
38253 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38254 Op.getConstantOperandVal(1));
38255 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38256 Known = Known.anyextOrTrunc(BitWidth);
38257 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38258 break;
38259 }
38260 case X86ISD::VSRAI:
38261 case X86ISD::VSHLI:
38262 case X86ISD::VSRLI: {
38263 unsigned ShAmt = Op.getConstantOperandVal(1);
38264 if (ShAmt >= VT.getScalarSizeInBits()) {
38265 // Out of range logical bit shifts are guaranteed to be zero.
38266 // Out of range arithmetic bit shifts splat the sign bit.
38267 if (Opc != X86ISD::VSRAI) {
38268 Known.setAllZero();
38269 break;
38270 }
38271
38272 ShAmt = VT.getScalarSizeInBits() - 1;
38273 }
38274
38275 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38276 if (Opc == X86ISD::VSHLI) {
38277 Known.Zero <<= ShAmt;
38278 Known.One <<= ShAmt;
38279 // Low bits are known zero.
38280 Known.Zero.setLowBits(ShAmt);
38281 } else if (Opc == X86ISD::VSRLI) {
38282 Known.Zero.lshrInPlace(ShAmt);
38283 Known.One.lshrInPlace(ShAmt);
38284 // High bits are known zero.
38285 Known.Zero.setHighBits(ShAmt);
38286 } else {
38287 Known.Zero.ashrInPlace(ShAmt);
38288 Known.One.ashrInPlace(ShAmt);
38289 }
38290 break;
38291 }
38292 case X86ISD::PACKUS: {
38293 // PACKUS is just a truncation if the upper half is zero.
38294 APInt DemandedLHS, DemandedRHS;
38295 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38296
38297 Known.One = APInt::getAllOnes(BitWidth * 2);
38298 Known.Zero = APInt::getAllOnes(BitWidth * 2);
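// Start in the "conflict" all-known state on the double-width source elements;
// it acts as the identity for the intersections below, and the truncation is
// only kept if the upper half of every demanded source element is known zero.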
38299
38300 KnownBits Known2;
38301 if (!!DemandedLHS) {
38302 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38303 Known = Known.intersectWith(Known2);
38304 }
38305 if (!!DemandedRHS) {
38306 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38307 Known = Known.intersectWith(Known2);
38308 }
38309
38310 if (Known.countMinLeadingZeros() < BitWidth)
38311 Known.resetAll();
38312 Known = Known.trunc(BitWidth);
38313 break;
38314 }
38315 case X86ISD::PSHUFB: {
38316 SDValue Src = Op.getOperand(0);
38317 SDValue Idx = Op.getOperand(1);
38318
38319 // If the index vector is never negative (MSB is zero), then all elements
38320 // come from the source vector. This is useful for cases where
38321 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38322 // below will handle the more common constant shuffle mask case.
38323 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38324 if (KnownIdx.isNonNegative())
38325 Known = DAG.computeKnownBits(Src, Depth + 1);
38326 break;
38327 }
38328 case X86ISD::VBROADCAST: {
38329 SDValue Src = Op.getOperand(0);
38330 if (!Src.getSimpleValueType().isVector()) {
38331 Known = DAG.computeKnownBits(Src, Depth + 1);
38332 return;
38333 }
38334 break;
38335 }
38336 case X86ISD::AND: {
38337 if (Op.getResNo() == 0) {
38338 KnownBits Known2;
38339 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38340 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38341 Known &= Known2;
38342 }
38343 break;
38344 }
38345 case X86ISD::ANDNP: {
38346 KnownBits Known2;
38347 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38348 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38349
38350 // ANDNP = (~X & Y);
38351 Known.One &= Known2.Zero;
38352 Known.Zero |= Known2.One;
38353 break;
38354 }
38355 case X86ISD::FOR: {
38356 KnownBits Known2;
38357 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38358 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38359
38360 Known |= Known2;
38361 break;
38362 }
38363 case X86ISD::PSADBW: {
38364 SDValue LHS = Op.getOperand(0);
38365 SDValue RHS = Op.getOperand(1);
38366 assert(VT.getScalarType() == MVT::i64 &&
38367 LHS.getValueType() == RHS.getValueType() &&
38368 LHS.getValueType().getScalarType() == MVT::i8 &&
38369 "Unexpected PSADBW types");
38370 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38371 break;
38372 }
38373 case X86ISD::PCMPGT:
38374 case X86ISD::PCMPEQ: {
38375 KnownBits KnownLhs =
38376 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38377 KnownBits KnownRhs =
38378 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38379 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38380 ? KnownBits::eq(KnownLhs, KnownRhs)
38381 : KnownBits::sgt(KnownLhs, KnownRhs);
38382 if (Res) {
38383 if (*Res)
38384 Known.setAllOnes();
38385 else
38386 Known.setAllZero();
38387 }
38388 break;
38389 }
38390 case X86ISD::VPMADDWD: {
38391 SDValue LHS = Op.getOperand(0);
38392 SDValue RHS = Op.getOperand(1);
38393 assert(VT.getVectorElementType() == MVT::i32 &&
38394 LHS.getValueType() == RHS.getValueType() &&
38395 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38396 "Unexpected PMADDWD types");
38397 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38398 break;
38399 }
38400 case X86ISD::VPMADDUBSW: {
38401 SDValue LHS = Op.getOperand(0);
38402 SDValue RHS = Op.getOperand(1);
38403 assert(VT.getVectorElementType() == MVT::i16 &&
38404 LHS.getValueType() == RHS.getValueType() &&
38405 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38406 "Unexpected PMADDUBSW types");
38407 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38408 break;
38409 }
38410 case X86ISD::PMULUDQ: {
38411 KnownBits Known2;
38412 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38413 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38414
38415 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38416 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38417 Known = KnownBits::mul(Known, Known2);
38418 break;
38419 }
38420 case X86ISD::CMOV: {
38421 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38422 // If we don't know any bits, early out.
38423 if (Known.isUnknown())
38424 break;
38425 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38426
38427 // Only known if known in both the LHS and RHS.
38428 Known = Known.intersectWith(Known2);
38429 break;
38430 }
38431 case X86ISD::BEXTR:
38432 case X86ISD::BEXTRI: {
38433 SDValue Op0 = Op.getOperand(0);
38434 SDValue Op1 = Op.getOperand(1);
38435
38436 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38437 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38438 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
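// e.g. a control value of 0x0410 encodes Shift = 0x10 = 16 and Length = 4,
// extracting bits 16-19 of Op0 into the low bits of the result.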
38439
38440 // If the length is 0, the result is 0.
38441 if (Length == 0) {
38442 Known.setAllZero();
38443 break;
38444 }
38445
38446 if ((Shift + Length) <= BitWidth) {
38447 Known = DAG.computeKnownBits(Op0, Depth + 1);
38448 Known = Known.extractBits(Length, Shift);
38449 Known = Known.zextOrTrunc(BitWidth);
38450 }
38451 }
38452 break;
38453 }
38454 case X86ISD::PDEP: {
38455 KnownBits Known2;
38456 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38457 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38458 // Zeros are retained from the mask operand. But not ones.
38459 Known.One.clearAllBits();
38460 // The result will have at least as many trailing zeros as the non-mask
38461 // operand since bits can only map to the same or higher bit position.
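// e.g. PDEP(src, 0b11110000) deposits the low four source bits into bits 4-7,
// so the result keeps at least the mask's and the source's trailing zeros.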
38462 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38463 break;
38464 }
38465 case X86ISD::PEXT: {
38466 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38467 // The result has at least as many leading zeros as there are zeros in the mask.
38468 unsigned Count = Known.Zero.popcount();
38469 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38470 Known.One.clearAllBits();
38471 break;
38472 }
38473 case X86ISD::VTRUNC:
38474 case X86ISD::VTRUNCS:
38475 case X86ISD::VTRUNCUS:
38476 case X86ISD::CVTSI2P:
38477 case X86ISD::CVTUI2P:
38478 case X86ISD::CVTP2SI:
38479 case X86ISD::CVTP2UI:
38480 case X86ISD::MCVTP2SI:
38481 case X86ISD::MCVTP2UI:
38482 case X86ISD::CVTTP2SI:
38483 case X86ISD::CVTTP2UI:
38484 case X86ISD::MCVTTP2SI:
38485 case X86ISD::MCVTTP2UI:
38486 case X86ISD::MCVTSI2P:
38487 case X86ISD::MCVTUI2P:
38488 case X86ISD::VFPROUND:
38489 case X86ISD::VMFPROUND:
38490 case X86ISD::CVTPS2PH:
38491 case X86ISD::MCVTPS2PH:
38492 case X86ISD::MCVTTP2SIS:
38493 case X86ISD::MCVTTP2UIS: {
38494 // Truncations/Conversions - upper elements are known zero.
38495 EVT SrcVT = Op.getOperand(0).getValueType();
38496 if (SrcVT.isVector()) {
38497 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38498 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38499 Known.setAllZero();
38500 }
38501 break;
38502 }
38503 case X86ISD::STRICT_CVTTP2SI:
38504 case X86ISD::STRICT_CVTTP2UI:
38505 case X86ISD::STRICT_CVTSI2P:
38506 case X86ISD::STRICT_CVTUI2P:
38507 case X86ISD::STRICT_VFPROUND:
38508 case X86ISD::STRICT_CVTPS2PH: {
38509 // Strict Conversions - upper elements are known zero.
38510 EVT SrcVT = Op.getOperand(1).getValueType();
38511 if (SrcVT.isVector()) {
38512 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38513 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38514 Known.setAllZero();
38515 }
38516 break;
38517 }
38518 case X86ISD::MOVQ2DQ: {
38519 // Move from MMX to XMM. Upper half of XMM should be 0.
38520 if (DemandedElts.countr_zero() >= (NumElts / 2))
38521 Known.setAllZero();
38522 break;
38523 }
38525 APInt UndefElts;
38526 SmallVector<APInt, 16> EltBits;
38527 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38528 /*AllowWholeUndefs*/ false,
38529 /*AllowPartialUndefs*/ false)) {
38530 Known.Zero.setAllBits();
38531 Known.One.setAllBits();
38532 for (unsigned I = 0; I != NumElts; ++I) {
38533 if (!DemandedElts[I])
38534 continue;
38535 if (UndefElts[I]) {
38536 Known.resetAll();
38537 break;
38538 }
38539 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38540 Known = Known.intersectWith(Known2);
38541 }
38542 return;
38543 }
38544 break;
38545 }
38546 case X86ISD::HADD:
38547 case X86ISD::HSUB: {
38548 Known = computeKnownBitsForHorizontalOperation(
38549 Op, DemandedElts, Depth, DAG,
38550 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38551 return KnownBits::computeForAddSub(
38552 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38553 KnownLHS, KnownRHS);
38554 });
38555 break;
38556 }
38557 case ISD::INTRINSIC_WO_CHAIN: {
38558 switch (Op->getConstantOperandVal(0)) {
38559 case Intrinsic::x86_sse2_pmadd_wd:
38560 case Intrinsic::x86_avx2_pmadd_wd:
38561 case Intrinsic::x86_avx512_pmaddw_d_512: {
38562 SDValue LHS = Op.getOperand(1);
38563 SDValue RHS = Op.getOperand(2);
38564 assert(VT.getScalarType() == MVT::i32 &&
38565 LHS.getValueType() == RHS.getValueType() &&
38566 LHS.getValueType().getScalarType() == MVT::i16 &&
38567 "Unexpected PMADDWD types");
38568 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38569 break;
38570 }
38571 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38572 case Intrinsic::x86_avx2_pmadd_ub_sw:
38573 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38574 SDValue LHS = Op.getOperand(1);
38575 SDValue RHS = Op.getOperand(2);
38576 assert(VT.getScalarType() == MVT::i16 &&
38577 LHS.getValueType() == RHS.getValueType() &&
38578 LHS.getValueType().getScalarType() == MVT::i8 &&
38579 "Unexpected PMADDUBSW types");
38580 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38581 break;
38582 }
38583 case Intrinsic::x86_sse2_psad_bw:
38584 case Intrinsic::x86_avx2_psad_bw:
38585 case Intrinsic::x86_avx512_psad_bw_512: {
38586 SDValue LHS = Op.getOperand(1);
38587 SDValue RHS = Op.getOperand(2);
38588 assert(VT.getScalarType() == MVT::i64 &&
38589 LHS.getValueType() == RHS.getValueType() &&
38590 LHS.getValueType().getScalarType() == MVT::i8 &&
38591 "Unexpected PSADBW types");
38592 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38593 break;
38594 }
38595 }
38596 break;
38597 }
38598 }
38599
38600 // Handle target shuffles.
38601 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38602 if (isTargetShuffle(Opc)) {
38603 SmallVector<int, 64> Mask;
38604 SmallVector<SDValue, 2> Ops;
38605 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
38606 unsigned NumOps = Ops.size();
38607 unsigned NumElts = VT.getVectorNumElements();
38608 if (Mask.size() == NumElts) {
38609 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38610 Known.Zero.setAllBits(); Known.One.setAllBits();
38611 for (unsigned i = 0; i != NumElts; ++i) {
38612 if (!DemandedElts[i])
38613 continue;
38614 int M = Mask[i];
38615 if (M == SM_SentinelUndef) {
38616 // For UNDEF elements, we don't know anything about the common state
38617 // of the shuffle result.
38618 Known.resetAll();
38619 break;
38620 }
38621 if (M == SM_SentinelZero) {
38622 Known.One.clearAllBits();
38623 continue;
38624 }
38625 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38626 "Shuffle index out of range");
38627
38628 unsigned OpIdx = (unsigned)M / NumElts;
38629 unsigned EltIdx = (unsigned)M % NumElts;
38630 if (Ops[OpIdx].getValueType() != VT) {
38631 // TODO - handle target shuffle ops with different value types.
38632 Known.resetAll();
38633 break;
38634 }
38635 DemandedOps[OpIdx].setBit(EltIdx);
38636 }
38637 // Known bits are the values that are shared by every demanded element.
38638 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38639 if (!DemandedOps[i])
38640 continue;
38641 KnownBits Known2 =
38642 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38643 Known = Known.intersectWith(Known2);
38644 }
38645 }
38646 }
38647 }
38648}
38649
38650 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38651 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38652 unsigned Depth) const {
38653 EVT VT = Op.getValueType();
38654 unsigned VTBits = VT.getScalarSizeInBits();
38655 unsigned Opcode = Op.getOpcode();
38656 switch (Opcode) {
38657 case X86ISD::SETCC_CARRY:
38658 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38659 return VTBits;
38660
38661 case X86ISD::VTRUNC: {
38662 SDValue Src = Op.getOperand(0);
38663 MVT SrcVT = Src.getSimpleValueType();
38664 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38665 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38666 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38667 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38668 if (Tmp > (NumSrcBits - VTBits))
38669 return Tmp - (NumSrcBits - VTBits);
38670 return 1;
38671 }
38672
38673 case X86ISD::PACKSS: {
38674 // PACKSS is just a truncation if the sign bits extend to the packed size.
38675 APInt DemandedLHS, DemandedRHS;
38676 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38677 DemandedRHS);
38678
38679 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
38680 // patterns often used to compact vXi64 allsignbit patterns.
38681 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
38682 SDValue BC = peekThroughBitcasts(V);
38683 if (BC.getOpcode() == X86ISD::PACKSS &&
38684 BC.getScalarValueSizeInBits() == 16 &&
38685 V.getScalarValueSizeInBits() == 32) {
38686 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
38687 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
38688 if (BC0.getScalarValueSizeInBits() == 64 &&
38689 BC1.getScalarValueSizeInBits() == 64 &&
38690 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
38691 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
38692 return 32;
38693 }
38694 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
38695 };
38696
38697 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38698 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38699 if (!!DemandedLHS)
38700 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
38701 if (!!DemandedRHS)
38702 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
38703 unsigned Tmp = std::min(Tmp0, Tmp1);
38704 if (Tmp > (SrcBits - VTBits))
38705 return Tmp - (SrcBits - VTBits);
38706 return 1;
38707 }
38708
38709 case X86ISD::VBROADCAST: {
38710 SDValue Src = Op.getOperand(0);
38711 if (!Src.getSimpleValueType().isVector())
38712 return DAG.ComputeNumSignBits(Src, Depth + 1);
38713 break;
38714 }
38715
38716 case X86ISD::VSHLI: {
38717 SDValue Src = Op.getOperand(0);
38718 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38719 if (ShiftVal.uge(VTBits))
38720 return VTBits; // Shifted all bits out --> zero.
38721 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38722 if (ShiftVal.uge(Tmp))
38723 return 1; // Shifted all sign bits out --> unknown.
38724 return Tmp - ShiftVal.getZExtValue();
38725 }
38726
38727 case X86ISD::VSRAI: {
38728 SDValue Src = Op.getOperand(0);
38729 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38730 if (ShiftVal.uge(VTBits - 1))
38731 return VTBits; // Sign splat.
38732 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
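// e.g. shifting a v4i32 value with 5 known sign bits right by 3 yields
// 5 + 3 = 8 sign bits, clamped to the 32-bit element width below.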
38733 ShiftVal += Tmp;
38734 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38735 }
38736
38737 case X86ISD::FSETCC:
38738 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38739 if (VT == MVT::f32 || VT == MVT::f64 ||
38740 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38741 return VTBits;
38742 break;
38743
38744 case X86ISD::PCMPGT:
38745 case X86ISD::PCMPEQ:
38746 case X86ISD::CMPP:
38747 case X86ISD::VPCOM:
38748 case X86ISD::VPCOMU:
38749 // Vector compares return zero/all-bits result values.
38750 return VTBits;
38751
38752 case X86ISD::ANDNP: {
38753 unsigned Tmp0 =
38754 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38755 if (Tmp0 == 1) return 1; // Early out.
38756 unsigned Tmp1 =
38757 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38758 return std::min(Tmp0, Tmp1);
38759 }
38760
38761 case X86ISD::CMOV: {
38762 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38763 if (Tmp0 == 1) return 1; // Early out.
38764 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38765 return std::min(Tmp0, Tmp1);
38766 }
38767 }
38768
38769 // Handle target shuffles.
38770 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38771 if (isTargetShuffle(Opcode)) {
38772 SmallVector<int, 64> Mask;
38773 SmallVector<SDValue, 2> Ops;
38774 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
38775 unsigned NumOps = Ops.size();
38776 unsigned NumElts = VT.getVectorNumElements();
38777 if (Mask.size() == NumElts) {
38778 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38779 for (unsigned i = 0; i != NumElts; ++i) {
38780 if (!DemandedElts[i])
38781 continue;
38782 int M = Mask[i];
38783 if (M == SM_SentinelUndef) {
38784 // For UNDEF elements, we don't know anything about the common state
38785 // of the shuffle result.
38786 return 1;
38787 } else if (M == SM_SentinelZero) {
38788 // Zero = all sign bits.
38789 continue;
38790 }
38791 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38792 "Shuffle index out of range");
38793
38794 unsigned OpIdx = (unsigned)M / NumElts;
38795 unsigned EltIdx = (unsigned)M % NumElts;
38796 if (Ops[OpIdx].getValueType() != VT) {
38797 // TODO - handle target shuffle ops with different value types.
38798 return 1;
38799 }
38800 DemandedOps[OpIdx].setBit(EltIdx);
38801 }
38802 unsigned Tmp0 = VTBits;
38803 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38804 if (!DemandedOps[i])
38805 continue;
38806 unsigned Tmp1 =
38807 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38808 Tmp0 = std::min(Tmp0, Tmp1);
38809 }
38810 return Tmp0;
38811 }
38812 }
38813 }
38814
38815 // Fallback case.
38816 return 1;
38817}
38818
38819 static SDValue unwrapAddress(SDValue N) {
38820 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38821 return N->getOperand(0);
38822 return N;
38823}
38824
38825// Helper to look for a normal load that can be narrowed into a vzload with the
38826// specified VT and memory VT. Returns SDValue() on failure.
38827 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
38828 SelectionDAG &DAG) {
38829 // Can't if the load is volatile or atomic.
38830 if (!LN->isSimple())
38831 return SDValue();
38832
38833 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38834 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38835 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38836 LN->getPointerInfo(), LN->getOriginalAlign(),
38837 LN->getMemOperand()->getFlags());
38838}
38839
38840// Attempt to match a combined shuffle mask against supported unary shuffle
38841// instructions.
38842// TODO: Investigate sharing more of this with shuffle lowering.
38843static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38844 bool AllowFloatDomain, bool AllowIntDomain,
38845 SDValue V1, const SelectionDAG &DAG,
38846 const X86Subtarget &Subtarget, unsigned &Shuffle,
38847 MVT &SrcVT, MVT &DstVT) {
38848 unsigned NumMaskElts = Mask.size();
38849 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38850
38851 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38852 if (Mask[0] == 0 &&
38853 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
38854 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38855 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38856 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38857 Shuffle = X86ISD::VZEXT_MOVL;
38858 if (MaskEltSize == 16)
38859 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38860 else
38861 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38862 return true;
38863 }
38864 }
38865
38866 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
38867 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38868 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38869 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38870 unsigned MaxScale = 64 / MaskEltSize;
38871 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
38872 DAG.ComputeNumSignBits(V1) == MaskEltSize;
38873 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38874 bool MatchAny = true;
38875 bool MatchZero = true;
38876 bool MatchSign = UseSign;
38877 unsigned NumDstElts = NumMaskElts / Scale;
38878 for (unsigned i = 0;
38879 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
38880 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38881 MatchAny = MatchSign = MatchZero = false;
38882 break;
38883 }
38884 unsigned Pos = (i * Scale) + 1;
38885 unsigned Len = Scale - 1;
38886 MatchAny &= isUndefInRange(Mask, Pos, Len);
38887 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
38888 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
38889 }
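// e.g. a v8i16 mask <0,u,u,u,1,u,u,u> (u = undef) reaches Scale == 4 with
// MatchAny set, i.e. an any-extend of the low two i16 elements to i64.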
38890 if (MatchAny || MatchSign || MatchZero) {
38891 assert((MatchSign || MatchZero) &&
38892 "Failed to match sext/zext but matched aext?");
38893 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38894 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
38895 : MVT::getIntegerVT(MaskEltSize);
38896 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38897
38898 Shuffle = unsigned(
38899 MatchAny ? ISD::ANY_EXTEND
38900 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
38901 if (SrcVT.getVectorNumElements() != NumDstElts)
38902 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38903
38904 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38905 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38906 return true;
38907 }
38908 }
38909 }
38910
38911 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
38912 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38913 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38914 isUndefOrEqual(Mask[0], 0) &&
38915 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38916 Shuffle = X86ISD::VZEXT_MOVL;
38917 if (MaskEltSize == 16)
38918 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38919 else
38920 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38921 return true;
38922 }
38923
38924 // Check if we have SSE3 which will let us use MOVDDUP etc. The
38925 // instructions are no slower than UNPCKLPD but have the option to
38926 // fold the input operand into even an unaligned memory load.
38927 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
38928 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
38929 Shuffle = X86ISD::MOVDDUP;
38930 SrcVT = DstVT = MVT::v2f64;
38931 return true;
38932 }
38933 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38934 Shuffle = X86ISD::MOVSLDUP;
38935 SrcVT = DstVT = MVT::v4f32;
38936 return true;
38937 }
38938 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
38939 Shuffle = X86ISD::MOVSHDUP;
38940 SrcVT = DstVT = MVT::v4f32;
38941 return true;
38942 }
38943 }
38944
38945 if (MaskVT.is256BitVector() && AllowFloatDomain) {
38946 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38947 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38948 Shuffle = X86ISD::MOVDDUP;
38949 SrcVT = DstVT = MVT::v4f64;
38950 return true;
38951 }
38952 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38953 V1)) {
38954 Shuffle = X86ISD::MOVSLDUP;
38955 SrcVT = DstVT = MVT::v8f32;
38956 return true;
38957 }
38958 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
38959 V1)) {
38960 Shuffle = X86ISD::MOVSHDUP;
38961 SrcVT = DstVT = MVT::v8f32;
38962 return true;
38963 }
38964 }
38965
38966 if (MaskVT.is512BitVector() && AllowFloatDomain) {
38967 assert(Subtarget.hasAVX512() &&
38968 "AVX512 required for 512-bit vector shuffles");
38969 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38970 V1)) {
38971 Shuffle = X86ISD::MOVDDUP;
38972 SrcVT = DstVT = MVT::v8f64;
38973 return true;
38974 }
38975 if (isTargetShuffleEquivalent(
38976 MaskVT, Mask,
38977 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
38978 Shuffle = X86ISD::MOVSLDUP;
38979 SrcVT = DstVT = MVT::v16f32;
38980 return true;
38981 }
38982 if (isTargetShuffleEquivalent(
38983 MaskVT, Mask,
38984 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
38985 Shuffle = X86ISD::MOVSHDUP;
38986 SrcVT = DstVT = MVT::v16f32;
38987 return true;
38988 }
38989 }
38990
38991 return false;
38992}
38993
38994// Attempt to match a combined shuffle mask against supported unary immediate
38995// permute instructions.
38996// TODO: Investigate sharing more of this with shuffle lowering.
38997 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
38998 const APInt &Zeroable,
38999 bool AllowFloatDomain, bool AllowIntDomain,
39000 const SelectionDAG &DAG,
39001 const X86Subtarget &Subtarget,
39002 unsigned &Shuffle, MVT &ShuffleVT,
39003 unsigned &PermuteImm) {
39004 unsigned NumMaskElts = Mask.size();
39005 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39006 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39007 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39008 bool ContainsZeros = isAnyZero(Mask);
39009
39010 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39011 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39012 // Check for lane crossing permutes.
39013 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39014 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39015 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39016 Shuffle = X86ISD::VPERMI;
39017 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39018 PermuteImm = getV4X86ShuffleImm(Mask);
39019 return true;
39020 }
39021 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39022 SmallVector<int, 4> RepeatedMask;
39023 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39024 Shuffle = X86ISD::VPERMI;
39025 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39026 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39027 return true;
39028 }
39029 }
39030 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39031 // VPERMILPD can permute with a non-repeating shuffle.
39032 Shuffle = X86ISD::VPERMILPI;
39033 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39034 PermuteImm = 0;
39035 for (int i = 0, e = Mask.size(); i != e; ++i) {
39036 int M = Mask[i];
39037 if (M == SM_SentinelUndef)
39038 continue;
39039 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39040 PermuteImm |= (M & 1) << i;
39041 }
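// e.g. the v4f64 mask <1,0,2,3> swaps the two doubles in the low 128-bit lane
// and keeps the high lane unchanged, giving PermuteImm = 0b1001.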
39042 return true;
39043 }
39044 }
39045
39046 // We are checking for shuffle match or shift match. Loop twice so we can
39047 // order which we try and match first depending on target preference.
39048 for (unsigned Order = 0; Order < 2; ++Order) {
39049 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39050 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39051 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39052 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39053 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39054 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39055 SmallVector<int, 4> RepeatedMask;
39056 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39057 // Narrow the repeated mask to create 32-bit element permutes.
39058 SmallVector<int, 4> WordMask = RepeatedMask;
39059 if (MaskScalarSizeInBits == 64)
39060 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39061
39062 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39063 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39064 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39065 PermuteImm = getV4X86ShuffleImm(WordMask);
39066 return true;
39067 }
39068 }
39069
39070 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39071 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39072 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39073 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39074 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39075 SmallVector<int, 4> RepeatedMask;
39076 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39077 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39078 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39079
39080 // PSHUFLW: permute lower 4 elements only.
39081 if (isUndefOrInRange(LoMask, 0, 4) &&
39082 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39083 Shuffle = X86ISD::PSHUFLW;
39084 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39085 PermuteImm = getV4X86ShuffleImm(LoMask);
39086 return true;
39087 }
39088
39089 // PSHUFHW: permute upper 4 elements only.
39090 if (isUndefOrInRange(HiMask, 4, 8) &&
39091 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39092 // Offset the HiMask so that we can create the shuffle immediate.
39093 int OffsetHiMask[4];
39094 for (int i = 0; i != 4; ++i)
39095 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
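// e.g. RepeatedMask <0,1,2,3,7,6,5,4> keeps the low half and reverses the
// high half; OffsetHiMask becomes <3,2,1,0>, i.e. PermuteImm = 0x1B.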
39096
39097 Shuffle = X86ISD::PSHUFHW;
39098 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39099 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39100 return true;
39101 }
39102 }
39103 }
39104 } else {
39105 // Attempt to match against bit rotates.
39106 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39107 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39108 Subtarget.hasAVX512())) {
39109 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39110 Subtarget, Mask);
39111 if (0 < RotateAmt) {
39112 Shuffle = X86ISD::VROTLI;
39113 PermuteImm = (unsigned)RotateAmt;
39114 return true;
39115 }
39116 }
39117 }
39118 // Attempt to match against byte/bit shifts.
39119 if (AllowIntDomain &&
39120 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39121 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39122 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39123 int ShiftAmt =
39124 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39125 Zeroable, Subtarget);
39126 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39127 32 <= ShuffleVT.getScalarSizeInBits())) {
39128 // Byte shifts can be slower so only match them on second attempt.
39129 if (Order == 0 &&
39130 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39131 continue;
39132
39133 PermuteImm = (unsigned)ShiftAmt;
39134 return true;
39135 }
39136
39137 }
39138 }
39139
39140 return false;
39141}
39142
39143// Attempt to match a combined unary shuffle mask against supported binary
39144// shuffle instructions.
39145// TODO: Investigate sharing more of this with shuffle lowering.
39146static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39147 bool AllowFloatDomain, bool AllowIntDomain,
39148 SDValue &V1, SDValue &V2, const SDLoc &DL,
39149 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39150 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39151 bool IsUnary) {
39152 unsigned NumMaskElts = Mask.size();
39153 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39154 unsigned SizeInBits = MaskVT.getSizeInBits();
39155
39156 if (MaskVT.is128BitVector()) {
39157 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39158 AllowFloatDomain) {
39159 V2 = V1;
39160 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39161 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39162 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39163 return true;
39164 }
39165 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39166 AllowFloatDomain) {
39167 V2 = V1;
39168 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39169 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39170 return true;
39171 }
39172 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39173 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39174 std::swap(V1, V2);
39175 Shuffle = X86ISD::MOVSD;
39176 SrcVT = DstVT = MVT::v2f64;
39177 return true;
39178 }
39179 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39180 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39181 Shuffle = X86ISD::MOVSS;
39182 SrcVT = DstVT = MVT::v4f32;
39183 return true;
39184 }
39185 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39186 DAG) &&
39187 Subtarget.hasFP16()) {
39188 Shuffle = X86ISD::MOVSH;
39189 SrcVT = DstVT = MVT::v8f16;
39190 return true;
39191 }
39192 }
39193
39194 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39195 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39196 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39197 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39198 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39199 Subtarget)) {
39200 DstVT = MaskVT;
39201 return true;
39202 }
39203 }
39204 // TODO: Can we handle this inside matchShuffleWithPACK?
39205 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39206 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39207 V1.getScalarValueSizeInBits() == 64 &&
39208 V2.getScalarValueSizeInBits() == 64) {
39209 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
39210 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39211 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39212 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39213 SrcVT = MVT::v4i32;
39214 DstVT = MVT::v8i16;
39215 Shuffle = X86ISD::PACKUS;
39216 return true;
39217 }
39218 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
39219 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39220 SrcVT = MVT::v8i16;
39221 DstVT = MVT::v16i8;
39222 Shuffle = X86ISD::PACKUS;
39223 return true;
39224 }
39225 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
39226 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39227 SrcVT = MVT::v4i32;
39228 DstVT = MVT::v8i16;
39229 Shuffle = X86ISD::PACKSS;
39230 return true;
39231 }
39232 }
39233
39234 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39235 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39236 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39237 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39238 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39239 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39240 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39241 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39242 Subtarget)) {
39243 SrcVT = DstVT = MaskVT;
39244 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39245 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39246 return true;
39247 }
39248 }
39249
39250 // Attempt to match against an OR if we're performing a blend shuffle and the
39251 // non-blended source element is zero in each case.
39252 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
39253 if (SizeInBits == V1.getValueSizeInBits() &&
39254 SizeInBits == V2.getValueSizeInBits() &&
39255 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39256 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39257 bool IsBlend = true;
39258 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39259 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39260 unsigned Scale1 = NumV1Elts / NumMaskElts;
39261 unsigned Scale2 = NumV2Elts / NumMaskElts;
39262 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39263 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39264 for (unsigned i = 0; i != NumMaskElts; ++i) {
39265 int M = Mask[i];
39266 if (M == SM_SentinelUndef)
39267 continue;
39268 if (M == SM_SentinelZero) {
39269 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39270 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39271 continue;
39272 }
39273 if (M == (int)i) {
39274 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39275 continue;
39276 }
39277 if (M == (int)(i + NumMaskElts)) {
39278 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39279 continue;
39280 }
39281 IsBlend = false;
39282 break;
39283 }
39284 if (IsBlend) {
39285 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39286 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39287 Shuffle = ISD::OR;
39288 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39289 return true;
39290 }
39291 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39292 // FIXME: handle mismatched sizes?
39293 // TODO: investigate if `ISD::OR` handling in
39294 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39295 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39296 unsigned NumElts = V.getValueType().getVectorNumElements();
39297 KnownBits Known(NumElts);
39298 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39299 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39300 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39301 if (PeepholeKnown.isZero())
39302 Known.Zero.setBit(EltIdx);
39303 if (PeepholeKnown.isAllOnes())
39304 Known.One.setBit(EltIdx);
39305 }
39306 return Known;
39307 };
39308
39309 KnownBits V1Known = computeKnownBitsElementWise(V1);
39310 KnownBits V2Known = computeKnownBitsElementWise(V2);
39311
39312 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39313 int M = Mask[i];
39314 if (M == SM_SentinelUndef)
39315 continue;
39316 if (M == SM_SentinelZero) {
39317 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39318 continue;
39319 }
39320 if (M == (int)i) {
39321 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39322 continue;
39323 }
39324 if (M == (int)(i + NumMaskElts)) {
39325 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39326 continue;
39327 }
39328 llvm_unreachable("will not get here.");
39329 }
39330 if (IsBlend) {
39331 Shuffle = ISD::OR;
39332 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39333 return true;
39334 }
39335 }
39336 }
39337 }
39338
39339 return false;
39340}
39341
39342 static bool matchBinaryPermuteShuffle(
39343 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39344 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39345 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39346 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39347 unsigned NumMaskElts = Mask.size();
39348 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39349
39350 // Attempt to match against VALIGND/VALIGNQ rotate.
39351 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39352 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39353 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39354 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39355 if (!isAnyZero(Mask)) {
39356 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39357 if (0 < Rotation) {
39358 Shuffle = X86ISD::VALIGN;
39359 if (EltSizeInBits == 64)
39360 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39361 else
39362 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39363 PermuteImm = Rotation;
39364 return true;
39365 }
39366 }
39367 }
39368
39369 // Attempt to match against PALIGNR byte rotate.
39370 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39371 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39372 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39373 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39374 if (0 < ByteRotation) {
39375 Shuffle = X86ISD::PALIGNR;
39376 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39377 PermuteImm = ByteRotation;
39378 return true;
39379 }
39380 }
39381
39382 // Attempt to combine to X86ISD::BLENDI.
39383 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39384 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39385 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39386 uint64_t BlendMask = 0;
39387 bool ForceV1Zero = false, ForceV2Zero = false;
39388 SmallVector<int, 8> TargetMask(Mask);
39389 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39390 ForceV2Zero, BlendMask)) {
39391 if (MaskVT == MVT::v16i16) {
39392 // We can only use v16i16 PBLENDW if the lanes are repeated.
39393 SmallVector<int, 8> RepeatedMask;
39394 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39395 RepeatedMask)) {
39396 assert(RepeatedMask.size() == 8 &&
39397 "Repeated mask size doesn't match!");
39398 PermuteImm = 0;
39399 for (int i = 0; i < 8; ++i)
39400 if (RepeatedMask[i] >= 8)
39401 PermuteImm |= 1 << i;
39402 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39403 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39404 Shuffle = X86ISD::BLENDI;
39405 ShuffleVT = MaskVT;
39406 return true;
39407 }
39408 } else {
39409 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39410 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39411 PermuteImm = (unsigned)BlendMask;
39412 Shuffle = X86ISD::BLENDI;
39413 ShuffleVT = MaskVT;
39414 return true;
39415 }
39416 }
39417 }
39418
39419 // Attempt to combine to INSERTPS, but only if it has elements that need to
39420 // be set to zero.
39421 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39422 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39423 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39424 Shuffle = X86ISD::INSERTPS;
39425 ShuffleVT = MVT::v4f32;
39426 return true;
39427 }
39428
39429 // Attempt to combine to SHUFPD.
39430 if (AllowFloatDomain && EltSizeInBits == 64 &&
39431 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39432 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39433 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39434 bool ForceV1Zero = false, ForceV2Zero = false;
39435 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39436 PermuteImm, Mask, Zeroable)) {
39437 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39438 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39439 Shuffle = X86ISD::SHUFP;
39440 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39441 return true;
39442 }
39443 }
39444
39445 // Attempt to combine to SHUFPS.
39446 if (AllowFloatDomain && EltSizeInBits == 32 &&
39447 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39448 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39449 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39450 SmallVector<int, 4> RepeatedMask;
39451 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39452 // Match each half of the repeated mask to determine if it's just
39453 // referencing one of the vectors, is zeroable, or is entirely undef.
39454 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39455 int M0 = RepeatedMask[Offset];
39456 int M1 = RepeatedMask[Offset + 1];
39457
39458 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39459 return DAG.getUNDEF(MaskVT);
39460 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39461 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39462 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39463 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39464 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39465 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39466 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39467 return V1;
39468 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39469 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39470 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39471 return V2;
39472 }
39473
39474 return SDValue();
39475 };
39476
39477 int ShufMask[4] = {-1, -1, -1, -1};
39478 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39479 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39480
39481 if (Lo && Hi) {
39482 V1 = Lo;
39483 V2 = Hi;
39484 Shuffle = X86ISD::SHUFP;
39485 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39486 PermuteImm = getV4X86ShuffleImm(ShufMask);
39487 return true;
39488 }
39489 }
39490 }
39491
39492 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39493 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39494 MaskVT.is128BitVector() &&
39495 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39496 Shuffle = X86ISD::INSERTPS;
39497 ShuffleVT = MVT::v4f32;
39498 return true;
39499 }
39500
39501 return false;
39502}
39503
39504static SDValue combineX86ShuffleChainWithExtract(
39505 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39506 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39507 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39508 const X86Subtarget &Subtarget);
39509
39510/// Combine an arbitrary chain of shuffles into a single instruction if
39511/// possible.
39512///
39513/// This is the leaf of the recursive combine below. When we have found some
39514/// chain of single-use x86 shuffle instructions and accumulated the combined
39515/// shuffle mask represented by them, this will try to pattern match that mask
39516/// into either a single instruction if there is a special purpose instruction
39517/// for this operation, or into a PSHUFB instruction which is a fully general
39518/// instruction but should only be used to replace chains over a certain depth.
39519static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39520 ArrayRef<int> BaseMask, int Depth,
39521 bool HasVariableMask,
39522 bool AllowVariableCrossLaneMask,
39523 bool AllowVariablePerLaneMask,
39524 SelectionDAG &DAG,
39525 const X86Subtarget &Subtarget) {
39526 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39527 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39528 "Unexpected number of shuffle inputs!");
39529
39530 SDLoc DL(Root);
39531 MVT RootVT = Root.getSimpleValueType();
39532 unsigned RootSizeInBits = RootVT.getSizeInBits();
39533 unsigned NumRootElts = RootVT.getVectorNumElements();
39534
39535 // Canonicalize shuffle input op to the requested type.
39536 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39537 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39538 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39539 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39540 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39541 return DAG.getBitcast(VT, Op);
39542 };
39543
39544 // Find the inputs that enter the chain. Note that multiple uses are OK
39545 // here; we're not going to remove the operands we find.
39546 bool UnaryShuffle = (Inputs.size() == 1);
39547 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39548 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39549 : peekThroughBitcasts(Inputs[1]));
39550
39551 MVT VT1 = V1.getSimpleValueType();
39552 MVT VT2 = V2.getSimpleValueType();
39553 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39554 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39555
39556 SDValue Res;
39557
39558 unsigned NumBaseMaskElts = BaseMask.size();
39559 if (NumBaseMaskElts == 1) {
39560 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39561 return CanonicalizeShuffleInput(RootVT, V1);
39562 }
39563
39564 bool OptForSize = DAG.shouldOptForSize();
39565 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39566 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39567 (RootVT.isFloatingPoint() && Depth >= 1) ||
39568 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39569
39570 // Don't combine if we are an AVX512/EVEX target and the mask element size
39571 // is different from the root element size - this would prevent writemasks
39572 // from being reused.
39573 bool IsMaskedShuffle = false;
39574 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39575 if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
39576 Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39577 IsMaskedShuffle = true;
39578 }
39579 }
39580
39581 // If we are shuffling a splat (and not introducing zeros) then we can just
39582 // use it directly. This works for smaller elements as well as they already
39583 // repeat across each mask element.
39584 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39585 V1.getValueSizeInBits() >= RootSizeInBits &&
39586 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39587 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39588 return CanonicalizeShuffleInput(RootVT, V1);
39589 }
39590
39591 SmallVector<int, 64> Mask(BaseMask);
39592
39593 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39594 // etc. can be simplified.
39595 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39596 SmallVector<int> ScaledMask, IdentityMask;
39597 unsigned NumElts = VT1.getVectorNumElements();
39598 if (Mask.size() <= NumElts &&
39599 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39600 for (unsigned i = 0; i != NumElts; ++i)
39601 IdentityMask.push_back(i);
39602 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39603 V2))
39604 return CanonicalizeShuffleInput(RootVT, V1);
39605 }
39606 }
39607
39608 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39609 if (RootVT.is512BitVector() &&
39610 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39611 // If the upper subvectors are zeroable, then an extract+insert is more
39612 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39613 // to zero the upper subvectors.
39614 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39615 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39616 return SDValue(); // Nothing to do!
39617 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39618 "Unexpected lane shuffle");
39619 Res = CanonicalizeShuffleInput(RootVT, V1);
39620 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39621 bool UseZero = isAnyZero(Mask);
39622 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39623 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39624 }
39625
39626 // Narrow shuffle mask to v4x128.
39627 SmallVector<int, 4> ScaledMask;
39628 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39629 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39630
39631 // Try to lower to vshuf64x2/vshuf32x4.
39632 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39633 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39634 SelectionDAG &DAG) {
39635 int PermMask[4] = {-1, -1, -1, -1};
39636 // Ensure elements came from the same Op.
39637 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39638 for (int i = 0; i < 4; ++i) {
39639 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39640 if (ScaledMask[i] < 0)
39641 continue;
39642
39643 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39644 unsigned OpIndex = i / 2;
39645 if (Ops[OpIndex].isUndef())
39646 Ops[OpIndex] = Op;
39647 else if (Ops[OpIndex] != Op)
39648 return SDValue();
39649
39650 PermMask[i] = ScaledMask[i] % 4;
39651 }
39652
39653 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39654 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39655 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39656 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
39657 };
39658
39659 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39660 // doesn't work because our mask is for 128 bits and we don't have an MVT
39661 // to match that.
39662 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39663 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39664 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39665 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39666 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39667 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39668 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39669 ScaledMask[1] == (ScaledMask[3] % 2));
39670
39671 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39672 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39673 return SDValue(); // Nothing to do!
39674 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39675 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39676 return DAG.getBitcast(RootVT, V);
39677 }
39678 }
39679
39680 // Handle 128-bit lane shuffles of 256-bit vectors.
39681 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39682 // If the upper half is zeroable, then an extract+insert is more optimal
39683 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39684 // zero the upper half.
39685 if (isUndefOrZero(Mask[1])) {
39686 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39687 return SDValue(); // Nothing to do!
39688 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39689 Res = CanonicalizeShuffleInput(RootVT, V1);
39690 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39691 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39692 256);
39693 }
39694
39695 // If we're inserting the low subvector, an insert-subvector 'concat'
39696 // pattern is quicker than VPERM2X128.
39697 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39698 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39699 !Subtarget.hasAVX2()) {
39700 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39701 return SDValue(); // Nothing to do!
39702 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39703 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39704 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39705 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39706 }
39707
39708 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39709 return SDValue(); // Nothing to do!
39710
39711 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39712 // we need to use the zeroing feature.
39713 // Prefer blends for sequential shuffles unless we are optimizing for size.
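// Note on the immediate built below: bits[1:0] select the 128-bit source lane
// for the lower half and bits[5:4] for the upper half, while writing 0x8 into
// a nibble (the Mask[i] < 0 case) zeroes that half of the result.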
39714 if (UnaryShuffle &&
39715 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39716 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39717 unsigned PermMask = 0;
39718 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39719 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39720 return DAG.getNode(
39721 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39722 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39723 }
39724
39725 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39726 return SDValue(); // Nothing to do!
39727
39728 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39729 if (!UnaryShuffle && !IsMaskedShuffle) {
39730 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39731 "Unexpected shuffle sentinel value");
39732 // Prefer blends to X86ISD::VPERM2X128.
39733 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39734 unsigned PermMask = 0;
39735 PermMask |= ((Mask[0] & 3) << 0);
39736 PermMask |= ((Mask[1] & 3) << 4);
39737 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39738 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39739 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39740 CanonicalizeShuffleInput(RootVT, LHS),
39741 CanonicalizeShuffleInput(RootVT, RHS),
39742 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39743 }
39744 }
39745 }
39746
39747 // For masks that have been widened to 128-bit elements or more,
39748 // narrow back down to 64-bit elements.
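// For example, a 2-element mask <1,0> over 256-bit elements rescales by 4
// into the 64-bit element mask <4,5,6,7,0,1,2,3>.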
39749 if (BaseMaskEltSizeInBits > 64) {
39750 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39751 int MaskScale = BaseMaskEltSizeInBits / 64;
39752 SmallVector<int, 64> ScaledMask;
39753 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39754 Mask = std::move(ScaledMask);
39755 }
39756
39757 // For masked shuffles, we're trying to match the root width for better
40758 // writemask folding, so attempt to scale the mask.
39759 // TODO - variable shuffles might need this to be widened again.
39760 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39761 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39762 int MaskScale = NumRootElts / Mask.size();
39763 SmallVector<int, 64> ScaledMask;
39764 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39765 Mask = std::move(ScaledMask);
39766 }
39767
39768 unsigned NumMaskElts = Mask.size();
39769 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39770 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39771
39772 // Determine the effective mask value type.
39773 FloatDomain &= (32 <= MaskEltSizeInBits);
39774 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39775 : MVT::getIntegerVT(MaskEltSizeInBits);
39776 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39777
39778 // Only allow legal mask types.
39779 if (!TLI.isTypeLegal(MaskVT))
39780 return SDValue();
39781
39782 // Attempt to match the mask against known shuffle patterns.
39783 MVT ShuffleSrcVT, ShuffleVT;
39784 unsigned Shuffle, PermuteImm;
39785
39786 // Which shuffle domains are permitted?
39787 // Permit domain crossing at higher combine depths.
39788 // TODO: Should we indicate which domain is preferred if both are allowed?
39789 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39790 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39791 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39792
39793 // Determine zeroable mask elements.
39794 APInt KnownUndef, KnownZero;
39795 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39796 APInt Zeroable = KnownUndef | KnownZero;
39797
39798 if (UnaryShuffle) {
39799 // Attempt to match against broadcast-from-vector.
39800 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39801 if ((Subtarget.hasAVX2() ||
39802 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39803 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39804 if (isUndefOrEqual(Mask, 0)) {
39805 if (V1.getValueType() == MaskVT &&
39806 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39807 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39808 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39809 return SDValue(); // Nothing to do!
39810 Res = V1.getOperand(0);
39811 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39812 return DAG.getBitcast(RootVT, Res);
39813 }
39814 if (Subtarget.hasAVX2()) {
39815 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39816 return SDValue(); // Nothing to do!
39817 Res = CanonicalizeShuffleInput(MaskVT, V1);
39818 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39819 return DAG.getBitcast(RootVT, Res);
39820 }
39821 }
39822 }
39823
39824 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39825 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39826 (!IsMaskedShuffle ||
39827 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39828 if (Depth == 0 && Root.getOpcode() == Shuffle)
39829 return SDValue(); // Nothing to do!
39830 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39831 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39832 return DAG.getBitcast(RootVT, Res);
39833 }
39834
39835 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39836 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39837 PermuteImm) &&
39838 (!IsMaskedShuffle ||
39839 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39840 if (Depth == 0 && Root.getOpcode() == Shuffle)
39841 return SDValue(); // Nothing to do!
39842 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39843 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39844 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39845 return DAG.getBitcast(RootVT, Res);
39846 }
39847 }
39848
39849 // Attempt to combine to INSERTPS, but only if the inserted element has come
39850 // from a scalar.
39851 // TODO: Handle other insertions here as well?
39852 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39853 Subtarget.hasSSE41() &&
39854 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39855 if (MaskEltSizeInBits == 32) {
39856 SDValue SrcV1 = V1, SrcV2 = V2;
39857 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39858 DAG) &&
39859 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39860 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39861 return SDValue(); // Nothing to do!
39862 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39863 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39864 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39865 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39866 return DAG.getBitcast(RootVT, Res);
39867 }
39868 }
39869 if (MaskEltSizeInBits == 64 &&
39870 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39871 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39872 V2.getScalarValueSizeInBits() <= 32) {
39873 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39874 return SDValue(); // Nothing to do!
39875 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39876 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39877 CanonicalizeShuffleInput(MVT::v4f32, V1),
39878 CanonicalizeShuffleInput(MVT::v4f32, V2),
39879 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39880 return DAG.getBitcast(RootVT, Res);
39881 }
39882 }
39883
39884 SDValue NewV1 = V1; // Save operands in case early exit happens.
39885 SDValue NewV2 = V2;
39886 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39887 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39888 ShuffleVT, UnaryShuffle) &&
39889 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39890 if (Depth == 0 && Root.getOpcode() == Shuffle)
39891 return SDValue(); // Nothing to do!
39892 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39893 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39894 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39895 return DAG.getBitcast(RootVT, Res);
39896 }
39897
39898 NewV1 = V1; // Save operands in case early exit happens.
39899 NewV2 = V2;
39900 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39901 AllowIntDomain, NewV1, NewV2, DL, DAG,
39902 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39903 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39904 if (Depth == 0 && Root.getOpcode() == Shuffle)
39905 return SDValue(); // Nothing to do!
39906 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39907 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39908 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39909 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39910 return DAG.getBitcast(RootVT, Res);
39911 }
39912
39913 // Typically from here on, we need an integer version of MaskVT.
39914 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39915 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39916
39917 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39918 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39919 uint64_t BitLen, BitIdx;
39920 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39921 Zeroable)) {
39922 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39923 return SDValue(); // Nothing to do!
39924 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39925 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39926 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39927 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39928 return DAG.getBitcast(RootVT, Res);
39929 }
39930
39931 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39932 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39933 return SDValue(); // Nothing to do!
39934 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39935 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39936 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39937 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39938 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39939 return DAG.getBitcast(RootVT, Res);
39940 }
39941 }
39942
39943 // Match shuffle against TRUNCATE patterns.
39944 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39945 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39946 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39947 Subtarget)) {
39948 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39949 ShuffleSrcVT.getVectorNumElements();
39950 unsigned Opc =
39951 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
39952 if (Depth == 0 && Root.getOpcode() == Opc)
39953 return SDValue(); // Nothing to do!
39954 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39955 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
39956 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
39957 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
39958 return DAG.getBitcast(RootVT, Res);
39959 }
39960
39961 // Do we need a more general binary truncation pattern?
39962 if (RootSizeInBits < 512 &&
39963 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
39964 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
39965 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
39966 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
39967 // Bail if this was already a truncation or PACK node.
39968 // We sometimes fail to match PACK if we demand known undef elements.
39969 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
39970 Root.getOpcode() == X86ISD::PACKSS ||
39971 Root.getOpcode() == X86ISD::PACKUS))
39972 return SDValue(); // Nothing to do!
39973 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39974 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
39975 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39976 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
39977 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39978 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
39979 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
39980 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
39981 return DAG.getBitcast(RootVT, Res);
39982 }
39983 }
39984
39985 // Don't try to re-form single instruction chains under any circumstances now
39986 // that we've done encoding canonicalization for them.
39987 if (Depth < 1)
39988 return SDValue();
39989
39990 // Depth threshold above which we can efficiently use variable mask shuffles.
39991 int VariableCrossLaneShuffleDepth =
39992 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
39993 int VariablePerLaneShuffleDepth =
39994 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
39995 AllowVariableCrossLaneMask &=
39996 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
39997 AllowVariablePerLaneMask &=
39998 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
39999 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40000 // higher depth before combining them.
40001 bool AllowBWIVPERMV3 =
40002 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40003
40004 // If root was a VPERMV3 node, always allow a variable shuffle.
40005 if (Root.getOpcode() == X86ISD::VPERMV3)
40006 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40007
40008 bool MaskContainsZeros = isAnyZero(Mask);
40009
40010 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40011 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40012 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40013 if (Subtarget.hasAVX2() &&
40014 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40015 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40016 Res = CanonicalizeShuffleInput(MaskVT, V1);
40017 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40018 return DAG.getBitcast(RootVT, Res);
40019 }
40020 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40021 if ((Subtarget.hasAVX512() &&
40022 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40023 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40024 (Subtarget.hasBWI() &&
40025 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40026 (Subtarget.hasVBMI() &&
40027 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40028 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40029 V2 = DAG.getUNDEF(MaskVT);
40030 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40031 return DAG.getBitcast(RootVT, Res);
40032 }
40033 }
40034
40035 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40036 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40037 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40038 ((Subtarget.hasAVX512() &&
40039 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40040 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40041 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40042 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40043 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40044 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40045 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40046 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40047 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40048 for (unsigned i = 0; i != NumMaskElts; ++i)
40049 if (Mask[i] == SM_SentinelZero)
40050 Mask[i] = NumMaskElts + i;
40051 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40052 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40053 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40054 return DAG.getBitcast(RootVT, Res);
40055 }
40056
40057 // If that failed and either input is extracted then try to combine as a
40058 // shuffle with the larger type.
40059 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40060 Inputs, Root, BaseMask, Depth, HasVariableMask,
40061 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40062 Subtarget))
40063 return WideShuffle;
40064
40065 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40066 // (non-VLX will pad to 512-bit shuffles).
40067 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40068 ((Subtarget.hasAVX512() &&
40069 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40070 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40071 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40072 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40073 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40074 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40075 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40076 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40077 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40078 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40079 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40080 return DAG.getBitcast(RootVT, Res);
40081 }
40082 return SDValue();
40083 }
40084
40085 // See if we can combine a single input shuffle with zeros to a bit-mask,
40086 // which is much simpler than any shuffle.
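// E.g. a unary mask <0, zero, 2, zero> becomes an 'and' with the constant
// <-1, 0, -1, 0>, built element-by-element in the loop below.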
40087 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40088 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40089 TLI.isTypeLegal(MaskVT)) {
40090 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40091 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40092 APInt UndefElts(NumMaskElts, 0);
40093 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40094 for (unsigned i = 0; i != NumMaskElts; ++i) {
40095 int M = Mask[i];
40096 if (M == SM_SentinelUndef) {
40097 UndefElts.setBit(i);
40098 continue;
40099 }
40100 if (M == SM_SentinelZero)
40101 continue;
40102 EltBits[i] = AllOnes;
40103 }
40104 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40105 Res = CanonicalizeShuffleInput(MaskVT, V1);
40106 unsigned AndOpcode =
40107 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40108 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40109 return DAG.getBitcast(RootVT, Res);
40110 }
40111
40112 // If we have a single input shuffle with different shuffle patterns in the
40113 // 128-bit lanes, use the variable mask to VPERMILPS.
40114 // TODO: Combine other mask types at higher depths.
40115 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40116 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40117 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40118 SmallVector<SDValue, 16> VPermIdx;
40119 for (int M : Mask) {
40120 SDValue Idx =
40121 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40122 VPermIdx.push_back(Idx);
40123 }
40124 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40125 Res = CanonicalizeShuffleInput(MaskVT, V1);
40126 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40127 return DAG.getBitcast(RootVT, Res);
40128 }
40129
40130 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40131 // to VPERMIL2PD/VPERMIL2PS.
40132 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40133 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40134 MaskVT == MVT::v8f32)) {
40135 // VPERMIL2 Operation.
40136 // Bits[3] - Match Bit.
40137 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40138 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
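// In the loop below, elements sourced from V2 get their selector offset by
// NumEltsPerLane (the source-select bit), PD selectors are shifted left by
// one, and a selector of 8 together with M2ZImm == 2 yields a zero element.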
40139 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40140 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40141 SmallVector<int, 8> VPerm2Idx;
40142 unsigned M2ZImm = 0;
40143 for (int M : Mask) {
40144 if (M == SM_SentinelUndef) {
40145 VPerm2Idx.push_back(-1);
40146 continue;
40147 }
40148 if (M == SM_SentinelZero) {
40149 M2ZImm = 2;
40150 VPerm2Idx.push_back(8);
40151 continue;
40152 }
40153 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40154 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40155 VPerm2Idx.push_back(Index);
40156 }
40157 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40158 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40159 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40160 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40161 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40162 return DAG.getBitcast(RootVT, Res);
40163 }
40164
40165 // If we have 3 or more shuffle instructions or a chain involving a variable
40166 // mask, we can replace them with a single PSHUFB instruction profitably.
40167 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40168 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40169 // more aggressive.
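// Each PSHUFB control byte indexes within its own 16-byte lane; a control
// byte with the top bit set (0x80) zeroes that byte, which is how
// SM_SentinelZero is encoded below.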
40170 if (UnaryShuffle && AllowVariablePerLaneMask &&
40171 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40172 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40173 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40174 SmallVector<SDValue, 16> PSHUFBMask;
40175 int NumBytes = RootVT.getSizeInBits() / 8;
40176 int Ratio = NumBytes / NumMaskElts;
40177 for (int i = 0; i < NumBytes; ++i) {
40178 int M = Mask[i / Ratio];
40179 if (M == SM_SentinelUndef) {
40180 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40181 continue;
40182 }
40183 if (M == SM_SentinelZero) {
40184 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40185 continue;
40186 }
40187 M = Ratio * M + i % Ratio;
40188 assert((M / 16) == (i / 16) && "Lane crossing detected");
40189 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40190 }
40191 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40192 Res = CanonicalizeShuffleInput(ByteVT, V1);
40193 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40194 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40195 return DAG.getBitcast(RootVT, Res);
40196 }
40197
40198 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40199 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40200 // slower than PSHUFB on targets that support both.
40201 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40202 Subtarget.hasXOP()) {
40203 // VPPERM Mask Operation
40204 // Bits[4:0] - Byte Index (0 - 31)
40205 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
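// Byte indices 0-15 select from V1 and 16-31 from V2; 0x80 (permute op 4)
// zeroes the destination byte, mirroring the sentinel handling below.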
40206 SmallVector<SDValue, 16> VPPERMMask;
40207 int NumBytes = 16;
40208 int Ratio = NumBytes / NumMaskElts;
40209 for (int i = 0; i < NumBytes; ++i) {
40210 int M = Mask[i / Ratio];
40211 if (M == SM_SentinelUndef) {
40212 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40213 continue;
40214 }
40215 if (M == SM_SentinelZero) {
40216 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40217 continue;
40218 }
40219 M = Ratio * M + i % Ratio;
40220 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40221 }
40222 MVT ByteVT = MVT::v16i8;
40223 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40224 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40225 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40226 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40227 return DAG.getBitcast(RootVT, Res);
40228 }
40229
40230 // If that failed and either input is extracted then try to combine as a
40231 // shuffle with the larger type.
40232 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40233 Inputs, Root, BaseMask, Depth, HasVariableMask,
40234 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40235 return WideShuffle;
40236
40237 // If we have a dual input shuffle then lower to VPERMV3,
40238 // (non-VLX will pad to 512-bit shuffles)
40239 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40240 ((Subtarget.hasAVX512() &&
40241 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40242 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40243 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40244 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40245 MaskVT == MVT::v16i32)) ||
40246 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40247 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40248 MaskVT == MVT::v32i16)) ||
40249 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40250 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40251 MaskVT == MVT::v64i8)))) {
40252 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40253 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40254 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40255 return DAG.getBitcast(RootVT, Res);
40256 }
40257
40258 // Failed to find any combines.
40259 return SDValue();
40260}
40261
40262// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40263// instruction if possible.
40264//
40265// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40266// type size to attempt to combine:
40267// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40268// -->
40269// extract_subvector(shuffle(x,y,m2),0)
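// For example, a shuffle of two 128-bit extracts from 256-bit sources is
// re-expressed with a mask twice as wide (upper elements undef), and indices
// that referenced an upper subvector are offset accordingly (AdjustedMasks).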
40270static SDValue combineX86ShuffleChainWithExtract(
40271 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40272 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40273 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40274 const X86Subtarget &Subtarget) {
40275 unsigned NumMaskElts = BaseMask.size();
40276 unsigned NumInputs = Inputs.size();
40277 if (NumInputs == 0)
40278 return SDValue();
40279
40280 EVT RootVT = Root.getValueType();
40281 unsigned RootSizeInBits = RootVT.getSizeInBits();
40282 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40283 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40284
40285 // Peek through subvectors to find widest legal vector.
40286 // TODO: Handle ISD::TRUNCATE
40287 unsigned WideSizeInBits = RootSizeInBits;
40288 for (SDValue Input : Inputs) {
40289 Input = peekThroughBitcasts(Input);
40290 while (1) {
40291 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40292 Input = peekThroughBitcasts(Input.getOperand(0));
40293 continue;
40294 }
40295 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40296 Input.getOperand(0).isUndef()) {
40297 Input = peekThroughBitcasts(Input.getOperand(1));
40298 continue;
40299 }
40300 break;
40301 }
40302 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40303 WideSizeInBits < Input.getValueSizeInBits())
40304 WideSizeInBits = Input.getValueSizeInBits();
40305 }
40306
40307 // Bail if we fail to find a source larger than the existing root.
40308 unsigned Scale = WideSizeInBits / RootSizeInBits;
40309 if (WideSizeInBits <= RootSizeInBits ||
40310 (WideSizeInBits % RootSizeInBits) != 0)
40311 return SDValue();
40312
40313 // Create new mask for larger type.
40314 SmallVector<int, 64> WideMask(BaseMask);
40315 for (int &M : WideMask) {
40316 if (M < 0)
40317 continue;
40318 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40319 }
40320 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
40321
40322 // Attempt to peek through inputs and adjust mask when we extract from an
40323 // upper subvector.
40324 int AdjustedMasks = 0;
40325 SmallVector<SDValue, 4> WideInputs(Inputs);
40326 for (unsigned I = 0; I != NumInputs; ++I) {
40327 SDValue &Input = WideInputs[I];
40328 Input = peekThroughBitcasts(Input);
40329 while (1) {
40330 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40331 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40332 uint64_t Idx = Input.getConstantOperandVal(1);
40333 if (Idx != 0) {
40334 ++AdjustedMasks;
40335 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40336 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40337
40338 int lo = I * WideMask.size();
40339 int hi = (I + 1) * WideMask.size();
40340 for (int &M : WideMask)
40341 if (lo <= M && M < hi)
40342 M += Idx;
40343 }
40344 Input = peekThroughBitcasts(Input.getOperand(0));
40345 continue;
40346 }
40347 // TODO: Handle insertions into upper subvectors.
40348 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40349 Input.getOperand(0).isUndef() &&
40350 isNullConstant(Input.getOperand(2))) {
40351 Input = peekThroughBitcasts(Input.getOperand(1));
40352 continue;
40353 }
40354 break;
40355 }
40356 }
40357
40358 // Remove unused/repeated shuffle source ops.
40359 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40360 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40361
40362 // Bail if we're always extracting from the lowest subvectors
40363 // (combineX86ShuffleChain should match this for the current width), or if
40364 // the shuffle still references too many inputs.
40365 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40366 return SDValue();
40367
40368 // Minor canonicalization of the accumulated shuffle mask to make it easier
40369 // to match below. All this does is detect masks with sequential pairs of
40370 // elements, and shrink them to the half-width mask. It does this in a loop
40371 // so it will reduce the size of the mask to the minimal width mask which
40372 // performs an equivalent shuffle.
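// E.g. an 8-element mask <0,1,6,7,2,3,4,5> widens to <0,3,1,2> and no further.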
40373 while (WideMask.size() > 1) {
40374 SmallVector<int, 64> WidenedMask;
40375 if (!canWidenShuffleElements(WideMask, WidenedMask))
40376 break;
40377 WideMask = std::move(WidenedMask);
40378 }
40379
40380 // Canonicalization of binary shuffle masks to improve pattern matching by
40381 // commuting the inputs.
40382 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40383 ShuffleVectorSDNode::commuteMask(WideMask);
40384 std::swap(WideInputs[0], WideInputs[1]);
40385 }
40386
40387 // Increase depth for every upper subvector we've peeked through.
40388 Depth += AdjustedMasks;
40389
40390 // Attempt to combine wider chain.
40391 // TODO: Can we use a better Root?
40392 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40393 WideInputs.back().getValueSizeInBits()
40394 ? WideInputs.front()
40395 : WideInputs.back();
40396 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40397 "WideRootSize mismatch");
40398
40399 if (SDValue WideShuffle =
40400 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40401 HasVariableMask, AllowVariableCrossLaneMask,
40402 AllowVariablePerLaneMask, DAG, Subtarget)) {
40403 WideShuffle =
40404 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40405 return DAG.getBitcast(RootVT, WideShuffle);
40406 }
40407
40408 return SDValue();
40409}
40410
40411// Canonicalize the combined shuffle mask chain with horizontal ops.
40412// NOTE: This may update the Ops and Mask.
40413static SDValue canonicalizeShuffleMaskWithHorizOp(
40414 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40415 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40416 const X86Subtarget &Subtarget) {
40417 if (Mask.empty() || Ops.empty())
40418 return SDValue();
40419
40420 SmallVector<SDValue> BC;
40421 for (SDValue Op : Ops)
40422 BC.push_back(peekThroughBitcasts(Op));
40423
40424 // All ops must be the same horizop + type.
40425 SDValue BC0 = BC[0];
40426 EVT VT0 = BC0.getValueType();
40427 unsigned Opcode0 = BC0.getOpcode();
40428 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40429 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40430 }))
40431 return SDValue();
40432
40433 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40434 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40435 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40436 if (!isHoriz && !isPack)
40437 return SDValue();
40438
40439 // Do all ops have a single use?
40440 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40441 return Op.hasOneUse() &&
40442 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40443 });
40444
40445 int NumElts = VT0.getVectorNumElements();
40446 int NumLanes = VT0.getSizeInBits() / 128;
40447 int NumEltsPerLane = NumElts / NumLanes;
40448 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40449 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40450 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40451
40452 if (NumEltsPerLane >= 4 &&
40453 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40454 SmallVector<int> LaneMask, ScaledMask;
40455 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40456 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40457 // See if we can remove the shuffle by re-sorting the HOP chain so that
40458 // the HOP args are pre-shuffled.
40459 // TODO: Generalize to any sized/depth chain.
40460 // TODO: Add support for PACKSS/PACKUS.
40461 if (isHoriz) {
40462 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40463 auto GetHOpSrc = [&](int M) {
40464 if (M == SM_SentinelUndef)
40465 return DAG.getUNDEF(VT0);
40466 if (M == SM_SentinelZero)
40467 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40468 SDValue Src0 = BC[M / 4];
40469 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40470 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40471 return Src1.getOperand(M % 2);
40472 return SDValue();
40473 };
40474 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40475 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40476 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40477 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40478 if (M0 && M1 && M2 && M3) {
40479 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40480 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40481 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40482 }
40483 }
40484 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40485 if (Ops.size() >= 2) {
40486 SDValue LHS, RHS;
40487 auto GetHOpSrc = [&](int M, int &OutM) {
40488 // TODO: Support SM_SentinelZero
40489 if (M < 0)
40490 return M == SM_SentinelUndef;
40491 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40492 if (!LHS || LHS == Src) {
40493 LHS = Src;
40494 OutM = (M % 2);
40495 return true;
40496 }
40497 if (!RHS || RHS == Src) {
40498 RHS = Src;
40499 OutM = (M % 2) + 2;
40500 return true;
40501 }
40502 return false;
40503 };
40504 int PostMask[4] = {-1, -1, -1, -1};
40505 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40506 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40507 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40508 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40509 LHS = DAG.getBitcast(SrcVT, LHS);
40510 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40511 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40512 // Use SHUFPS for the permute so this will work on SSE2 targets,
40513 // shuffle combining and domain handling will simplify this later on.
40514 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40515 Res = DAG.getBitcast(ShuffleVT, Res);
40516 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40517 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40518 }
40519 }
40520 }
40521 }
40522
40523 if (2 < Ops.size())
40524 return SDValue();
40525
40526 SDValue BC1 = BC[BC.size() - 1];
40527 if (Mask.size() == VT0.getVectorNumElements()) {
40528 // Canonicalize binary shuffles of horizontal ops that use the
40529 // same sources to a unary shuffle.
40530 // TODO: Try to perform this fold even if the shuffle remains.
40531 if (Ops.size() == 2) {
40532 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40533 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40534 };
40535 // Commute if all BC0's ops are contained in BC1.
40536 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40537 ContainsOps(BC1, BC0.getOperand(1))) {
40538 ShuffleVectorSDNode::commuteMask(Mask);
40539 std::swap(Ops[0], Ops[1]);
40540 std::swap(BC0, BC1);
40541 }
40542
40543 // If BC1 can be represented by BC0, then convert to unary shuffle.
40544 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40545 ContainsOps(BC0, BC1.getOperand(1))) {
40546 for (int &M : Mask) {
40547 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40548 continue;
40549 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40550 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40551 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40552 M += NumHalfEltsPerLane;
40553 }
40554 }
40555 }
40556
40557 // Canonicalize unary horizontal ops to only refer to lower halves.
40558 for (int i = 0; i != NumElts; ++i) {
40559 int &M = Mask[i];
40560 if (isUndefOrZero(M))
40561 continue;
40562 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40563 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40564 M -= NumHalfEltsPerLane;
40565 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40566 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40567 M -= NumHalfEltsPerLane;
40568 }
40569 }
40570
40571 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40572 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40573 // represents the LHS/RHS inputs for the lower/upper halves.
40574 SmallVector<int, 16> TargetMask128, WideMask128;
40575 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40576 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40577 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40578 bool SingleOp = (Ops.size() == 1);
40579 if (isPack || OneUseOps ||
40580 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40581 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40582 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40583 Lo = Lo.getOperand(WideMask128[0] & 1);
40584 Hi = Hi.getOperand(WideMask128[1] & 1);
40585 if (SingleOp) {
40586 SDValue Undef = DAG.getUNDEF(SrcVT);
40587 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40588 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40589 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40590 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40591 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40592 }
40593 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40594 }
40595 }
40596
40597 // If we are post-shuffling a 256-bit hop and not requiring the upper
40598 // elements, then try to narrow to a 128-bit hop directly.
40599 SmallVector<int, 16> WideMask64;
40600 if (Ops.size() == 1 && NumLanes == 2 &&
40601 scaleShuffleElements(Mask, 4, WideMask64) &&
40602 isUndefInRange(WideMask64, 2, 2)) {
40603 int M0 = WideMask64[0];
40604 int M1 = WideMask64[1];
40605 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
40606 MVT HalfVT = VT0.getHalfNumVectorElementsVT();
40607 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
40608 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
40609 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
40610 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
40611 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
40612 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
40613 }
40614 }
40615
40616 return SDValue();
40617}
40618
40619// Attempt to constant fold all of the constant source ops.
40620// Returns true if the entire shuffle is folded to a constant.
40621// TODO: Extend this to merge multiple constant Ops and update the mask.
40622static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
40623 ArrayRef<int> Mask,
40624 bool HasVariableMask,
40625 SelectionDAG &DAG, const SDLoc &DL,
40626 const X86Subtarget &Subtarget) {
40627 unsigned SizeInBits = VT.getSizeInBits();
40628 unsigned NumMaskElts = Mask.size();
40629 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40630 unsigned NumOps = Ops.size();
40631
40632 // Extract constant bits from each source op.
40633 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40634 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40635 for (unsigned I = 0; I != NumOps; ++I)
40636 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40637 RawBitsOps[I],
40638 /*AllowWholeUndefs*/ true,
40639 /*AllowPartialUndefs*/ true))
40640 return SDValue();
40641
40642 // If we're optimizing for size, only fold if at least one of the constants is
40643 // only used once or the combined shuffle has included a variable mask
40644 // shuffle; this is to avoid constant pool bloat.
40645 bool IsOptimizingSize = DAG.shouldOptForSize();
40646 if (IsOptimizingSize && !HasVariableMask &&
40647 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40648 return SDValue();
40649
40650 // Shuffle the constant bits according to the mask.
40651 APInt UndefElts(NumMaskElts, 0);
40652 APInt ZeroElts(NumMaskElts, 0);
40653 APInt ConstantElts(NumMaskElts, 0);
40654 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40655 APInt::getZero(MaskSizeInBits));
40656 for (unsigned i = 0; i != NumMaskElts; ++i) {
40657 int M = Mask[i];
40658 if (M == SM_SentinelUndef) {
40659 UndefElts.setBit(i);
40660 continue;
40661 } else if (M == SM_SentinelZero) {
40662 ZeroElts.setBit(i);
40663 continue;
40664 }
40665 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40666
40667 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40668 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40669
40670 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40671 if (SrcUndefElts[SrcMaskIdx]) {
40672 UndefElts.setBit(i);
40673 continue;
40674 }
40675
40676 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40677 APInt &Bits = SrcEltBits[SrcMaskIdx];
40678 if (!Bits) {
40679 ZeroElts.setBit(i);
40680 continue;
40681 }
40682
40683 ConstantElts.setBit(i);
40684 ConstantBitData[i] = Bits;
40685 }
40686 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40687
40688 // Attempt to create a zero vector.
40689 if ((UndefElts | ZeroElts).isAllOnes())
40690 return getZeroVector(VT, Subtarget, DAG, DL);
40691
40692 // Create the constant data.
40693 MVT MaskSVT;
40694 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40695 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40696 else
40697 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40698
40699 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40700 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40701 return SDValue();
40702
40703 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40704 return DAG.getBitcast(VT, CstOp);
40705}
40706
40707namespace llvm {
40708 namespace X86 {
40709 enum {
40710 MaxShuffleCombineDepth = 8
40711 };
40712 } // namespace X86
40713} // namespace llvm
40714
40715/// Fully generic combining of x86 shuffle instructions.
40716///
40717/// This should be the last combine run over the x86 shuffle instructions. Once
40718/// they have been fully optimized, this will recursively consider all chains
40719/// of single-use shuffle instructions, build a generic model of the cumulative
40720/// shuffle operation, and check for simpler instructions which implement this
40721/// operation. We use this primarily for two purposes:
40722///
40723/// 1) Collapse generic shuffles to specialized single instructions when
40724/// equivalent. In most cases, this is just an encoding size win, but
40725/// sometimes we will collapse multiple generic shuffles into a single
40726/// special-purpose shuffle.
40727/// 2) Look for sequences of shuffle instructions with 3 or more total
40728/// instructions, and replace them with the slightly more expensive SSSE3
40729/// PSHUFB instruction if available. We do this as the last combining step
40730/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40731/// a suitable short sequence of other instructions. The PSHUFB will either
40732/// use a register or have to read from memory and so is slightly (but only
40733/// slightly) more expensive than the other shuffle instructions.
40734///
40735/// Because this is inherently a quadratic operation (for each shuffle in
40736/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40737/// This should never be an issue in practice as the shuffle lowering doesn't
40738/// produce sequences of more than 8 instructions.
40739///
40740/// FIXME: We will currently miss some cases where the redundant shuffling
40741/// would simplify under the threshold for PSHUFB formation because of
40742/// combine-ordering. To fix this, we should do the redundant instruction
40743/// combining in this recursive walk.
40744static SDValue combineX86ShufflesRecursively(
40745 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40746 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40747 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40748 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40749 const X86Subtarget &Subtarget) {
40750 assert(!RootMask.empty() &&
40751 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40752 "Illegal shuffle root mask");
40753 MVT RootVT = Root.getSimpleValueType();
40754 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40755 unsigned RootSizeInBits = RootVT.getSizeInBits();
40756 SDLoc DL(Root);
40757
40758 // Bound the depth of our recursive combine because this is ultimately
40759 // quadratic in nature.
40760 if (Depth >= MaxDepth)
40761 return SDValue();
40762
40763 // Directly rip through bitcasts to find the underlying operand.
40764 SDValue Op = SrcOps[SrcOpIndex];
40765 Op = peekThroughBitcasts(Op);
40766
40767 EVT VT = Op.getValueType();
40768 if (!VT.isVector() || !VT.isSimple())
40769 return SDValue(); // Bail if we hit a non-simple non-vector.
40770
40771 // FIXME: Just bail on f16 for now.
40772 if (VT.getVectorElementType() == MVT::f16)
40773 return SDValue();
40774
40775 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40776 "Can only combine shuffles up to size of the root op.");
40777
40778 // Create a demanded elts mask from the referenced elements of Op.
40779 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40780 for (int M : RootMask) {
40781 int BaseIdx = RootMask.size() * SrcOpIndex;
40782 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40783 OpDemandedElts.setBit(M - BaseIdx);
40784 }
40785 if (RootSizeInBits != VT.getSizeInBits()) {
40786 // Op is smaller than Root - extract the demanded elts for the subvector.
40787 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40788 unsigned NumOpMaskElts = RootMask.size() / Scale;
40789 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40790 assert(OpDemandedElts
40791 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40792 .isZero() &&
40793 "Out of range elements referenced in root mask");
40794 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40795 }
40796 OpDemandedElts =
40797 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
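// E.g. with an 8-element root mask demanding elements {0,1} and Op being a
// v4i32 covering the same bits, this rescales to demanding op element 0;
// conversely a 4-element root mask demanding {0,2} over a v8i16 op scales up
// to op elements {0,1,4,5}.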
40798
40799 // Extract target shuffle mask and resolve sentinels and inputs.
40800 SmallVector<int, 64> OpMask;
40801 SmallVector<SDValue, 2> OpInputs;
40802 APInt OpUndef, OpZero;
40803 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40804 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40805 OpZero, DAG, Depth, false)) {
40806 // Shuffle inputs must not be larger than the shuffle result.
40807 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40808 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40809 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40810 }))
40811 return SDValue();
40812 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40813 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40814 !isNullConstant(Op.getOperand(1))) {
40815 SDValue SrcVec = Op.getOperand(0);
40816 int ExtractIdx = Op.getConstantOperandVal(1);
40817 unsigned NumElts = VT.getVectorNumElements();
40818 OpInputs.assign({SrcVec});
40819 OpMask.assign(NumElts, SM_SentinelUndef);
40820 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
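// E.g. extracting the upper v4i32 half of a v8i32 source is modelled here as
// the single-input shuffle mask {4,5,6,7} of that v8i32.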
40821 OpZero = OpUndef = APInt::getZero(NumElts);
40822 } else {
40823 return SDValue();
40824 }
40825
40826 // If the shuffle result was smaller than the root, we need to adjust the
40827 // mask indices and pad the mask with undefs.
40828 if (RootSizeInBits > VT.getSizeInBits()) {
40829 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40830 unsigned OpMaskSize = OpMask.size();
40831 if (OpInputs.size() > 1) {
40832 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40833 for (int &M : OpMask) {
40834 if (M < 0)
40835 continue;
40836 int EltIdx = M % OpMaskSize;
40837 int OpIdx = M / OpMaskSize;
40838 M = (PaddedMaskSize * OpIdx) + EltIdx;
40839 }
40840 }
40841 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40842 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40843 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40844 }
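// E.g. for a 128-bit Op with two v4i32 inputs under a 256-bit root, OpMask
// {0,4,1,5} is rebased to the padded 8-element width as {0,8,1,9} and then
// padded with undefs to {0,8,1,9,U,U,U,U}.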
40845
40846 SmallVector<int, 64> Mask;
40847 SmallVector<SDValue, 16> Ops;
40848
40849 // We don't need to merge masks if the root is empty.
40850 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40851 if (EmptyRoot) {
40852 // Only resolve zeros if it will remove an input, otherwise we might end
40853 // up in an infinite loop.
40854 bool ResolveKnownZeros = true;
40855 if (!OpZero.isZero()) {
40856 APInt UsedInputs = APInt::getZero(OpInputs.size());
40857 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40858 int M = OpMask[i];
40859 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40860 continue;
40861 UsedInputs.setBit(M / OpMask.size());
40862 if (UsedInputs.isAllOnes()) {
40863 ResolveKnownZeros = false;
40864 break;
40865 }
40866 }
40867 }
40868 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40869 ResolveKnownZeros);
40870
40871 Mask = OpMask;
40872 Ops.append(OpInputs.begin(), OpInputs.end());
40873 } else {
40874 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40875
40876 // Add the inputs to the Ops list, avoiding duplicates.
40877 Ops.append(SrcOps.begin(), SrcOps.end());
40878
40879 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40880 // Attempt to find an existing match.
40881 SDValue InputBC = peekThroughBitcasts(Input);
40882 for (int i = 0, e = Ops.size(); i < e; ++i)
40883 if (InputBC == peekThroughBitcasts(Ops[i]))
40884 return i;
40885 // Match failed - should we replace an existing Op?
40886 if (InsertionPoint >= 0) {
40887 Ops[InsertionPoint] = Input;
40888 return InsertionPoint;
40889 }
40890 // Add to the end of the Ops list.
40891 Ops.push_back(Input);
40892 return Ops.size() - 1;
40893 };
40894
40895 SmallVector<int, 2> OpInputIdx;
40896 for (SDValue OpInput : OpInputs)
40897 OpInputIdx.push_back(
40898 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40899
40900 assert(((RootMask.size() > OpMask.size() &&
40901 RootMask.size() % OpMask.size() == 0) ||
40902 (OpMask.size() > RootMask.size() &&
40903 OpMask.size() % RootMask.size() == 0) ||
40904 OpMask.size() == RootMask.size()) &&
40905 "The smaller number of elements must divide the larger.");
40906
40907 // This function can be performance-critical, so we rely on the power-of-2
40908 // knowledge that we have about the mask sizes to replace div/rem ops with
40909 // bit-masks and shifts.
40910 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
40911 "Non-power-of-2 shuffle mask sizes");
40912 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
40913 "Non-power-of-2 shuffle mask sizes");
40914 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
40915 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
40916
40917 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40918 unsigned RootRatio =
40919 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40920 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40921 assert((RootRatio == 1 || OpRatio == 1) &&
40922 "Must not have a ratio for both incoming and op masks!");
40923
40924 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40925 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40926 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40927 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
40928 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
40929
40930 Mask.resize(MaskWidth, SM_SentinelUndef);
40931
40932 // Merge this shuffle operation's mask into our accumulated mask. Note that
40933 // this shuffle's mask will be the first applied to the input, followed by
40934 // the root mask to get us all the way to the root value arrangement. The
40935 // reason for this order is that we are recursing up the operation chain.
40936 for (unsigned i = 0; i < MaskWidth; ++i) {
40937 unsigned RootIdx = i >> RootRatioLog2;
40938 if (RootMask[RootIdx] < 0) {
40939 // This is a zero or undef lane, we're done.
40940 Mask[i] = RootMask[RootIdx];
40941 continue;
40942 }
40943
40944 unsigned RootMaskedIdx =
40945 RootRatio == 1
40946 ? RootMask[RootIdx]
40947 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40948
40949 // Just insert the scaled root mask value if it references an input other
40950 // than the SrcOp we're currently inserting.
40951 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
40952 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
40953 Mask[i] = RootMaskedIdx;
40954 continue;
40955 }
40956
40957 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40958 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
40959 if (OpMask[OpIdx] < 0) {
40960 // The incoming lanes are zero or undef, it doesn't matter which ones we
40961 // are using.
40962 Mask[i] = OpMask[OpIdx];
40963 continue;
40964 }
40965
40966 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40967 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
40968 : (OpMask[OpIdx] << OpRatioLog2) +
40969 (RootMaskedIdx & (OpRatio - 1));
40970
40971 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
40972 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
40973 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
40974 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
40975
40976 Mask[i] = OpMaskedIdx;
40977 }
40978 }
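// Worked example of the merge above: a dword-granularity RootMask {2,3,0,1}
// over a single-input word-granularity OpMask {0,0,2,2,4,4,6,6} gives
// MaskWidth = 8, RootRatio = 2, OpRatio = 1 and an accumulated Mask of
// {4,4,6,6,0,0,2,2} - OpMask is applied to the input first, then RootMask.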
40979
40980 // Peek through vector widenings and set out of bounds mask indices to undef.
40981 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
40982 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
40983 SDValue &Op = Ops[I];
40984 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
40985 isNullConstant(Op.getOperand(2))) {
40986 Op = Op.getOperand(1);
40987 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
40988 int Lo = I * Mask.size();
40989 int Hi = (I + 1) * Mask.size();
40990 int NewHi = Lo + (Mask.size() / Scale);
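// Mask indices in [NewHi, Hi) refer to the undef upper part of the widened
// operand, so they can safely become undef.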
40991 for (int &M : Mask) {
40992 if (Lo <= M && NewHi <= M && M < Hi)
40993 M = SM_SentinelUndef;
40994 }
40995 }
40996 }
40997
40998 // Peek through any free extract_subvector nodes back to root size.
40999 for (SDValue &Op : Ops)
41000 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41001 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41002 isNullConstant(Op.getOperand(1)))
41003 Op = Op.getOperand(0);
41004
41005 // Remove unused/repeated shuffle source ops.
41006 resolveTargetShuffleInputsAndMask(Ops, Mask);
41007
41008 // Handle the all undef/zero/ones cases early.
41009 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41010 return DAG.getUNDEF(RootVT);
41011 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41012 return getZeroVector(RootVT, Subtarget, DAG, DL);
41013 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41014 !llvm::is_contained(Mask, SM_SentinelZero))
41015 return getOnesVector(RootVT, DAG, DL);
41016
41017 assert(!Ops.empty() && "Shuffle with no inputs detected");
41018 HasVariableMask |= IsOpVariableMask;
41019
41020 // Update the list of shuffle nodes that have been combined so far.
41021 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41022 CombinedNodes.push_back(Op.getNode());
41023
41024 // See if we can recurse into each shuffle source op (if it's a target
41025 // shuffle). The source op should only be generally combined if it either has
41026 // a single use (i.e. current Op) or all its users have already been combined,
41027 // if not then we can still combine but should prevent generation of variable
41028 // shuffles to avoid constant pool bloat.
41029 // Don't recurse if we already have more source ops than we can combine in
41030 // the remaining recursion depth.
41031 if (Ops.size() < (MaxDepth - Depth)) {
41032 for (int i = 0, e = Ops.size(); i < e; ++i) {
41033 // For empty roots, we need to resolve zeroable elements before combining
41034 // them with other shuffles.
41035 SmallVector<int, 64> ResolvedMask = Mask;
41036 if (EmptyRoot)
41037 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41038 bool AllowCrossLaneVar = false;
41039 bool AllowPerLaneVar = false;
41040 if (Ops[i].getNode()->hasOneUse() ||
41041 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41042 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41043 AllowPerLaneVar = AllowVariablePerLaneMask;
41044 }
41045 if (SDValue Res = combineX86ShufflesRecursively(
41046 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41047 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41048 Subtarget))
41049 return Res;
41050 }
41051 }
41052
41053 // Attempt to constant fold all of the constant source ops.
41054 if (SDValue Cst = combineX86ShufflesConstants(
41055 RootVT, Ops, Mask, HasVariableMask, DAG, DL, Subtarget))
41056 return Cst;
41057
41058 // If constant fold failed and we only have constants - then we have
41059 // multiple uses by a single non-variable shuffle - just bail.
41060 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41061 APInt UndefElts;
41062 SmallVector<APInt> RawBits;
41063 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41064 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41065 RawBits,
41066 /*AllowWholeUndefs*/ true,
41067 /*AllowPartialUndefs*/ true);
41068 })) {
41069 return SDValue();
41070 }
41071
41072 // Canonicalize the combined shuffle mask chain with horizontal ops.
41073 // NOTE: This will update the Ops and Mask.
41074 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41075 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41076 return DAG.getBitcast(RootVT, HOp);
41077
41078 // Try to refine our inputs given our knowledge of target shuffle mask.
41079 for (auto I : enumerate(Ops)) {
41080 int OpIdx = I.index();
41081 SDValue &Op = I.value();
41082
41083 // What range of shuffle mask element values results in picking from Op?
41084 int Lo = OpIdx * Mask.size();
41085 int Hi = Lo + Mask.size();
41086
41087 // Which elements of Op do we demand, given the mask's granularity?
41088 APInt OpDemandedElts(Mask.size(), 0);
41089 for (int MaskElt : Mask) {
41090 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41091 int OpEltIdx = MaskElt - Lo;
41092 OpDemandedElts.setBit(OpEltIdx);
41093 }
41094 }
41095
41096 // Is the shuffle result smaller than the root?
41097 if (Op.getValueSizeInBits() < RootSizeInBits) {
41098 // We padded the mask with undefs. But we now need to undo that.
41099 unsigned NumExpectedVectorElts = Mask.size();
41100 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41101 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41102 assert(!OpDemandedElts.extractBits(
41103 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41104 "Demanding the virtual undef widening padding?");
41105 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41106 }
41107
41108 // The Op itself may be of different VT, so we need to scale the mask.
41109 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41110 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41111
41112 // Can this operand be simplified any further, given its demanded elements?
41113 if (SDValue NewOp =
41114 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41115 Op, OpScaledDemandedElts, DAG))
41116 Op = NewOp;
41117 }
41118 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41119
41120 // Widen any subvector shuffle inputs we've collected.
41121 // TODO: Remove this to avoid generating temporary nodes, we should only
41122 // widen once combineX86ShuffleChain has found a match.
41123 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41124 return Op.getValueSizeInBits() < RootSizeInBits;
41125 })) {
41126 for (SDValue &Op : Ops)
41127 if (Op.getValueSizeInBits() < RootSizeInBits)
41128 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41129 RootSizeInBits);
41130 // Reresolve - we might have repeated subvector sources.
41131 resolveTargetShuffleInputsAndMask(Ops, Mask);
41132 }
41133
41134 // We can only combine unary and binary shuffle mask cases.
41135 if (Ops.size() <= 2) {
41136 // Minor canonicalization of the accumulated shuffle mask to make it easier
41137 // to match below. All this does is detect masks with sequential pairs of
41138 // elements, and shrink them to the half-width mask. It does this in a loop
41139 // so it will reduce the size of the mask to the minimal width mask which
41140 // performs an equivalent shuffle.
41141 while (Mask.size() > 1) {
41142 SmallVector<int, 64> WidenedMask;
41143 if (!canWidenShuffleElements(Mask, WidenedMask))
41144 break;
41145 Mask = std::move(WidenedMask);
41146 }
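// E.g. {0,1,6,7} has sequential pairs and shrinks to {0,3}; {0,2,1,3} does
// not and is left unchanged.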
41147
41148 // Canonicalization of binary shuffle masks to improve pattern matching by
41149 // commuting the inputs.
41150 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41151 ShuffleVectorSDNode::commuteMask(Mask);
41152 std::swap(Ops[0], Ops[1]);
41153 }
41154
41155 // Try to combine into a single shuffle instruction.
41156 if (SDValue Shuffle = combineX86ShuffleChain(
41157 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41158 AllowVariablePerLaneMask, DAG, Subtarget))
41159 return Shuffle;
41160
41161 // If all the operands come from the same larger vector, fallthrough and try
41162 // to use combineX86ShuffleChainWithExtract.
41163 SDValue LHS = peekThroughBitcasts(Ops.front());
41164 SDValue RHS = peekThroughBitcasts(Ops.back());
41165 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41166 (RootSizeInBits / Mask.size()) != 64 ||
41167 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41168 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41169 LHS.getOperand(0) != RHS.getOperand(0))
41170 return SDValue();
41171 }
41172
41173 // If that failed and any input is extracted then try to combine as a
41174 // shuffle with the larger type.
41175 return combineX86ShuffleChainWithExtract(
41176 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41177 AllowVariablePerLaneMask, DAG, Subtarget);
41178}
41179
41180/// Helper entry wrapper to combineX86ShufflesRecursively.
41181 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41182 const X86Subtarget &Subtarget) {
41183 return combineX86ShufflesRecursively(
41184 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41185 /*HasVarMask*/ false,
41186 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41187 Subtarget);
41188}
41189
41190/// Get the PSHUF-style mask from PSHUF node.
41191///
41192 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41193 /// PSHUF-style masks that can be reused with such instructions.
41194 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41195 MVT VT = N.getSimpleValueType();
41196 SmallVector<int, 4> Mask;
41197 SmallVector<SDValue, 2> Ops;
41198 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41199 (void)HaveMask;
41200 assert(HaveMask);
41201
41202 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41203 // matter. Check that the upper masks are repeats and remove them.
41204 if (VT.getSizeInBits() > 128) {
41205 int LaneElts = 128 / VT.getScalarSizeInBits();
41206#ifndef NDEBUG
41207 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41208 for (int j = 0; j < LaneElts; ++j)
41209 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41210 "Mask doesn't repeat in high 128-bit lanes!");
41211#endif
41212 Mask.resize(LaneElts);
41213 }
41214
41215 switch (N.getOpcode()) {
41216 case X86ISD::PSHUFD:
41217 return Mask;
41218 case X86ISD::PSHUFLW:
41219 Mask.resize(4);
41220 return Mask;
41221 case X86ISD::PSHUFHW:
41222 Mask.erase(Mask.begin(), Mask.begin() + 4);
41223 for (int &M : Mask)
41224 M -= 4;
41225 return Mask;
41226 default:
41227 llvm_unreachable("No valid shuffle instruction found!");
41228 }
41229}
41230
41231/// Search for a combinable shuffle across a chain ending in pshufd.
41232///
41233/// We walk up the chain and look for a combinable shuffle, skipping over
41234/// shuffles that we could hoist this shuffle's transformation past without
41235/// altering anything.
41236 static SDValue
41237 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41238 const SDLoc &DL,
41239 SelectionDAG &DAG) {
41240 assert(N.getOpcode() == X86ISD::PSHUFD &&
41241 "Called with something other than an x86 128-bit half shuffle!");
41242
41243 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41244 // of the shuffles in the chain so that we can form a fresh chain to replace
41245 // this one.
41247 SDValue V = N.getOperand(0);
41248 for (; V.hasOneUse(); V = V.getOperand(0)) {
41249 switch (V.getOpcode()) {
41250 default:
41251 return SDValue(); // Nothing combined!
41252
41253 case ISD::BITCAST:
41254 // Skip bitcasts as we always know the type for the target specific
41255 // instructions.
41256 continue;
41257
41258 case X86ISD::PSHUFD:
41259 // Found another dword shuffle.
41260 break;
41261
41262 case X86ISD::PSHUFLW:
41263 // Check that the low words (being shuffled) are the identity in the
41264 // dword shuffle, and the high words are self-contained.
41265 if (Mask[0] != 0 || Mask[1] != 1 ||
41266 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41267 return SDValue();
41268
41269 Chain.push_back(V);
41270 continue;
41271
41272 case X86ISD::PSHUFHW:
41273 // Check that the high words (being shuffled) are the identity in the
41274 // dword shuffle, and the low words are self-contained.
41275 if (Mask[2] != 2 || Mask[3] != 3 ||
41276 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41277 return SDValue();
41278
41279 Chain.push_back(V);
41280 continue;
41281
41282 case X86ISD::UNPCKL:
41283 case X86ISD::UNPCKH:
41284 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41285 // shuffle into a preceding word shuffle.
41286 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41287 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41288 return SDValue();
41289
41290 // Search for a half-shuffle which we can combine with.
41291 unsigned CombineOp =
41292 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41293 if (V.getOperand(0) != V.getOperand(1) ||
41294 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41295 return SDValue();
41296 Chain.push_back(V);
41297 V = V.getOperand(0);
41298 do {
41299 switch (V.getOpcode()) {
41300 default:
41301 return SDValue(); // Nothing to combine.
41302
41303 case X86ISD::PSHUFLW:
41304 case X86ISD::PSHUFHW:
41305 if (V.getOpcode() == CombineOp)
41306 break;
41307
41308 Chain.push_back(V);
41309
41310 [[fallthrough]];
41311 case ISD::BITCAST:
41312 V = V.getOperand(0);
41313 continue;
41314 }
41315 break;
41316 } while (V.hasOneUse());
41317 break;
41318 }
41319 // Break out of the loop if we break out of the switch.
41320 break;
41321 }
41322
41323 if (!V.hasOneUse())
41324 // We fell out of the loop without finding a viable combining instruction.
41325 return SDValue();
41326
41327 // Merge this node's mask and our incoming mask.
41328 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41329 for (int &M : Mask)
41330 M = VMask[M];
41331 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41332 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41333
41334 // Rebuild the chain around this new shuffle.
41335 while (!Chain.empty()) {
41336 SDValue W = Chain.pop_back_val();
41337
41338 if (V.getValueType() != W.getOperand(0).getValueType())
41339 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41340
41341 switch (W.getOpcode()) {
41342 default:
41343 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41344
41345 case X86ISD::UNPCKL:
41346 case X86ISD::UNPCKH:
41347 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41348 break;
41349
41350 case X86ISD::PSHUFD:
41351 case X86ISD::PSHUFLW:
41352 case X86ISD::PSHUFHW:
41353 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41354 break;
41355 }
41356 }
41357 if (V.getValueType() != N.getValueType())
41358 V = DAG.getBitcast(N.getValueType(), V);
41359
41360 // Return the new chain to replace N.
41361 return V;
41362}
41363
41364// Attempt to commute shufps LHS loads:
41365// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41366 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41367 SelectionDAG &DAG) {
41368 // TODO: Add vXf64 support.
41369 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41370 return SDValue();
41371
41372 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41373 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41374 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41375 return SDValue();
41376 SDValue N0 = V.getOperand(0);
41377 SDValue N1 = V.getOperand(1);
41378 unsigned Imm = V.getConstantOperandVal(2);
41379 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41380 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41381 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41382 return SDValue();
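// Commuting the SHUFP operands swaps which input each half of the result
// reads from, so exchange the two 4-bit halves of the immediate.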
41383 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41384 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41385 DAG.getTargetConstant(Imm, DL, MVT::i8));
41386 };
41387
41388 switch (N.getOpcode()) {
41389 case X86ISD::VPERMILPI:
41390 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41391 unsigned Imm = N.getConstantOperandVal(1);
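// The commuted SHUFP swaps the element pairs within each 128-bit lane, so
// flip bit 1 of every 2-bit VPERMILPI selector (XOR with 0xAA) to compensate.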
41392 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41393 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41394 }
41395 break;
41396 case X86ISD::SHUFP: {
41397 SDValue N0 = N.getOperand(0);
41398 SDValue N1 = N.getOperand(1);
41399 unsigned Imm = N.getConstantOperandVal(2);
41400 if (N0 == N1) {
41401 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41402 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41403 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41404 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41405 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41406 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41407 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41408 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41409 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41410 }
41411 break;
41412 }
41413 }
41414
41415 return SDValue();
41416}
41417
41418// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41419// iff we don't demand the same element index for both X and Y.
41420static SDValue
41421 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41422 const APInt &DemandedElts, SelectionDAG &DAG,
41423 const X86Subtarget &Subtarget, const SDLoc &DL) {
41424 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41425 if (!N0.hasOneUse() || !N1.hasOneUse())
41426 return SDValue();
41427
41428 unsigned NumElts = VT.getVectorNumElements();
41429 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41430 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41431
41432 // See if both operands are shuffles, and that we can scale the shuffle masks
41433 // to the same width as the blend mask.
41434 // TODO: Support SM_SentinelZero?
41435 SmallVector<SDValue, 2> Ops0, Ops1;
41436 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41437 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41438 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41439 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41440 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41441 return SDValue();
41442
41443 // Determine the demanded elts from both permutes.
41444 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41445 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41446 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41447 Demanded1,
41448 /*AllowUndefElts=*/true) ||
41449 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41450 DemandedRHS0, /*AllowUndefElts=*/true) ||
41451 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41452 DemandedRHS1, /*AllowUndefElts=*/true))
41453 return SDValue();
41454
41455 // Confirm that we only use a single operand from both permutes and that we
41456 // don't demand the same index from both.
41457 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41458 DemandedLHS0.intersects(DemandedLHS1))
41459 return SDValue();
41460
41461 // Use the permute demanded elts masks as the new blend mask.
41462 // Create the new permute mask as a blend of the 2 original permute masks.
41463 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41464 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41465 for (unsigned I = 0; I != NumElts; ++I) {
41466 if (Demanded0[I]) {
41467 int M = ScaledMask0[I];
41468 if (0 <= M) {
41469 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41470 "BlendMask demands LHS AND RHS");
41471 NewBlendMask[M] = M;
41472 NewPermuteMask[I] = M;
41473 }
41474 } else if (Demanded1[I]) {
41475 int M = ScaledMask1[I];
41476 if (0 <= M) {
41477 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41478 "BlendMask demands LHS AND RHS");
41479 NewBlendMask[M] = M + NumElts;
41480 NewPermuteMask[I] = M;
41481 }
41482 }
41483 }
41484 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41485 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41486
41487 // v16i16 shuffles can explode in complexity very easily, only accept them if
41488 // the blend mask is the same in the 128-bit subvectors (or can widen to
41489 // v8i32) and the permute can be widened as well.
41490 if (VT == MVT::v16i16) {
41491 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41492 !canWidenShuffleElements(NewBlendMask))
41493 return SDValue();
41494 if (!canWidenShuffleElements(NewPermuteMask))
41495 return SDValue();
41496 }
41497
41498 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41499 // widened to a lane permute (vperm2f128).
41500 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41501 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
41502 NewPermuteMask) &&
41503 !canScaleShuffleElements(NewPermuteMask, 2))
41504 return SDValue();
41505
41506 SDValue NewBlend =
41507 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
41508 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
41509 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
41510 NewPermuteMask);
41511}
41512
41513// TODO - move this to TLI like isBinOp?
41514static bool isUnaryOp(unsigned Opcode) {
41515 switch (Opcode) {
41516 case ISD::CTLZ:
41517 case ISD::CTTZ:
41518 case ISD::CTPOP:
41519 return true;
41520 }
41521 return false;
41522}
41523
41524// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41525// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41526 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
41527 const SDLoc &DL) {
41528 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41529 EVT ShuffleVT = N.getValueType();
41530 unsigned Opc = N.getOpcode();
41531
41532 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
41533 bool FoldLoad = false) {
41534 // AllZeros/AllOnes constants are freely shuffled and will peek through
41535 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41536 // merge with target shuffles if it has one use so shuffle combining is
41537 // likely to kick in. Shuffles of splats are expected to be removed.
41538 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41539 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41540 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41541 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41542 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
41543 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
41544 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41545 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41546 (FoldLoad && isShuffleFoldableLoad(Op)) ||
41547 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41548 };
41549 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41550 // Ensure we only shuffle whole vector src elements, unless it's a logical
41551 // binop where we can more aggressively move shuffles from dst to src.
41552 return isLogicOp(BinOp) ||
41553 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41554 };
41555
41556 switch (Opc) {
41557 // Unary and Unary+Permute Shuffles.
41558 case X86ISD::PSHUFB: {
41559 // Don't merge PSHUFB if it contains zero'd elements.
41560 SmallVector<int> Mask;
41561 SmallVector<SDValue> Ops;
41562 if (!getTargetShuffleMask(N, false, Ops, Mask))
41563 break;
41564 [[fallthrough]];
41565 }
41566 case X86ISD::VBROADCAST:
41567 case X86ISD::MOVDDUP:
41568 case X86ISD::PSHUFD:
41569 case X86ISD::PSHUFHW:
41570 case X86ISD::PSHUFLW:
41571 case X86ISD::VPERMI:
41572 case X86ISD::VPERMILPI: {
41573 if (N.getOperand(0).getValueType() == ShuffleVT &&
41574 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41575 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41576 unsigned SrcOpcode = N0.getOpcode();
41577 EVT OpVT = N0.getValueType();
41578 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41579 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41580 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41581 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
41582 Opc != X86ISD::PSHUFB) ||
41583 IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
41584 Opc != X86ISD::PSHUFB)) {
41585 SDValue LHS, RHS;
41586 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41587 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41588 if (N.getNumOperands() == 2) {
41589 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41590 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41591 } else {
41592 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41593 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41594 }
41595 return DAG.getBitcast(ShuffleVT,
41596 DAG.getNode(SrcOpcode, DL, OpVT,
41597 DAG.getBitcast(OpVT, LHS),
41598 DAG.getBitcast(OpVT, RHS)));
41599 }
41600 }
41601 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
41602 OpVT.getScalarSizeInBits() ==
41603 N0.getOperand(0).getScalarValueSizeInBits()) {
41604 SDValue Op00 = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
41605 SDValue Res =
41606 N.getNumOperands() == 2
41607 ? DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1))
41608 : DAG.getNode(Opc, DL, ShuffleVT, Op00);
41609 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
41610 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
41611 }
41612 }
41613 break;
41614 }
41615 // Binary and Binary+Permute Shuffles.
41616 case X86ISD::INSERTPS: {
41617 // Don't merge INSERTPS if it contains zero'd elements.
41618 unsigned InsertPSMask = N.getConstantOperandVal(2);
41619 unsigned ZeroMask = InsertPSMask & 0xF;
41620 if (ZeroMask != 0)
41621 break;
41622 [[fallthrough]];
41623 }
41624 case X86ISD::MOVSD:
41625 case X86ISD::MOVSS:
41626 case X86ISD::BLENDI:
41627 case X86ISD::SHUFP:
41628 case X86ISD::UNPCKH:
41629 case X86ISD::UNPCKL: {
41630 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41631 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41632 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41633 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41634 unsigned SrcOpcode = N0.getOpcode();
41635 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41636 N0.getValueType() == N1.getValueType() &&
41637 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41638 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41639 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41640 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41641 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41642 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41643 // Ensure the total number of shuffles doesn't increase by folding this
41644 // shuffle through to the source ops.
41645 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41646 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41647 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41648 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41649 SDValue LHS, RHS;
41650 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41651 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41652 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41653 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41654 if (N.getNumOperands() == 3) {
41655 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41656 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41657 } else {
41658 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41659 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41660 }
41661 EVT OpVT = N0.getValueType();
41662 return DAG.getBitcast(ShuffleVT,
41663 DAG.getNode(SrcOpcode, DL, OpVT,
41664 DAG.getBitcast(OpVT, LHS),
41665 DAG.getBitcast(OpVT, RHS)));
41666 }
41667 }
41668 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41669 N0.getValueType() == N1.getValueType() &&
41670 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41671 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41672 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41673 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41674 SDValue Res;
41675 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41676 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41677 if (N.getNumOperands() == 3) {
41678 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41679 } else {
41680 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41681 }
41682 EVT OpVT = N0.getValueType();
41683 return DAG.getBitcast(
41684 ShuffleVT,
41685 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
41686 }
41687 // TODO: We can generalize this for other shuffles/conversions.
41688 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
41689 N1.getOpcode() == SrcOpcode &&
41690 N0.getValueType() == N1.getValueType() &&
41691 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
41692 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
41693 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41694 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41695 EVT OpSrcVT = N0.getOperand(0).getValueType();
41696 EVT OpDstVT = N0.getValueType();
41697 SDValue Res =
41698 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
41699 return DAG.getBitcast(ShuffleVT,
41700 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
41701 }
41702 }
41703 break;
41704 }
41705 }
41706 return SDValue();
41707}
41708
41709/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41710 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41711 SelectionDAG &DAG,
41712 const SDLoc &DL) {
41713 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41714
41715 MVT VT = V.getSimpleValueType();
41716 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41717 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41718 unsigned SrcOpc0 = Src0.getOpcode();
41719 unsigned SrcOpc1 = Src1.getOpcode();
41720 EVT SrcVT0 = Src0.getValueType();
41721 EVT SrcVT1 = Src1.getValueType();
41722
41723 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41724 return SDValue();
41725
41726 switch (SrcOpc0) {
41727 case X86ISD::MOVDDUP: {
41728 SDValue LHS = Src0.getOperand(0);
41729 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41730 SDValue Res =
41731 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41732 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41733 return DAG.getBitcast(VT, Res);
41734 }
41735 case X86ISD::VPERMILPI:
41736 // TODO: Handle v4f64 permutes with different low/high lane masks.
41737 if (SrcVT0 == MVT::v4f64) {
41738 uint64_t Mask = Src0.getConstantOperandVal(1);
41739 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41740 break;
41741 }
41742 [[fallthrough]];
41743 case X86ISD::VSHLI:
41744 case X86ISD::VSRLI:
41745 case X86ISD::VSRAI:
41746 case X86ISD::PSHUFD:
41747 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41748 SDValue LHS = Src0.getOperand(0);
41749 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41750 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41751 V.getOperand(2));
41752 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41753 return DAG.getBitcast(VT, Res);
41754 }
41755 break;
41756 }
41757
41758 return SDValue();
41759}
41760
41761static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
41762 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
41763 TargetLowering::DAGCombinerInfo &DCI,
41764 const X86Subtarget &Subtarget);
41765
41766/// Try to combine x86 target specific shuffles.
41767 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
41768 SelectionDAG &DAG,
41769 TargetLowering::DAGCombinerInfo &DCI,
41770 const X86Subtarget &Subtarget) {
41771 using namespace SDPatternMatch;
41772
41773 MVT VT = N.getSimpleValueType();
41774 unsigned NumElts = VT.getVectorNumElements();
41776 unsigned Opcode = N.getOpcode();
41777 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41778
41779 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41780 return R;
41781
41782 // Handle specific target shuffles.
41783 switch (Opcode) {
41784 case X86ISD::MOVDDUP: {
41785 SDValue Src = N.getOperand(0);
41786 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41787 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41788 ISD::isNormalLoad(Src.getNode())) {
41789 LoadSDNode *LN = cast<LoadSDNode>(Src);
41790 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41791 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41792 DCI.CombineTo(N.getNode(), Movddup);
41793 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41794 DCI.recursivelyDeleteUnusedNodes(LN);
41795 return N; // Return N so it doesn't get rechecked!
41796 }
41797 }
41798
41799 return SDValue();
41800 }
41801 case X86ISD::VBROADCAST: {
41802 SDValue Src = N.getOperand(0);
41803 SDValue BC = peekThroughBitcasts(Src);
41804 EVT SrcVT = Src.getValueType();
41805 EVT BCVT = BC.getValueType();
41806
41807 // If broadcasting from another shuffle, attempt to simplify it.
41808 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41809 if (isTargetShuffle(BC.getOpcode()) &&
41810 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41811 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41812 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41813 SM_SentinelUndef);
41814 for (unsigned i = 0; i != Scale; ++i)
41815 DemandedMask[i] = i;
41816 if (SDValue Res = combineX86ShufflesRecursively(
41817 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41818 X86::MaxShuffleCombineDepth,
41819 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41820 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41821 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41822 DAG.getBitcast(SrcVT, Res));
41823 }
41824
41825 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41826 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41827 if (Src.getOpcode() == ISD::BITCAST &&
41828 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41829 TLI.isTypeLegal(BCVT) &&
41830 FixedVectorType::isValidElementType(
41831 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41832 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41833 VT.getVectorNumElements());
41834 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41835 }
41836
41837 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41838 // If we're re-broadcasting a smaller type then broadcast with that type and
41839 // bitcast.
41840 // TODO: Do this for any splat?
41841 if (Src.getOpcode() == ISD::BITCAST &&
41842 (BC.getOpcode() == X86ISD::VBROADCAST ||
41843 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41844 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41845 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41846 MVT NewVT =
41847 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
41848 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41849 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41850 }
41851
41852 // Reduce broadcast source vector to lowest 128-bits.
41853 if (SrcVT.getSizeInBits() > 128)
41854 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41855 extract128BitVector(Src, 0, DAG, DL));
41856
41857 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41858 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41859 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
41860 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41861
41862 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41863 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41864 isNullConstant(Src.getOperand(1)) &&
41865 Src.getValueType() ==
41866 Src.getOperand(0).getValueType().getScalarType() &&
41867 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
41868 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41869
41870 // Share broadcast with the longest vector and extract low subvector (free).
41871 // Ensure the same SDValue from the SDNode use is being used.
41872 for (SDNode *User : Src->users())
41873 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41874 Src == User->getOperand(0) &&
41875 User->getValueSizeInBits(0).getFixedValue() >
41876 VT.getFixedSizeInBits()) {
41877 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41878 VT.getSizeInBits());
41879 }
41880
41881 // vbroadcast(scalarload X) -> vbroadcast_load X
41882 // For float loads, extract other uses of the scalar from the broadcast.
41883 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41884 ISD::isNormalLoad(Src.getNode())) {
41885 LoadSDNode *LN = cast<LoadSDNode>(Src);
41886 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41887 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41888 SDValue BcastLd =
41889 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41890 LN->getMemoryVT(), LN->getMemOperand());
41891 // If the load value is used only by N, replace it via CombineTo N.
41892 bool NoReplaceExtract = Src.hasOneUse();
41893 DCI.CombineTo(N.getNode(), BcastLd);
41894 if (NoReplaceExtract) {
41895 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41896 DCI.recursivelyDeleteUnusedNodes(LN);
41897 } else {
41898 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41899 DAG.getVectorIdxConstant(0, DL));
41900 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41901 }
41902 return N; // Return N so it doesn't get rechecked!
41903 }
41904
41905 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41906 // i16. So shrink it ourselves if we can make a broadcast_load.
41907 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41908 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41909 assert(Subtarget.hasAVX2() && "Expected AVX2");
41910 SDValue TruncIn = Src.getOperand(0);
41911
41912 // If this is a truncate of a non extending load we can just narrow it to
41913 // use a broadcast_load.
41914 if (ISD::isNormalLoad(TruncIn.getNode())) {
41915 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41917 // Unless it's volatile or atomic.
41917 if (LN->isSimple()) {
41918 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41919 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41920 SDValue BcastLd = DAG.getMemIntrinsicNode(
41921 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41922 LN->getPointerInfo(), LN->getOriginalAlign(),
41923 LN->getMemOperand()->getFlags());
41924 DCI.CombineTo(N.getNode(), BcastLd);
41925 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41926 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41927 return N; // Return N so it doesn't get rechecked!
41928 }
41929 }
41930
41931 // If this is a truncate of an i16 extload, we can directly replace it.
41932 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41933 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41934 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41935 if (LN->getMemoryVT().getSizeInBits() == 16) {
41936 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41937 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41938 SDValue BcastLd =
41939 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41940 LN->getMemoryVT(), LN->getMemOperand());
41941 DCI.CombineTo(N.getNode(), BcastLd);
41942 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41943 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41944 return N; // Return N so it doesn't get rechecked!
41945 }
41946 }
41947
41948 // If this is a truncate of a load that has been shifted right, we can
41949 // offset the pointer and use a narrower load.
41950 if (TruncIn.getOpcode() == ISD::SRL &&
41951 TruncIn.getOperand(0).hasOneUse() &&
41952 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
41953 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
41954 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
41955 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
41956 // Make sure the shift amount and the load size are divisible by 16.
41957 // Don't do this if the load is volatile or atomic.
41958 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
41959 LN->isSimple()) {
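// On little-endian x86 a right shift by ShiftAmt bits selects the value
// starting ShiftAmt/8 bytes into the load, so the narrowed broadcast can
// simply load from an offset pointer.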
41960 unsigned Offset = ShiftAmt / 8;
41961 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41962 SDValue Ptr = DAG.getMemBasePlusOffset(
41963 LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
41964 SDValue Ops[] = { LN->getChain(), Ptr };
41965 SDValue BcastLd = DAG.getMemIntrinsicNode(
41966 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41967 LN->getPointerInfo().getWithOffset(Offset),
41968 LN->getOriginalAlign(),
41969 LN->getMemOperand()->getFlags());
41970 DCI.CombineTo(N.getNode(), BcastLd);
41971 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41972 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41973 return N; // Return N so it doesn't get rechecked!
41974 }
41975 }
41976 }
41977
41978 // vbroadcast(vzload X) -> vbroadcast_load X
41979 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
41980 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
41981 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41982 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41983 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41984 SDValue BcastLd =
41985 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41986 LN->getMemoryVT(), LN->getMemOperand());
41987 DCI.CombineTo(N.getNode(), BcastLd);
41988 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41989 DCI.recursivelyDeleteUnusedNodes(LN);
41990 return N; // Return N so it doesn't get rechecked!
41991 }
41992 }
41993
41994 // vbroadcast(vector load X) -> vbroadcast_load
41995 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
41996 SrcVT == MVT::v4i32) &&
41997 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
41998 LoadSDNode *LN = cast<LoadSDNode>(Src);
41999 // Unless the load is volatile or atomic.
42000 if (LN->isSimple()) {
42001 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42002 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42003 SDValue BcastLd = DAG.getMemIntrinsicNode(
42004 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42005 LN->getPointerInfo(), LN->getOriginalAlign(),
42006 LN->getMemOperand()->getFlags());
42007 DCI.CombineTo(N.getNode(), BcastLd);
42008 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42009 DCI.recursivelyDeleteUnusedNodes(LN);
42010 return N; // Return N so it doesn't get rechecked!
42011 }
42012 }
42013
42014 return SDValue();
42015 }
42016 case X86ISD::VZEXT_MOVL: {
42017 SDValue N0 = N.getOperand(0);
42018
42019 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42020 // the load is volatile.
42021 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42022 auto *LN = cast<LoadSDNode>(N0);
42023 if (SDValue VZLoad =
42024 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42025 DCI.CombineTo(N.getNode(), VZLoad);
42026 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42027 DCI.recursivelyDeleteUnusedNodes(LN);
42028 return N;
42029 }
42030 }
42031
42032 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42033 // and can just use a VZEXT_LOAD.
42034 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42035 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42036 auto *LN = cast<MemSDNode>(N0);
42037 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42038 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42039 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42040 SDValue VZLoad =
42041 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42042 LN->getMemoryVT(), LN->getMemOperand());
42043 DCI.CombineTo(N.getNode(), VZLoad);
42044 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42045 DCI.recursivelyDeleteUnusedNodes(LN);
42046 return N;
42047 }
42048 }
42049
42050 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42051 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42052 // if the upper bits of the i64 are zero.
42053 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42054 N0.getOperand(0).hasOneUse() &&
42055 N0.getOperand(0).getValueType() == MVT::i64) {
42056 SDValue In = N0.getOperand(0);
42057 APInt Mask = APInt::getHighBitsSet(64, 32);
42058 if (DAG.MaskedValueIsZero(In, Mask)) {
42059 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42060 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42061 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42062 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42063 return DAG.getBitcast(VT, Movl);
42064 }
42065 }
42066
42067 // Load a scalar integer constant directly to XMM instead of transferring an
42068 // immediate value from GPR.
42069 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42070 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42071 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42072 // Create a vector constant - scalar constant followed by zeros.
42073 EVT ScalarVT = N0.getOperand(0).getValueType();
42074 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42075 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42076 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42077 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42078
42079 // Load the vector constant from constant pool.
42080 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42081 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42082 MachinePointerInfo MPI =
42083 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42084 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42085 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42086 MachineMemOperand::MOLoad);
42087 }
42088 }
42089
42090 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42091 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42092 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42093 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42094 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42095 SDValue V = peekThroughOneUseBitcasts(N0);
42096
42097 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42098 isNullConstant(V.getOperand(2))) {
42099 SDValue In = V.getOperand(1);
42100 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42101 In.getValueSizeInBits() /
42102 VT.getScalarSizeInBits());
42103 In = DAG.getBitcast(SubVT, In);
42104 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42105 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42106 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42107 V.getOperand(2));
42108 }
42109 }
42110
42111 return SDValue();
42112 }
42113 case X86ISD::BLENDI: {
42114 SDValue N0 = N.getOperand(0);
42115 SDValue N1 = N.getOperand(1);
42116 unsigned EltBits = VT.getScalarSizeInBits();
42117
42118 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42119 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42120 // TODO: Handle MVT::v16i16 repeated blend mask.
42121 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42122 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42123 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42124 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
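// E.g. a v4f64 blend with mask 0b0101 whose operands are bitcasts of v8f32
// values becomes a v8f32 blend with mask 0b00110011.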
42125 unsigned NewSize = SrcVT.getVectorNumElements();
42126 APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(NumElts);
42127 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42128 return DAG.getBitcast(
42129 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42130 N1.getOperand(0),
42131 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42132 DL, MVT::i8)));
42133 }
42134 }
42135 // Share PSHUFB masks:
42136 // blend(pshufb(x,m1),pshufb(y,m2))
42137 // --> m3 = blend(m1,m2)
42138 // blend(pshufb(x,m3),pshufb(y,m3))
42139 if (N0.hasOneUse() && N1.hasOneUse()) {
42140 SmallVector<int> Mask, ByteMask;
42141 SmallVector<SDValue> Ops;
42142 SDValue LHS = peekThroughOneUseBitcasts(N0);
42143 SDValue RHS = peekThroughOneUseBitcasts(N1);
42144 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42145 RHS.getOpcode() == X86ISD::PSHUFB &&
42146 LHS.getOperand(1) != RHS.getOperand(1) &&
42147 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42148 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42149 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42150 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42151 "BLENDI decode mismatch");
42152 MVT ShufVT = LHS.getSimpleValueType();
42153 SDValue MaskLHS = LHS.getOperand(1);
42154 SDValue MaskRHS = RHS.getOperand(1);
42155 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42156 if (SDValue NewMask = combineX86ShufflesConstants(
42157 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42158 /*HasVariableMask=*/true, DAG, DL, Subtarget)) {
42159 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42160 LHS.getOperand(0), NewMask);
42161 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42162 RHS.getOperand(0), NewMask);
42163 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42164 DAG.getBitcast(VT, NewLHS),
42165 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42166 }
42167 }
42168 }
42169 }
42170 return SDValue();
42171 }
42172 case X86ISD::SHUFP: {
42173 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42174 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42175 // TODO: Support types other than v4f32.
42176 if (VT == MVT::v4f32) {
42177 bool Updated = false;
42178 SmallVector<int> Mask;
42179 SmallVector<SDValue> Ops;
42180 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42181 for (int i = 0; i != 2; ++i) {
42182 SmallVector<SDValue> SubOps;
42183 SmallVector<int> SubMask, SubScaledMask;
42184 SDValue Sub = peekThroughBitcasts(Ops[i]);
42185 // TODO: Scaling might be easier if we specify the demanded elts.
42186 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42187 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42188 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42189 int Ofs = i * 2;
42190 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42191 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42192 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42193 Updated = true;
42194 }
42195 }
42196 }
42197 if (Updated) {
42198 for (int &M : Mask)
42199 M %= 4;
42200 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42201 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42202 }
42203 }
42204 return SDValue();
42205 }
42206 case X86ISD::VPERMI: {
42207 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42208 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42209 SDValue N0 = N.getOperand(0);
42210 SDValue N1 = N.getOperand(1);
42211 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42212 if (N0.getOpcode() == ISD::BITCAST &&
42213 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42214 SDValue Src = N0.getOperand(0);
42215 EVT SrcVT = Src.getValueType();
42216 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42217 return DAG.getBitcast(VT, Res);
42218 }
42219 return SDValue();
42220 }
42221 case X86ISD::SHUF128: {
42222 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42223 // see if we can peek through and access the subvector directly.
42224 if (VT.is512BitVector()) {
42225 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
42226 // upper subvector is used.
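// e.g. if (Mask & 0x0A) == 0x0A then the two selectors that read from the LHS
// operand both have their msb set, so only the upper 256 bits of LHS are
// referenced and a concat's upper operand can be used directly.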
42227 SDValue LHS = N->getOperand(0);
42228 SDValue RHS = N->getOperand(1);
42229 uint64_t Mask = N->getConstantOperandVal(2);
42230 SmallVector<SDValue> LHSOps, RHSOps;
42231 SDValue NewLHS, NewRHS;
42232 if ((Mask & 0x0A) == 0x0A &&
42233 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42234 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42235 Mask &= ~0x0A;
42236 }
42237 if ((Mask & 0xA0) == 0xA0 &&
42238 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42239 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42240 Mask &= ~0xA0;
42241 }
42242 if (NewLHS || NewRHS)
42243 return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
42244 NewRHS ? NewRHS : RHS,
42245 DAG.getTargetConstant(Mask, DL, MVT::i8));
42246 }
42247 return SDValue();
42248 }
42249 case X86ISD::VPERM2X128: {
42250 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42251 SDValue LHS = N->getOperand(0);
42252 SDValue RHS = N->getOperand(1);
42253 if (LHS.getOpcode() == ISD::BITCAST &&
42254 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42255 EVT SrcVT = LHS.getOperand(0).getValueType();
42256 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42257 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42258 DAG.getBitcast(SrcVT, LHS),
42259 DAG.getBitcast(SrcVT, RHS),
42260 N->getOperand(2)));
42261 }
42262 }
42263
42264 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42265 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42266 return Res;
42267
42268 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42269 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42270 auto FindSubVector128 = [&](unsigned Idx) {
42271 if (Idx > 3)
42272 return SDValue();
42273 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42274 SmallVector<SDValue> SubOps;
42275 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42276 return SubOps[Idx & 1];
42277 unsigned NumElts = Src.getValueType().getVectorNumElements();
42278 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42279 Src.getOperand(1).getValueSizeInBits() == 128 &&
42280 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42281 return Src.getOperand(1);
42282 }
42283 return SDValue();
42284 };
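// Each nibble of the VPERM2X128 immediate picks one 128-bit half of the two
// sources for the corresponding result half: 0/1 select the low/high half of
// the first operand and 2/3 the low/high half of the second, which is why
// FindSubVector128 indexes operand (Idx < 2 ? 0 : 1) with (Idx & 1). A nibble
// with bit 3 set would zero that half and is rejected by the Idx > 3 check.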
42285 unsigned Imm = N.getConstantOperandVal(2);
42286 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42287 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42288 MVT SubVT = VT.getHalfNumVectorElementsVT();
42289 SubLo = DAG.getBitcast(SubVT, SubLo);
42290 SubHi = DAG.getBitcast(SubVT, SubHi);
42291 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42292 }
42293 }
42294 return SDValue();
42295 }
42296 case X86ISD::PSHUFD:
42297 case X86ISD::PSHUFLW:
42298 case X86ISD::PSHUFHW: {
42299 SDValue N0 = N.getOperand(0);
42300 SDValue N1 = N.getOperand(1);
42301 if (N0->hasOneUse()) {
42302 SDValue V = peekThroughOneUseBitcasts(N0);
42303 switch (V.getOpcode()) {
42304 case X86ISD::VSHL:
42305 case X86ISD::VSRL:
42306 case X86ISD::VSRA:
42307 case X86ISD::VSHLI:
42308 case X86ISD::VSRLI:
42309 case X86ISD::VSRAI:
42310 case X86ISD::VROTLI:
42311 case X86ISD::VROTRI: {
42312 MVT InnerVT = V.getSimpleValueType();
42313 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42314 SDValue Res = DAG.getNode(Opcode, DL, VT,
42315 DAG.getBitcast(VT, V.getOperand(0)), N1);
42316 Res = DAG.getBitcast(InnerVT, Res);
42317 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42318 return DAG.getBitcast(VT, Res);
42319 }
42320 break;
42321 }
42322 }
42323 }
42324
42325 Mask = getPSHUFShuffleMask(N);
42326 assert(Mask.size() == 4);
42327 break;
42328 }
42329 case X86ISD::MOVSD:
42330 case X86ISD::MOVSH:
42331 case X86ISD::MOVSS: {
42332 SDValue N0 = N.getOperand(0);
42333 SDValue N1 = N.getOperand(1);
42334
42335 // Canonicalize scalar FPOps:
42336 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42337 // If commutable, allow OP(N1[0], N0[0]).
42338 unsigned Opcode1 = N1.getOpcode();
42339 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42340 Opcode1 == ISD::FDIV) {
42341 SDValue N10 = N1.getOperand(0);
42342 SDValue N11 = N1.getOperand(1);
42343 if (N10 == N0 ||
42344 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42345 if (N10 != N0)
42346 std::swap(N10, N11);
42347 MVT SVT = VT.getVectorElementType();
42348 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42349 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42350 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42351 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42352 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42353 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42354 }
42355 }
42356
42357 return SDValue();
42358 }
42359 case X86ISD::INSERTPS: {
42360 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42361 SDValue Op0 = N.getOperand(0);
42362 SDValue Op1 = N.getOperand(1);
42363 unsigned InsertPSMask = N.getConstantOperandVal(2);
42364 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42365 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42366 unsigned ZeroMask = InsertPSMask & 0xF;
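// The INSERTPS immediate packs three fields: bits [7:6] pick the source
// element of Op1 (COUNT_S), bits [5:4] pick the destination element to write
// (COUNT_D), and bits [3:0] form a zero mask that clears individual result
// elements.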
42367
42368 // If we zero out all elements from Op0 then we don't need to reference it.
42369 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42370 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42371 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42372
42373 // If we zero out the element from Op1 then we don't need to reference it.
42374 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42375 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42376 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42377
42378 // Attempt to merge insertps Op1 with an inner target shuffle node.
42379 SmallVector<int, 8> TargetMask1;
42380 SmallVector<SDValue, 2> Ops1;
42381 APInt KnownUndef1, KnownZero1;
42382 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42383 KnownZero1)) {
42384 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42385 // Zero/UNDEF insertion - zero out element and remove dependency.
42386 InsertPSMask |= (1u << DstIdx);
42387 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42388 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42389 }
42390 // Update insertps mask srcidx and reference the source input directly.
42391 int M = TargetMask1[SrcIdx];
42392 assert(0 <= M && M < 8 && "Shuffle index out of range");
42393 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42394 Op1 = Ops1[M < 4 ? 0 : 1];
42395 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42396 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42397 }
42398
42399 // Attempt to merge insertps Op0 with an inner target shuffle node.
42400 SmallVector<int, 8> TargetMask0;
42401 SmallVector<SDValue, 2> Ops0;
42402 APInt KnownUndef0, KnownZero0;
42403 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42404 KnownZero0)) {
42405 bool Updated = false;
42406 bool UseInput00 = false;
42407 bool UseInput01 = false;
42408 for (int i = 0; i != 4; ++i) {
42409 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42410 // No change if element is already zero or the inserted element.
42411 continue;
42412 }
42413
42414 if (KnownUndef0[i] || KnownZero0[i]) {
42415 // If the target mask is undef/zero then we must zero the element.
42416 InsertPSMask |= (1u << i);
42417 Updated = true;
42418 continue;
42419 }
42420
42421 // The input vector element must be inline.
42422 int M = TargetMask0[i];
42423 if (M != i && M != (i + 4))
42424 return SDValue();
42425
42426 // Determine which inputs of the target shuffle we're using.
42427 UseInput00 |= (0 <= M && M < 4);
42428 UseInput01 |= (4 <= M);
42429 }
42430
42431 // If we're not using both inputs of the target shuffle then use the
42432 // referenced input directly.
42433 if (UseInput00 && !UseInput01) {
42434 Updated = true;
42435 Op0 = Ops0[0];
42436 } else if (!UseInput00 && UseInput01) {
42437 Updated = true;
42438 Op0 = Ops0[1];
42439 }
42440
42441 if (Updated)
42442 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42443 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42444 }
42445
42446 // If we're inserting an element from a vbroadcast load, fold the
42447 // load into the X86insertps instruction. We need to convert the scalar
42448 // load to a vector and clear the source lane of the INSERTPS control.
42449 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42450 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42451 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42452 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42453 MemIntr->getBasePtr(),
42454 MemIntr->getMemOperand());
42455 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42456 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42457 Load),
42458 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42459 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42460 return Insert;
42461 }
42462 }
42463
42464 return SDValue();
42465 }
42466 case X86ISD::VPERMV3: {
42467 // Combine VPERMV3 to widened VPERMV if the two source operands can be
42468 // freely concatenated.
42469 if (VT.is128BitVector() ||
42470 (VT.is256BitVector() && Subtarget.useAVX512Regs())) {
42471 SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
42472 MVT WideVT = VT.getDoubleNumVectorElementsVT();
42473 if (SDValue ConcatSrc =
42474 combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) {
42475 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
42476 DL, WideVT.getSizeInBits());
42477 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
42478 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
42479 DAG.getVectorIdxConstant(0, DL));
42480 }
42481 }
42482 SmallVector<SDValue, 2> Ops;
42483 SmallVector<int, 32> Mask;
42484 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42485 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42486 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
42487 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
42488 MVT MaskVT = N.getOperand(1).getSimpleValueType();
42489 // Canonicalize to VPERMV if both sources are the same.
42490 if (V1 == V2) {
42491 for (int &M : Mask)
42492 M = (M < 0 ? M : M & (Mask.size() - 1));
42493 SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
42494 /*IsMask=*/true);
42495 return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, N.getOperand(0));
42496 }
42497 // If sources are half width, then concat and use VPERMV with adjusted
42498 // mask.
42499 SDValue Ops[2];
42500 MVT HalfVT = VT.getHalfNumVectorElementsVT();
42501 if (sd_match(V1,
42502 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
42503 sd_match(V2,
42504 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
42505 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
42506 if (SDValue ConcatSrc =
42507 combineConcatVectorOps(DL, VT, Ops, DAG, DCI, Subtarget)) {
42508 for (int &M : Mask)
42509 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
42510 SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
42511 /*IsMask=*/true);
42512 return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, ConcatSrc);
42513 }
42514 }
42515 // Commute foldable source to the RHS.
42516 if (isShuffleFoldableLoad(N.getOperand(0)) &&
42517 !isShuffleFoldableLoad(N.getOperand(2))) {
42518 ShuffleVectorSDNode::commuteMask(Mask);
42519 SDValue NewMask =
42520 getConstVector(Mask, MaskVT, DAG, DL, /*IsMask=*/true);
42521 return DAG.getNode(X86ISD::VPERMV3, DL, VT, N.getOperand(2), NewMask,
42522 N.getOperand(0));
42523 }
42524 }
42525 return SDValue();
42526 }
42527 default:
42528 return SDValue();
42529 }
42530
42531 // Nuke no-op shuffles that show up after combining.
42532 if (isNoopShuffleMask(Mask))
42533 return N.getOperand(0);
42534
42535 // Look for simplifications involving one or two shuffle instructions.
42536 SDValue V = N.getOperand(0);
42537 switch (N.getOpcode()) {
42538 default:
42539 break;
42540 case X86ISD::PSHUFLW:
42541 case X86ISD::PSHUFHW:
42542 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42543
42544 // See if this reduces to a PSHUFD which is no more expensive and can
42545 // combine with more operations. Note that it has to at least flip the
42546 // dwords as otherwise it would have been removed as a no-op.
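// e.g. PSHUFLW with mask <2,3,0,1> swaps the two low dwords of the vector,
// which a PSHUFD with dword mask <1,0,2,3> performs directly.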
42547 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
42548 int DMask[] = {0, 1, 2, 3};
42549 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42550 DMask[DOffset + 0] = DOffset + 1;
42551 DMask[DOffset + 1] = DOffset + 0;
42552 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
42553 V = DAG.getBitcast(DVT, V);
42554 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42555 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42556 return DAG.getBitcast(VT, V);
42557 }
42558
42559 // Look for shuffle patterns which can be implemented as a single unpack.
42560 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42561 // only works when we have a PSHUFD followed by two half-shuffles.
42562 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42563 (V.getOpcode() == X86ISD::PSHUFLW ||
42564 V.getOpcode() == X86ISD::PSHUFHW) &&
42565 V.getOpcode() != N.getOpcode() &&
42566 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42567 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42568 if (D.getOpcode() == X86ISD::PSHUFD) {
42569 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42570 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42571 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42572 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42573 int WordMask[8];
42574 for (int i = 0; i < 4; ++i) {
42575 WordMask[i + NOffset] = Mask[i] + NOffset;
42576 WordMask[i + VOffset] = VMask[i] + VOffset;
42577 }
42578 // Map the word mask through the DWord mask.
42579 int MappedMask[8];
42580 for (int i = 0; i < 8; ++i)
42581 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42582 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42583 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42584 // We can replace all three shuffles with an unpack.
42585 V = DAG.getBitcast(VT, D.getOperand(0));
42586 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42587 : X86ISD::UNPCKH,
42588 DL, VT, V, V);
42589 }
42590 }
42591 }
42592
42593 break;
42594
42595 case X86ISD::PSHUFD:
42596 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
42597 return NewN;
42598
42599 break;
42600 }
42601
42602 return SDValue();
42603}
42604
42605/// Checks if the shuffle mask takes subsequent elements
42606/// alternately from two vectors.
42607/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
42608static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42609
42610 int ParitySrc[2] = {-1, -1};
42611 unsigned Size = Mask.size();
42612 for (unsigned i = 0; i != Size; ++i) {
42613 int M = Mask[i];
42614 if (M < 0)
42615 continue;
42616
42617 // Make sure we are using the matching element from the input.
42618 if ((M % Size) != i)
42619 return false;
42620
42621 // Make sure we use the same input for all elements of the same parity.
42622 int Src = M / Size;
42623 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42624 return false;
42625 ParitySrc[i % 2] = Src;
42626 }
42627
42628 // Make sure each input is used.
42629 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42630 return false;
42631
42632 Op0Even = ParitySrc[0] == 0;
42633 return true;
42634}
42635
42636 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
42637 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
42638/// are written to the parameters \p Opnd0 and \p Opnd1.
42639///
42640 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
42641/// so it is easier to generically match. We also insert dummy vector shuffle
42642/// nodes for the operands which explicitly discard the lanes which are unused
42643/// by this operation to try to flow through the rest of the combiner
42644/// the fact that they're unused.
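/// For example, shuffle(fsub(a,b), fadd(a,b), <0,5,2,7>) takes its even lanes
/// from the FSUB and its odd lanes from the FADD, which is exactly ADDSUB(a,b).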
42645static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42646 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42647 bool &IsSubAdd) {
42648
42649 EVT VT = N->getValueType(0);
42650 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42651 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42652 !VT.getSimpleVT().isFloatingPoint())
42653 return false;
42654
42655 // We only handle target-independent shuffles.
42656 // FIXME: It would be easy and harmless to use the target shuffle mask
42657 // extraction tool to support more.
42658 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42659 return false;
42660
42661 SDValue V1 = N->getOperand(0);
42662 SDValue V2 = N->getOperand(1);
42663
42664 // Make sure we have an FADD and an FSUB.
42665 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42666 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42667 V1.getOpcode() == V2.getOpcode())
42668 return false;
42669
42670 // If there are other uses of these operations we can't fold them.
42671 if (!V1->hasOneUse() || !V2->hasOneUse())
42672 return false;
42673
42674 // Ensure that both operations have the same operands. Note that we can
42675 // commute the FADD operands.
42676 SDValue LHS, RHS;
42677 if (V1.getOpcode() == ISD::FSUB) {
42678 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42679 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42680 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42681 return false;
42682 } else {
42683 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42684 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42685 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42686 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42687 return false;
42688 }
42689
42690 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42691 bool Op0Even;
42692 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42693 return false;
42694
42695 // It's a subadd if the vector in the even parity is an FADD.
42696 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42697 : V2->getOpcode() == ISD::FADD;
42698
42699 Opnd0 = LHS;
42700 Opnd1 = RHS;
42701 return true;
42702}
42703
42704 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42705 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
42706 const X86Subtarget &Subtarget,
42707 SelectionDAG &DAG) {
42708 // We only handle target-independent shuffles.
42709 // FIXME: It would be easy and harmless to use the target shuffle mask
42710 // extraction tool to support more.
42711 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42712 return SDValue();
42713
42714 MVT VT = N->getSimpleValueType(0);
42715 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42716 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42717 return SDValue();
42718
42719 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
42720 SDValue Op0 = N->getOperand(0);
42721 SDValue Op1 = N->getOperand(1);
42722 SDValue FMAdd = Op0, FMSub = Op1;
42723 if (FMSub.getOpcode() != X86ISD::FMSUB)
42724 std::swap(FMAdd, FMSub);
42725
42726 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42727 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42728 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42729 FMAdd.getOperand(2) != FMSub.getOperand(2))
42730 return SDValue();
42731
42732 // Check for correct shuffle mask.
42733 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42734 bool Op0Even;
42735 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42736 return SDValue();
42737
42738 // FMAddSub takes the zeroth operand from the FMSub node.
42739 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42740 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42741 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42742 FMAdd.getOperand(2));
42743}
42744
42745/// Try to combine a shuffle into a target-specific add-sub or
42746 /// mul-add-sub node.
42747 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
42748 const X86Subtarget &Subtarget,
42749 SelectionDAG &DAG) {
42750 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
42751 return V;
42752
42753 SDValue Opnd0, Opnd1;
42754 bool IsSubAdd;
42755 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42756 return SDValue();
42757
42758 MVT VT = N->getSimpleValueType(0);
42759
42760 // Try to generate X86ISD::FMADDSUB node here.
42761 SDValue Opnd2;
42762 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42763 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42764 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42765 }
42766
42767 if (IsSubAdd)
42768 return SDValue();
42769
42770 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42771 // the ADDSUB idiom has been successfully recognized. There are no known
42772 // X86 targets with 512-bit ADDSUB instructions!
42773 if (VT.is512BitVector())
42774 return SDValue();
42775
42776 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42777 // the ADDSUB idiom has been successfully recognized. There are no known
42778 // X86 targets with FP16 ADDSUB instructions!
42779 if (VT.getVectorElementType() == MVT::f16)
42780 return SDValue();
42781
42782 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42783}
42784
42785// We are looking for a shuffle where both sources are concatenated with undef
42786// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42787 // if we can express this as a single-source shuffle, that's preferable.
42788 static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
42789 SelectionDAG &DAG,
42790 const X86Subtarget &Subtarget) {
42791 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42792 return SDValue();
42793
42794 EVT VT = N->getValueType(0);
42795
42796 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42797 if (!VT.is128BitVector() && !VT.is256BitVector())
42798 return SDValue();
42799
42800 if (VT.getVectorElementType() != MVT::i32 &&
42801 VT.getVectorElementType() != MVT::i64 &&
42802 VT.getVectorElementType() != MVT::f32 &&
42803 VT.getVectorElementType() != MVT::f64)
42804 return SDValue();
42805
42806 SDValue N0 = N->getOperand(0);
42807 SDValue N1 = N->getOperand(1);
42808
42809 // Check that both sources are concats with undef.
42810 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42811 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42812 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42813 !N1.getOperand(1).isUndef())
42814 return SDValue();
42815
42816 // Construct the new shuffle mask. Elements from the first source retain their
42817 // index, but elements from the second source no longer need to skip an undef.
42818 SmallVector<int, 8> Mask;
42819 int NumElts = VT.getVectorNumElements();
42820
42821 auto *SVOp = cast<ShuffleVectorSDNode>(N);
42822 for (int Elt : SVOp->getMask())
42823 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42824 
42825 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42826 N1.getOperand(0));
42827 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42828}
42829
42830/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42831/// low half of each source vector and does not set any high half elements in
42832 /// the destination vector, narrow the shuffle to half its original size.
42833 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
42834 EVT VT = Shuf->getValueType(0);
42835 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
42836 return SDValue();
42837 if (!VT.is256BitVector() && !VT.is512BitVector())
42838 return SDValue();
42839
42840 // See if we can ignore all of the high elements of the shuffle.
42841 ArrayRef<int> Mask = Shuf->getMask();
42842 if (!isUndefUpperHalf(Mask))
42843 return SDValue();
42844
42845 // Check if the shuffle mask accesses only the low half of each input vector
42846 // (half-index output is 0 or 2).
42847 int HalfIdx1, HalfIdx2;
42848 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42849 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42850 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42851 return SDValue();
42852
42853 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42854 // The trick is knowing that all of the insert/extract are actually free
42855 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42856 // of narrow inputs into a narrow output, and that is always cheaper than
42857 // the wide shuffle that we started with.
42858 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42859 Shuf->getOperand(1), HalfMask, HalfIdx1,
42860 HalfIdx2, false, DAG, /*UseConcat*/ true);
42861}
42862 
42863 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42864 TargetLowering::DAGCombinerInfo &DCI,
42865 const X86Subtarget &Subtarget) {
42866 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42867 if (SDValue V = narrowShuffle(Shuf, DAG))
42868 return V;
42869
42870 // If we have legalized the vector types, look for blends of FADD and FSUB
42871 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42872 SDLoc dl(N);
42873 EVT VT = N->getValueType(0);
42874 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42875 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
42876 if (SDValue AddSub =
42877 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
42878 return AddSub;
42879
42880 // Attempt to combine into a vector load/broadcast.
42881 if (SDValue LD = combineToConsecutiveLoads(
42882 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42883 return LD;
42884
42885 // For AVX2, we sometimes want to combine
42886 // (vector_shuffle <mask> (concat_vectors t1, undef)
42887 // (concat_vectors t2, undef))
42888 // Into:
42889 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42890 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42891 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
42892 return ShufConcat;
42893
42894 if (isTargetShuffle(N->getOpcode())) {
42895 SDValue Op(N, 0);
42896 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
42897 return Shuffle;
42898
42899 // Try recursively combining arbitrary sequences of x86 shuffle
42900 // instructions into higher-order shuffles. We do this after combining
42901 // specific PSHUF instruction sequences into their minimal form so that we
42902 // can evaluate how many specialized shuffle instructions are involved in
42903 // a particular chain.
42904 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42905 return Res;
42906
42907 // Simplify source operands based on shuffle mask.
42908 // TODO - merge this into combineX86ShufflesRecursively.
42909 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42910 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42911 return SDValue(N, 0);
42912
42913 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42914 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42915 // Perform this after other shuffle combines to allow inner shuffles to be
42916 // combined away first.
42917 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
42918 return BinOp;
42919 }
42920
42921 return SDValue();
42922}
42923
42924// Simplify variable target shuffle masks based on the demanded elements.
42925 // TODO: Handle DemandedBits in mask indices as well?
42926 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
42927 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42928 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42929 // If we're demanding all elements don't bother trying to simplify the mask.
42930 unsigned NumElts = DemandedElts.getBitWidth();
42931 if (DemandedElts.isAllOnes())
42932 return false;
42933
42934 SDValue Mask = Op.getOperand(MaskIndex);
42935 if (!Mask.hasOneUse())
42936 return false;
42937
42938 // Attempt to generically simplify the variable shuffle mask.
42939 APInt MaskUndef, MaskZero;
42940 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42941 Depth + 1))
42942 return true;
42943
42944 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42945 // TODO: Support other types from getTargetShuffleMaskIndices?
42946 SDValue BC = peekThroughOneUseBitcasts(Mask);
42947 EVT BCVT = BC.getValueType();
42948 auto *Load = dyn_cast<LoadSDNode>(BC);
42949 if (!Load || !Load->getBasePtr().hasOneUse())
42950 return false;
42951
42952 const Constant *C = getTargetConstantFromNode(Load);
42953 if (!C)
42954 return false;
42955
42956 Type *CTy = C->getType();
42957 if (!CTy->isVectorTy() ||
42958 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42959 return false;
42960
42961 // Handle scaling for i64 elements on 32-bit targets.
42962 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42963 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42964 return false;
42965 unsigned Scale = NumCstElts / NumElts;
42966
42967 // Simplify mask if we have an undemanded element that is not undef.
42968 bool Simplified = false;
42969 SmallVector<Constant *, 32> ConstVecOps;
42970 for (unsigned i = 0; i != NumCstElts; ++i) {
42971 Constant *Elt = C->getAggregateElement(i);
42972 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42973 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42974 Simplified = true;
42975 continue;
42976 }
42977 ConstVecOps.push_back(Elt);
42978 }
42979 if (!Simplified)
42980 return false;
42981
42982 // Generate new constant pool entry + legalize immediately for the load.
42983 SDLoc DL(Op);
42984 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42985 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42986 SDValue NewMask = TLO.DAG.getLoad(
42987 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42988 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
42989 Load->getAlign());
42990 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42991}
42992 
42993 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
42994 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42995 TargetLoweringOpt &TLO, unsigned Depth) const {
42996 int NumElts = DemandedElts.getBitWidth();
42997 unsigned Opc = Op.getOpcode();
42998 EVT VT = Op.getValueType();
42999
43000 // Handle special case opcodes.
43001 switch (Opc) {
43002 case X86ISD::PMULDQ:
43003 case X86ISD::PMULUDQ: {
43004 APInt LHSUndef, LHSZero;
43005 APInt RHSUndef, RHSZero;
43006 SDValue LHS = Op.getOperand(0);
43007 SDValue RHS = Op.getOperand(1);
43008 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43009 Depth + 1))
43010 return true;
43011 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43012 Depth + 1))
43013 return true;
43014 // Multiply by zero.
43015 KnownZero = LHSZero | RHSZero;
43016 break;
43017 }
43018 case X86ISD::VPMADDUBSW:
43019 case X86ISD::VPMADDWD: {
43020 APInt LHSUndef, LHSZero;
43021 APInt RHSUndef, RHSZero;
43022 SDValue LHS = Op.getOperand(0);
43023 SDValue RHS = Op.getOperand(1);
43024 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43025
43026 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43027 Depth + 1))
43028 return true;
43029 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43030 Depth + 1))
43031 return true;
43032
43033 // TODO: Multiply by zero.
43034
43035 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43036 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43037 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43038 Depth + 1))
43039 return true;
43040 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43041 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43042 Depth + 1))
43043 return true;
43044 break;
43045 }
43046 case X86ISD::PSADBW: {
43047 SDValue LHS = Op.getOperand(0);
43048 SDValue RHS = Op.getOperand(1);
43049 assert(VT.getScalarType() == MVT::i64 &&
43050 LHS.getValueType() == RHS.getValueType() &&
43051 LHS.getValueType().getScalarType() == MVT::i8 &&
43052 "Unexpected PSADBW types");
43053
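// Each i64 result element of PSADBW is the sum of absolute differences of the
// corresponding group of eight i8 source elements, so every demanded i64
// element maps to eight demanded source bytes; ScaleBitMask widens the
// demanded-elements mask accordingly.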
43054 // Aggressively peek through ops to get at the demanded elts.
43055 if (!DemandedElts.isAllOnes()) {
43056 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43057 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43058 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43059 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43060 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43061 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43062 if (NewLHS || NewRHS) {
43063 NewLHS = NewLHS ? NewLHS : LHS;
43064 NewRHS = NewRHS ? NewRHS : RHS;
43065 return TLO.CombineTo(
43066 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43067 }
43068 }
43069 break;
43070 }
43071 case X86ISD::VSHL:
43072 case X86ISD::VSRL:
43073 case X86ISD::VSRA: {
43074 // We only need the bottom 64-bits of the (128-bit) shift amount.
43075 SDValue Amt = Op.getOperand(1);
43076 MVT AmtVT = Amt.getSimpleValueType();
43077 assert(AmtVT.is128BitVector() && "Unexpected value type");
43078
43079 // If the shift amount is only ever reused as an sse shift amount then we
43080 // know that only the bottom 64-bits are ever used.
43081 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43082 unsigned UseOpc = Use->getOpcode();
43083 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43084 UseOpc == X86ISD::VSRA) &&
43085 Use->getOperand(0) != Amt;
43086 });
43087
43088 APInt AmtUndef, AmtZero;
43089 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43090 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43091 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43092 Depth + 1, AssumeSingleUse))
43093 return true;
43094 [[fallthrough]];
43095 }
43096 case X86ISD::VSHLI:
43097 case X86ISD::VSRLI:
43098 case X86ISD::VSRAI: {
43099 SDValue Src = Op.getOperand(0);
43100 APInt SrcUndef;
43101 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43102 Depth + 1))
43103 return true;
43104
43105 // Fold shift(0,x) -> 0
43106 if (DemandedElts.isSubsetOf(KnownZero))
43107 return TLO.CombineTo(
43108 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43109
43110 // Aggressively peek through ops to get at the demanded elts.
43111 if (!DemandedElts.isAllOnes())
43112 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43113 Src, DemandedElts, TLO.DAG, Depth + 1))
43114 return TLO.CombineTo(
43115 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43116 break;
43117 }
43118 case X86ISD::VPSHA:
43119 case X86ISD::VPSHL:
43120 case X86ISD::VSHLV:
43121 case X86ISD::VSRLV:
43122 case X86ISD::VSRAV: {
43123 APInt LHSUndef, LHSZero;
43124 APInt RHSUndef, RHSZero;
43125 SDValue LHS = Op.getOperand(0);
43126 SDValue RHS = Op.getOperand(1);
43127 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43128 Depth + 1))
43129 return true;
43130
43131 // Fold shift(0,x) -> 0
43132 if (DemandedElts.isSubsetOf(LHSZero))
43133 return TLO.CombineTo(
43134 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43135
43136 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43137 Depth + 1))
43138 return true;
43139
43140 KnownZero = LHSZero;
43141 break;
43142 }
43143 case X86ISD::PCMPEQ:
43144 case X86ISD::PCMPGT: {
43145 APInt LHSUndef, LHSZero;
43146 APInt RHSUndef, RHSZero;
43147 SDValue LHS = Op.getOperand(0);
43148 SDValue RHS = Op.getOperand(1);
43149 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43150 Depth + 1))
43151 return true;
43152 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43153 Depth + 1))
43154 return true;
43155 break;
43156 }
43157 case X86ISD::KSHIFTL: {
43158 SDValue Src = Op.getOperand(0);
43159 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43160 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43161 unsigned ShiftAmt = Amt->getZExtValue();
43162
43163 if (ShiftAmt == 0)
43164 return TLO.CombineTo(Op, Src);
43165
43166 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43167 // single shift. We can do this if the bottom bits (which are shifted
43168 // out) are never demanded.
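// e.g. with an 8-element mask, kshiftl(kshiftr(x, 3), 1) can become
// kshiftr(x, 2) provided the lowest result element is not demanded, since
// that is the only position where the two forms differ.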
43169 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43170 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43171 unsigned C1 = Src.getConstantOperandVal(1);
43172 unsigned NewOpc = X86ISD::KSHIFTL;
43173 int Diff = ShiftAmt - C1;
43174 if (Diff < 0) {
43175 Diff = -Diff;
43176 NewOpc = X86ISD::KSHIFTR;
43177 }
43178
43179 SDLoc dl(Op);
43180 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43181 return TLO.CombineTo(
43182 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43183 }
43184 }
43185
43186 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43187 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43188 Depth + 1))
43189 return true;
43190
43191 KnownUndef <<= ShiftAmt;
43192 KnownZero <<= ShiftAmt;
43193 KnownZero.setLowBits(ShiftAmt);
43194 break;
43195 }
43196 case X86ISD::KSHIFTR: {
43197 SDValue Src = Op.getOperand(0);
43198 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43199 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43200 unsigned ShiftAmt = Amt->getZExtValue();
43201
43202 if (ShiftAmt == 0)
43203 return TLO.CombineTo(Op, Src);
43204
43205 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43206 // single shift. We can do this if the top bits (which are shifted
43207 // out) are never demanded.
43208 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43209 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43210 unsigned C1 = Src.getConstantOperandVal(1);
43211 unsigned NewOpc = X86ISD::KSHIFTR;
43212 int Diff = ShiftAmt - C1;
43213 if (Diff < 0) {
43214 Diff = -Diff;
43215 NewOpc = X86ISD::KSHIFTL;
43216 }
43217
43218 SDLoc dl(Op);
43219 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43220 return TLO.CombineTo(
43221 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43222 }
43223 }
43224
43225 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43226 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43227 Depth + 1))
43228 return true;
43229
43230 KnownUndef.lshrInPlace(ShiftAmt);
43231 KnownZero.lshrInPlace(ShiftAmt);
43232 KnownZero.setHighBits(ShiftAmt);
43233 break;
43234 }
43235 case X86ISD::ANDNP: {
43236 // ANDNP = (~LHS & RHS);
43237 SDValue LHS = Op.getOperand(0);
43238 SDValue RHS = Op.getOperand(1);
43239
43240 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43241 APInt UndefElts;
43242 SmallVector<APInt> EltBits;
43243 int NumElts = VT.getVectorNumElements();
43244 int EltSizeInBits = VT.getScalarSizeInBits();
43245 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43246 APInt OpElts = DemandedElts;
43247 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43248 EltBits)) {
43249 OpBits.clearAllBits();
43250 OpElts.clearAllBits();
43251 for (int I = 0; I != NumElts; ++I) {
43252 if (!DemandedElts[I])
43253 continue;
43254 if (UndefElts[I]) {
43255 // We can't assume an undef src element gives an undef dst - the
43256 // other src might be zero.
43257 OpBits.setAllBits();
43258 OpElts.setBit(I);
43259 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43260 (!Invert && !EltBits[I].isZero())) {
43261 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43262 OpElts.setBit(I);
43263 }
43264 }
43265 }
43266 return std::make_pair(OpBits, OpElts);
43267 };
43268 APInt BitsLHS, EltsLHS;
43269 APInt BitsRHS, EltsRHS;
43270 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43271 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43272
43273 APInt LHSUndef, LHSZero;
43274 APInt RHSUndef, RHSZero;
43275 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43276 Depth + 1))
43277 return true;
43278 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43279 Depth + 1))
43280 return true;
43281
43282 if (!DemandedElts.isAllOnes()) {
43283 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43284 TLO.DAG, Depth + 1);
43285 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43286 TLO.DAG, Depth + 1);
43287 if (NewLHS || NewRHS) {
43288 NewLHS = NewLHS ? NewLHS : LHS;
43289 NewRHS = NewRHS ? NewRHS : RHS;
43290 return TLO.CombineTo(
43291 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43292 }
43293 }
43294 break;
43295 }
43296 case X86ISD::CVTSI2P:
43297 case X86ISD::CVTUI2P:
43298 case X86ISD::CVTPH2PS:
43299 case X86ISD::CVTPS2PH: {
43300 SDValue Src = Op.getOperand(0);
43301 EVT SrcVT = Src.getValueType();
43302 APInt SrcUndef, SrcZero;
43303 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43304 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43305 Depth + 1))
43306 return true;
43307 break;
43308 }
43309 case X86ISD::PACKSS:
43310 case X86ISD::PACKUS: {
43311 SDValue N0 = Op.getOperand(0);
43312 SDValue N1 = Op.getOperand(1);
43313
43314 APInt DemandedLHS, DemandedRHS;
43315 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43316
43317 APInt LHSUndef, LHSZero;
43318 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43319 Depth + 1))
43320 return true;
43321 APInt RHSUndef, RHSZero;
43322 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43323 Depth + 1))
43324 return true;
43325
43326 // TODO - pass on known zero/undef.
43327
43328 // Aggressively peek through ops to get at the demanded elts.
43329 // TODO - we should do this for all target/faux shuffles ops.
43330 if (!DemandedElts.isAllOnes()) {
43331 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43332 TLO.DAG, Depth + 1);
43333 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43334 TLO.DAG, Depth + 1);
43335 if (NewN0 || NewN1) {
43336 NewN0 = NewN0 ? NewN0 : N0;
43337 NewN1 = NewN1 ? NewN1 : N1;
43338 return TLO.CombineTo(Op,
43339 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43340 }
43341 }
43342 break;
43343 }
43344 case X86ISD::HADD:
43345 case X86ISD::HSUB:
43346 case X86ISD::FHADD:
43347 case X86ISD::FHSUB: {
43348 SDValue N0 = Op.getOperand(0);
43349 SDValue N1 = Op.getOperand(1);
43350
43351 APInt DemandedLHS, DemandedRHS;
43352 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43353
43354 APInt LHSUndef, LHSZero;
43355 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43356 Depth + 1))
43357 return true;
43358 APInt RHSUndef, RHSZero;
43359 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43360 Depth + 1))
43361 return true;
43362
43363 // TODO - pass on known zero/undef.
43364
43365 // Aggressively peek through ops to get at the demanded elts.
43366 // TODO: Handle repeated operands.
43367 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43368 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43369 TLO.DAG, Depth + 1);
43370 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43371 TLO.DAG, Depth + 1);
43372 if (NewN0 || NewN1) {
43373 NewN0 = NewN0 ? NewN0 : N0;
43374 NewN1 = NewN1 ? NewN1 : N1;
43375 return TLO.CombineTo(Op,
43376 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43377 }
43378 }
43379 break;
43380 }
43381 case X86ISD::VTRUNC:
43382 case X86ISD::VTRUNCS:
43383 case X86ISD::VTRUNCUS: {
43384 SDValue Src = Op.getOperand(0);
43385 MVT SrcVT = Src.getSimpleValueType();
43386 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43387 APInt SrcUndef, SrcZero;
43388 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43389 Depth + 1))
43390 return true;
43391 KnownZero = SrcZero.zextOrTrunc(NumElts);
43392 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43393 break;
43394 }
43395 case X86ISD::BLENDI: {
43396 SmallVector<int, 16> BlendMask;
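// The BLENDI immediate selects per element: a set bit i takes element i from
// the second operand, a clear bit takes it from the first. DecodeBLENDMask
// expands it into an explicit shuffle mask over both operands.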
43397 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43398 if (SDValue R = combineBlendOfPermutes(
43399 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43400 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43401 return TLO.CombineTo(Op, R);
43402 break;
43403 }
43404 case X86ISD::BLENDV: {
43405 APInt SelUndef, SelZero;
43406 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43407 SelZero, TLO, Depth + 1))
43408 return true;
43409
43410 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43411 APInt LHSUndef, LHSZero;
43412 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43413 LHSZero, TLO, Depth + 1))
43414 return true;
43415
43416 APInt RHSUndef, RHSZero;
43417 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43418 RHSZero, TLO, Depth + 1))
43419 return true;
43420
43421 KnownZero = LHSZero & RHSZero;
43422 KnownUndef = LHSUndef & RHSUndef;
43423 break;
43424 }
43425 case X86ISD::VZEXT_MOVL: {
43426 // If upper demanded elements are already zero then we have nothing to do.
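// VZEXT_MOVL keeps element 0 of its source and zeroes all higher elements, so
// if the source is already zero in every demanded upper position the node is
// a no-op.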
43427 SDValue Src = Op.getOperand(0);
43428 APInt DemandedUpperElts = DemandedElts;
43429 DemandedUpperElts.clearLowBits(1);
43430 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43431 return TLO.CombineTo(Op, Src);
43432 break;
43433 }
43434 case X86ISD::VZEXT_LOAD: {
43435 // If the upper elements are not demanded then simplify to a
43436 // scalar_to_vector(load()).
43437 MVT SVT = VT.getSimpleVT().getVectorElementType();
43438 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43439 SDLoc DL(Op);
43440 auto *Mem = cast<MemSDNode>(Op);
43441 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43442 Mem->getMemOperand());
43443 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43444 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43445 }
43446 break;
43447 }
43448 case X86ISD::VBROADCAST: {
43449 SDValue Src = Op.getOperand(0);
43450 MVT SrcVT = Src.getSimpleValueType();
43451 // Don't bother broadcasting if we just need the 0'th element.
43452 if (DemandedElts == 1) {
43453 if (!SrcVT.isVector())
43454 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
43455 else if (Src.getValueType() != VT)
43456 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43457 SDLoc(Op));
43458 return TLO.CombineTo(Op, Src);
43459 }
43460 if (!SrcVT.isVector())
43461 break;
43462 APInt SrcUndef, SrcZero;
43463 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43464 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43465 Depth + 1))
43466 return true;
43467 // Aggressively peek through src to get at the demanded elt.
43468 // TODO - we should do this for all target/faux shuffles ops.
43469 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43470 Src, SrcElts, TLO.DAG, Depth + 1))
43471 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43472 break;
43473 }
43474 case X86ISD::VPERMV:
43475 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43476 Depth))
43477 return true;
43478 break;
43479 case X86ISD::PSHUFB:
43480 case X86ISD::VPERMV3:
43481 case X86ISD::VPERMILPV:
43482 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43483 Depth))
43484 return true;
43485 break;
43486 case X86ISD::VPPERM:
43487 case X86ISD::VPERMIL2:
43488 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43489 Depth))
43490 return true;
43491 break;
43492 }
43493
43494 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43495 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43496 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43497 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43498 DemandedElts.lshr(NumElts / 2) == 0) {
43499 unsigned SizeInBits = VT.getSizeInBits();
43500 unsigned ExtSizeInBits = SizeInBits / 2;
43501
43502 // See if 512-bit ops only use the bottom 128-bits.
43503 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43504 ExtSizeInBits = SizeInBits / 4;
43505
43506 switch (Opc) {
43507 // Scalar broadcast.
43508 case X86ISD::VBROADCAST: {
43509 SDLoc DL(Op);
43510 SDValue Src = Op.getOperand(0);
43511 if (Src.getValueSizeInBits() > ExtSizeInBits)
43512 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43513 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43514 ExtSizeInBits / VT.getScalarSizeInBits());
43515 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43516 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43517 TLO.DAG, DL, ExtSizeInBits));
43518 }
43519 case X86ISD::VBROADCAST_LOAD: {
43520 SDLoc DL(Op);
43521 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43522 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43523 ExtSizeInBits / VT.getScalarSizeInBits());
43524 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43525 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43526 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43527 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43528 MemIntr->getMemOperand());
43529 TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1),
43530 Bcst.getValue(1));
43531 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43532 TLO.DAG, DL, ExtSizeInBits));
43533 }
43534 // Subvector broadcast.
43535 case X86ISD::SUBV_BROADCAST_LOAD: {
43536 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43537 EVT MemVT = MemIntr->getMemoryVT();
43538 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43539 SDLoc DL(Op);
43540 SDValue Ld =
43541 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43542 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43543 TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1),
43544 Ld.getValue(1));
43545 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43546 TLO.DAG, DL, ExtSizeInBits));
43547 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43548 SDLoc DL(Op);
43549 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43550 ExtSizeInBits / VT.getScalarSizeInBits());
43551 if (SDValue BcstLd =
43552 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43553 return TLO.CombineTo(Op,
43554 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43555 TLO.DAG, DL, ExtSizeInBits));
43556 }
43557 break;
43558 }
43559 // Byte shifts by immediate.
43560 case X86ISD::VSHLDQ:
43561 case X86ISD::VSRLDQ:
43562 // Shift by uniform.
43563 case X86ISD::VSHL:
43564 case X86ISD::VSRL:
43565 case X86ISD::VSRA:
43566 // Shift by immediate.
43567 case X86ISD::VSHLI:
43568 case X86ISD::VSRLI:
43569 case X86ISD::VSRAI: {
43570 SDLoc DL(Op);
43571 SDValue Ext0 =
43572 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43573 SDValue ExtOp =
43574 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43575 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43576 SDValue Insert =
43577 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43578 return TLO.CombineTo(Op, Insert);
43579 }
43580 case X86ISD::VPERMI: {
43581 // Simplify PERMPD/PERMQ to extract_subvector.
43582 // TODO: This should be done in shuffle combining.
43583 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43584 SmallVector<int, 4> Mask;
43585 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43586 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43587 SDLoc DL(Op);
43588 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43589 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43590 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43591 return TLO.CombineTo(Op, Insert);
43592 }
43593 }
43594 break;
43595 }
43596 case X86ISD::VPERM2X128: {
43597 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43598 SDLoc DL(Op);
43599 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
43600 if (LoMask & 0x8)
43601 return TLO.CombineTo(
43602 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43603 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43604 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43605 SDValue ExtOp =
43606 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43607 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43608 SDValue Insert =
43609 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43610 return TLO.CombineTo(Op, Insert);
43611 }
43612 // Conversions.
43613 // TODO: Add more CVT opcodes when we have test coverage.
43614 case X86ISD::CVTTP2SI:
43615 case X86ISD::CVTTP2UI:
43616 case X86ISD::CVTPH2PS: {
43617 SDLoc DL(Op);
43618 unsigned Scale = SizeInBits / ExtSizeInBits;
43619 SDValue SrcOp = Op.getOperand(0);
43620 MVT SrcVT = SrcOp.getSimpleValueType();
43621 unsigned SrcExtSize =
43622 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
43623 MVT ExtVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
43624 ExtSizeInBits / VT.getScalarSizeInBits());
43625 SDValue ExtOp = TLO.DAG.getNode(
43626 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
43627 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43628 SDValue Insert =
43629 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43630 return TLO.CombineTo(Op, Insert);
43631 }
43632 // Zero upper elements.
43633 case X86ISD::VZEXT_MOVL:
43634 // Variable blend.
43635 case X86ISD::BLENDV:
43636 // Target unary shuffles by immediate:
43637 case X86ISD::PSHUFD:
43638 case X86ISD::PSHUFLW:
43639 case X86ISD::PSHUFHW:
43640 case X86ISD::VPERMILPI:
43641 // (Non-Lane Crossing) Target Shuffles.
43642 case X86ISD::VPERMILPV:
43643 case X86ISD::VPERMIL2:
43644 case X86ISD::PSHUFB:
43645 case X86ISD::UNPCKL:
43646 case X86ISD::UNPCKH:
43647 case X86ISD::BLENDI:
43648 // Integer ops.
43649 case X86ISD::PACKSS:
43650 case X86ISD::PACKUS:
43651 case X86ISD::PCMPEQ:
43652 case X86ISD::PCMPGT:
43653 case X86ISD::PMULUDQ:
43654 case X86ISD::PMULDQ:
43655 case X86ISD::VSHLV:
43656 case X86ISD::VSRLV:
43657 case X86ISD::VSRAV:
43658 // Float ops.
43659 case X86ISD::FMAX:
43660 case X86ISD::FMIN:
43661 case X86ISD::FMAXC:
43662 case X86ISD::FMINC:
43663 case X86ISD::FRSQRT:
43664 case X86ISD::FRCP:
43665 // Horizontal Ops.
43666 case X86ISD::HADD:
43667 case X86ISD::HSUB:
43668 case X86ISD::FHADD:
43669 case X86ISD::FHSUB: {
43670 SDLoc DL(Op);
43671 SmallVector<SDValue, 4> Ops;
43672 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43673 SDValue SrcOp = Op.getOperand(i);
43674 EVT SrcVT = SrcOp.getValueType();
43675 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43676 "Unsupported vector size");
43677 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43678 ExtSizeInBits)
43679 : SrcOp);
43680 }
43681 MVT ExtVT = VT.getSimpleVT();
43682 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43683 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43684 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43685 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43686 SDValue Insert =
43687 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43688 return TLO.CombineTo(Op, Insert);
43689 }
43690 }
43691 }
43692
43693 // For splats, unless we *only* demand the 0'th element,
43694 // stop attempts at simplification here; we aren't going to improve things,
43695 // and this is better than any potential shuffle.
43696 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43697 return false;
43698
43699 // Get target/faux shuffle mask.
43700 APInt OpUndef, OpZero;
43701 SmallVector<int, 64> OpMask;
43702 SmallVector<SDValue, 2> OpInputs;
43703 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43704 OpZero, TLO.DAG, Depth, false))
43705 return false;
43706
43707 // Shuffle inputs must be the same size as the result.
43708 if (OpMask.size() != (unsigned)NumElts ||
43709 llvm::any_of(OpInputs, [VT](SDValue V) {
43710 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43711 !V.getValueType().isVector();
43712 }))
43713 return false;
43714
43715 KnownZero = OpZero;
43716 KnownUndef = OpUndef;
43717
43718 // Check if shuffle mask can be simplified to undef/zero/identity.
43719 int NumSrcs = OpInputs.size();
43720 for (int i = 0; i != NumElts; ++i)
43721 if (!DemandedElts[i])
43722 OpMask[i] = SM_SentinelUndef;
43723
43724 if (isUndefInRange(OpMask, 0, NumElts)) {
43725 KnownUndef.setAllBits();
43726 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43727 }
43728 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43729 KnownZero.setAllBits();
43730 return TLO.CombineTo(
43731 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43732 }
43733 for (int Src = 0; Src != NumSrcs; ++Src)
43734 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43735 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43736
43737 // Attempt to simplify inputs.
43738 for (int Src = 0; Src != NumSrcs; ++Src) {
43739 // TODO: Support inputs of different types.
43740 if (OpInputs[Src].getValueType() != VT)
43741 continue;
43742
43743 int Lo = Src * NumElts;
43744 APInt SrcElts = APInt::getZero(NumElts);
43745 for (int i = 0; i != NumElts; ++i)
43746 if (DemandedElts[i]) {
43747 int M = OpMask[i] - Lo;
43748 if (0 <= M && M < NumElts)
43749 SrcElts.setBit(M);
43750 }
43751
43752 // TODO - Propagate input undef/zero elts.
43753 APInt SrcUndef, SrcZero;
43754 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43755 TLO, Depth + 1))
43756 return true;
43757 }
43758
43759 // If we don't demand all elements, then attempt to combine to a simpler
43760 // shuffle.
43761 // We need to convert the depth to something combineX86ShufflesRecursively
43762 // can handle - so pretend its Depth == 0 again, and reduce the max depth
43763 // to match. This prevents combineX86ShuffleChain from returning a
43764 // combined shuffle that's the same as the original root, causing an
43765 // infinite loop.
43766 if (!DemandedElts.isAllOnes()) {
43767 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43768
43769 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43770 for (int i = 0; i != NumElts; ++i)
43771 if (DemandedElts[i])
43772 DemandedMask[i] = i;
43773
43774 SDValue NewShuffle = combineX86ShufflesRecursively(
43775 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43776 /*HasVarMask*/ false,
43777 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43778 Subtarget);
43779 if (NewShuffle)
43780 return TLO.CombineTo(Op, NewShuffle);
43781 }
43782
43783 return false;
43784}
43785
43786 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43787 SDValue Op, const APInt &OriginalDemandedBits,
43788 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43789 unsigned Depth) const {
43790 EVT VT = Op.getValueType();
43791 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43792 unsigned Opc = Op.getOpcode();
43793 switch(Opc) {
43794 case X86ISD::VTRUNC: {
43795 KnownBits KnownOp;
43796 SDValue Src = Op.getOperand(0);
43797 MVT SrcVT = Src.getSimpleValueType();
43798
43799 // Simplify the input, using demanded bit information.
43800 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43801 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43802 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43803 return true;
43804 break;
43805 }
43806 case X86ISD::PMULDQ:
43807 case X86ISD::PMULUDQ: {
43808 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43809 KnownBits KnownLHS, KnownRHS;
43810 SDValue LHS = Op.getOperand(0);
43811 SDValue RHS = Op.getOperand(1);
43812
43813 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43814 // FIXME: Can we bound this better?
43815 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43816 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43817 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43818
43819 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43820 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43821 DemandedMaskLHS = DemandedMask;
43822 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43823 DemandedMaskRHS = DemandedMask;
43824
43825 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43826 KnownLHS, TLO, Depth + 1))
43827 return true;
43828 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43829 KnownRHS, TLO, Depth + 1))
43830 return true;
43831
43832 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
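// Illustrative example: on v2i64 this turns PMULUDQ(X, <1,1>) into
// AND(X, <0xFFFFFFFF,0xFFFFFFFF>), i.e. a zero-extend of each 32-bit lane in
// place.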
43833 KnownRHS = KnownRHS.trunc(32);
43834 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43835 KnownRHS.getConstant().isOne()) {
43836 SDLoc DL(Op);
43837 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43838 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43839 }
43840
43841 // Aggressively peek through ops to get at the demanded low bits.
43842 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43843 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43844 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43845 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43846 if (DemandedLHS || DemandedRHS) {
43847 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43848 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43849 return TLO.CombineTo(
43850 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43851 }
43852 break;
43853 }
43854 case X86ISD::ANDNP: {
43855 KnownBits Known2;
43856 SDValue Op0 = Op.getOperand(0);
43857 SDValue Op1 = Op.getOperand(1);
43858
43859 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
43860 Known, TLO, Depth + 1))
43861 return true;
43862
43863 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
43864 OriginalDemandedElts, Known2, TLO, Depth + 1))
43865 return true;
43866
43867 // If the RHS is a constant, see if we can simplify it.
43868 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
43869 OriginalDemandedElts, TLO))
43870 return true;
43871
43872 // ANDNP = (~Op0 & Op1);
43873 Known.One &= Known2.Zero;
43874 Known.Zero |= Known2.One;
43875 break;
43876 }
43877 case X86ISD::VSHLI: {
43878 SDValue Op0 = Op.getOperand(0);
43879 SDValue Op1 = Op.getOperand(1);
43880
43881 unsigned ShAmt = Op1->getAsZExtVal();
43882 if (ShAmt >= BitWidth)
43883 break;
43884
43885 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43886
43887 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43888 // single shift. We can do this if the bottom bits (which are shifted
43889 // out) are never demanded.
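// Illustrative example: with only the top 24 bits of a 32-bit lane demanded,
// ((X >>u 3) << 5) can be rewritten as (X << 2), since the low bits that
// differ are never read.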
43890 if (Op0.getOpcode() == X86ISD::VSRLI &&
43891 OriginalDemandedBits.countr_zero() >= ShAmt) {
43892 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43893 if (Shift2Amt < BitWidth) {
43894 int Diff = ShAmt - Shift2Amt;
43895 if (Diff == 0)
43896 return TLO.CombineTo(Op, Op0.getOperand(0));
43897
43898 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43899 SDValue NewShift = TLO.DAG.getNode(
43900 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43901 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43902 return TLO.CombineTo(Op, NewShift);
43903 }
43904 }
43905
43906 // If we are only demanding sign bits then we can use the shift source directly.
43907 unsigned NumSignBits =
43908 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43909 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43910 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43911 return TLO.CombineTo(Op, Op0);
43912
43913 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43914 TLO, Depth + 1))
43915 return true;
43916
43917 Known.Zero <<= ShAmt;
43918 Known.One <<= ShAmt;
43919
43920 // Low bits known zero.
43921 Known.Zero.setLowBits(ShAmt);
43922
43923 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
43924 // Attempt to avoid multi-use ops if we don't need anything from them.
43925 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43926 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
43927 SDValue NewOp =
43928 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
43929 return TLO.CombineTo(Op, NewOp);
43930 }
43931 }
43932 return false;
43933 }
43934 case X86ISD::VSRLI: {
43935 SDValue Op0 = Op.getOperand(0);
43936 SDValue Op1 = Op.getOperand(1);
43937
43938 unsigned ShAmt = Op1->getAsZExtVal();
43939 if (ShAmt >= BitWidth)
43940 break;
43941
43942 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43943
43944 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43945 TLO, Depth + 1))
43946 return true;
43947
43948 Known.Zero.lshrInPlace(ShAmt);
43949 Known.One.lshrInPlace(ShAmt);
43950
43951 // High bits known zero.
43952 Known.Zero.setHighBits(ShAmt);
43953
43954 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
43955 // Attempt to avoid multi-use ops if we don't need anything from them.
43956 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43957 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
43958 SDValue NewOp =
43959 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
43960 return TLO.CombineTo(Op, NewOp);
43961 }
43962 }
43963 return false;
43964 }
43965 case X86ISD::VSRAI: {
43966 SDValue Op0 = Op.getOperand(0);
43967 SDValue Op1 = Op.getOperand(1);
43968
43969 unsigned ShAmt = Op1->getAsZExtVal();
43970 if (ShAmt >= BitWidth)
43971 break;
43972
43973 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43974
43975 // If we just want the sign bit then we don't need to shift it.
43976 if (OriginalDemandedBits.isSignMask())
43977 return TLO.CombineTo(Op, Op0);
43978
43979 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
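// e.g. for 32-bit lanes, ((X << 24) >>s 24) sign-extends the low byte of X,
// which is a no-op whenever X is already known to have at least 25 sign bits.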
43980 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
43981 SDValue Op00 = Op0.getOperand(0);
43982 unsigned NumSignBits =
43983 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43984 if (ShAmt < NumSignBits)
43985 return TLO.CombineTo(Op, Op00);
43986 }
43987
43988 // If any of the demanded bits are produced by the sign extension, we also
43989 // demand the input sign bit.
43990 if (OriginalDemandedBits.countl_zero() < ShAmt)
43991 DemandedMask.setSignBit();
43992
43993 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43994 TLO, Depth + 1))
43995 return true;
43996
43997 Known.Zero.lshrInPlace(ShAmt);
43998 Known.One.lshrInPlace(ShAmt);
43999
44000 // If the input sign bit is known to be zero, or if none of the top bits
44001 // are demanded, turn this into an unsigned shift right.
44002 if (Known.Zero[BitWidth - ShAmt - 1] ||
44003 OriginalDemandedBits.countl_zero() >= ShAmt)
44004 return TLO.CombineTo(
44005 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44006
44007 // High bits are known one.
44008 if (Known.One[BitWidth - ShAmt - 1])
44009 Known.One.setHighBits(ShAmt);
44010
44011 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44012 // Attempt to avoid multi-use ops if we don't need anything from them.
44013 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44014 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44015 SDValue NewOp =
44016 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44017 return TLO.CombineTo(Op, NewOp);
44018 }
44019 }
44020 return false;
44021 }
44022 case X86ISD::BLENDV: {
44023 SDValue Sel = Op.getOperand(0);
44024 SDValue LHS = Op.getOperand(1);
44025 SDValue RHS = Op.getOperand(2);
44026
44027 APInt SignMask = APInt::getSignMask(BitWidth);
44028 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44029 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44030 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44031 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44032 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44033 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44034
44035 if (NewSel || NewLHS || NewRHS) {
44036 NewSel = NewSel ? NewSel : Sel;
44037 NewLHS = NewLHS ? NewLHS : LHS;
44038 NewRHS = NewRHS ? NewRHS : RHS;
44039 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44040 NewSel, NewLHS, NewRHS));
44041 }
44042 break;
44043 }
44044 case X86ISD::PEXTRB:
44045 case X86ISD::PEXTRW: {
44046 SDValue Vec = Op.getOperand(0);
44047 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44048 MVT VecVT = Vec.getSimpleValueType();
44049 unsigned NumVecElts = VecVT.getVectorNumElements();
44050
44051 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44052 unsigned Idx = CIdx->getZExtValue();
44053 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44054
44055 // If we demand no bits from the vector then we must have demanded
44056 // bits from the implicit zext - simplify to zero.
44057 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44058 if (DemandedVecBits == 0)
44059 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44060
44061 APInt KnownUndef, KnownZero;
44062 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44063 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44064 KnownZero, TLO, Depth + 1))
44065 return true;
44066
44067 KnownBits KnownVec;
44068 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44069 KnownVec, TLO, Depth + 1))
44070 return true;
44071
44072 if (SDValue V = SimplifyMultipleUseDemandedBits(
44073 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44074 return TLO.CombineTo(
44075 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44076
44077 Known = KnownVec.zext(BitWidth);
44078 return false;
44079 }
44080 break;
44081 }
44082 case X86ISD::PINSRB:
44083 case X86ISD::PINSRW: {
44084 SDValue Vec = Op.getOperand(0);
44085 SDValue Scl = Op.getOperand(1);
44086 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44087 MVT VecVT = Vec.getSimpleValueType();
44088
44089 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44090 unsigned Idx = CIdx->getZExtValue();
44091 if (!OriginalDemandedElts[Idx])
44092 return TLO.CombineTo(Op, Vec);
44093
44094 KnownBits KnownVec;
44095 APInt DemandedVecElts(OriginalDemandedElts);
44096 DemandedVecElts.clearBit(Idx);
44097 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44098 KnownVec, TLO, Depth + 1))
44099 return true;
44100
44101 KnownBits KnownScl;
44102 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44103 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44104 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44105 return true;
44106
44107 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44108 Known = KnownVec.intersectWith(KnownScl);
44109 return false;
44110 }
44111 break;
44112 }
44113 case X86ISD::PACKSS:
44114 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44115 // sign bit then we can just ask for the source operands' sign bit.
44116 // TODO - add known bits handling.
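// (Saturation preserves the sign: e.g. PACKSSDW of i32 0x80000000 yields i16
// 0x8000, while any non-negative input saturates to a non-negative result.)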
44117 if (OriginalDemandedBits.isSignMask()) {
44118 APInt DemandedLHS, DemandedRHS;
44119 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44120
44121 KnownBits KnownLHS, KnownRHS;
44122 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44123 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44124 KnownLHS, TLO, Depth + 1))
44125 return true;
44126 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44127 KnownRHS, TLO, Depth + 1))
44128 return true;
44129
44130 // Attempt to avoid multi-use ops if we don't need anything from them.
44131 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44132 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44133 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44134 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44135 if (DemandedOp0 || DemandedOp1) {
44136 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44137 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44138 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44139 }
44140 }
44141 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44142 break;
44143 case X86ISD::VBROADCAST: {
44144 SDValue Src = Op.getOperand(0);
44145 MVT SrcVT = Src.getSimpleValueType();
44146 APInt DemandedElts = APInt::getOneBitSet(
44147 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44148 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44149 TLO, Depth + 1))
44150 return true;
44151 // If we don't need the upper bits, attempt to narrow the broadcast source.
44152 // Don't attempt this on AVX512 as it might affect broadcast folding.
44153 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
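// Illustrative example: (v4i64 vbroadcast (i64 X)) with only the low 32 bits
// of each element demanded can become
// (bitcast (v8i32 vbroadcast (i32 (trunc X)))).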
44154 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44155 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44156 Src->hasOneUse()) {
44157 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44158 SDValue NewSrc =
44159 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44160 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44161 SDValue NewBcst =
44162 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44163 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44164 }
44165 break;
44166 }
44167 case X86ISD::PCMPGT:
44168 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44169 // iff we only need the sign bit then we can use R directly.
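// e.g. (pcmpgt 0, X) is all-ones exactly when X is negative, so its sign bit
// already equals the sign bit of X.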
44170 if (OriginalDemandedBits.isSignMask() &&
44171 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44172 return TLO.CombineTo(Op, Op.getOperand(1));
44173 break;
44174 case X86ISD::MOVMSK: {
44175 SDValue Src = Op.getOperand(0);
44176 MVT SrcVT = Src.getSimpleValueType();
44177 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44178 unsigned NumElts = SrcVT.getVectorNumElements();
44179
44180 // If we don't need the sign bits at all just return zero.
44181 if (OriginalDemandedBits.countr_zero() >= NumElts)
44182 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44183
44184 // See if we only demand bits from the lower 128-bit vector.
44185 if (SrcVT.is256BitVector() &&
44186 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44187 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44188 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44189 }
44190
44191 // Only demand the vector elements of the sign bits we need.
44192 APInt KnownUndef, KnownZero;
44193 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44194 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44195 TLO, Depth + 1))
44196 return true;
44197
44198 Known.Zero = KnownZero.zext(BitWidth);
44199 Known.Zero.setHighBits(BitWidth - NumElts);
44200
44201 // MOVMSK only uses the MSB from each vector element.
44202 KnownBits KnownSrc;
44203 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44204 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44205 Depth + 1))
44206 return true;
44207
44208 if (KnownSrc.One[SrcBits - 1])
44209 Known.One.setLowBits(NumElts);
44210 else if (KnownSrc.Zero[SrcBits - 1])
44211 Known.Zero.setLowBits(NumElts);
44212
44213 // Attempt to avoid multi-use ops if we don't need anything from it.
44214 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44215 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44216 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44217 return false;
44218 }
44219 case X86ISD::TESTP: {
44220 SDValue Op0 = Op.getOperand(0);
44221 SDValue Op1 = Op.getOperand(1);
44222 MVT OpVT = Op0.getSimpleValueType();
44223 assert((OpVT.getVectorElementType() == MVT::f32 ||
44224 OpVT.getVectorElementType() == MVT::f64) &&
44225 "Illegal vector type for X86ISD::TESTP");
44226
44227 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44228 KnownBits KnownSrc;
44229 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44230 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44231 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44232 AssumeSingleUse) ||
44233 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44234 AssumeSingleUse);
44235 }
44236 case X86ISD::CMOV: {
44237 KnownBits Known2;
44238 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44239 OriginalDemandedElts, Known2, TLO, Depth + 1))
44240 return true;
44241 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44242 OriginalDemandedElts, Known, TLO, Depth + 1))
44243 return true;
44244
44245 // Only known if known in both the LHS and RHS.
44246 Known = Known.intersectWith(Known2);
44247 break;
44248 }
44249 case X86ISD::BEXTR:
44250 case X86ISD::BEXTRI: {
44251 SDValue Op0 = Op.getOperand(0);
44252 SDValue Op1 = Op.getOperand(1);
44253
44254 // Only bottom 16-bits of the control bits are required.
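// (The control word encodes the start index in bits [7:0] and the extract
// length in bits [15:8]; e.g. a control of 0x0804 extracts 8 bits starting
// at bit 4, so anything above bit 15 can be cleared.)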
44255 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44256 // NOTE: SimplifyDemandedBits won't do this for constants.
44257 uint64_t Val1 = Cst1->getZExtValue();
44258 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44259 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44260 SDLoc DL(Op);
44261 return TLO.CombineTo(
44262 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44263 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44264 }
44265
44266 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44267 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44268
44269 // If the length is 0, the result is 0.
44270 if (Length == 0) {
44271 Known.setAllZero();
44272 return false;
44273 }
44274
44275 if ((Shift + Length) <= BitWidth) {
44276 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44277 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44278 return true;
44279
44280 Known = Known.extractBits(Length, Shift);
44281 Known = Known.zextOrTrunc(BitWidth);
44282 return false;
44283 }
44284 } else {
44285 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44286 KnownBits Known1;
44287 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44288 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44289 return true;
44290
44291 // If the length is 0, replace with 0.
44292 KnownBits LengthBits = Known1.extractBits(8, 8);
44293 if (LengthBits.isZero())
44294 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44295 }
44296
44297 break;
44298 }
44299 case X86ISD::PDEP: {
44300 SDValue Op0 = Op.getOperand(0);
44301 SDValue Op1 = Op.getOperand(1);
44302
44303 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44304 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44305
44306 // If the demanded bits has leading zeroes, we don't demand those from the
44307 // mask.
44308 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44309 return true;
44310
44311 // The number of possible 1s in the mask determines the number of LSBs of
44312 // operand 0 used. Undemanded bits from the mask don't matter so filter
44313 // them before counting.
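// e.g. if the mask is known to have at most 3 set bits, PDEP can only consume
// the low 3 bits of operand 0, so higher source bits need not be demanded.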
44314 KnownBits Known2;
44315 uint64_t Count = (~Known.Zero & LoMask).popcount();
44316 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44317 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44318 return true;
44319
44320 // Zeroes are retained from the mask, but not ones.
44321 Known.One.clearAllBits();
44322 // The result will have at least as many trailing zeros as the non-mask
44323 // operand since bits can only map to the same or higher bit position.
44324 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44325 return false;
44326 }
44327 }
44328
44329 return TargetLowering::SimplifyDemandedBitsForTargetNode(
44330 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44331}
44332
44333 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44334 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
44335 SelectionDAG &DAG, unsigned Depth) const {
44336 int NumElts = DemandedElts.getBitWidth();
44337 unsigned Opc = Op.getOpcode();
44338 EVT VT = Op.getValueType();
44339
44340 switch (Opc) {
44341 case X86ISD::PINSRB:
44342 case X86ISD::PINSRW: {
44343 // If we don't demand the inserted element, return the base vector.
44344 SDValue Vec = Op.getOperand(0);
44345 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44346 MVT VecVT = Vec.getSimpleValueType();
44347 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44348 !DemandedElts[CIdx->getZExtValue()])
44349 return Vec;
44350 break;
44351 }
44352 case X86ISD::VSHLI: {
44353 // If we are only demanding sign bits then we can use the shift source
44354 // directly.
44355 SDValue Op0 = Op.getOperand(0);
44356 unsigned ShAmt = Op.getConstantOperandVal(1);
44357 unsigned BitWidth = DemandedBits.getBitWidth();
44358 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
44359 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
44360 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44361 return Op0;
44362 break;
44363 }
44364 case X86ISD::VSRAI:
44365 // iff we only need the sign bit then we can use the source directly.
44366 // TODO: generalize where we only demand extended signbits.
44367 if (DemandedBits.isSignMask())
44368 return Op.getOperand(0);
44369 break;
44370 case X86ISD::PCMPGT:
44371 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44372 // iff we only need the sign bit then we can use R directly.
44373 if (DemandedBits.isSignMask() &&
44374 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44375 return Op.getOperand(1);
44376 break;
44377 case X86ISD::BLENDV: {
44378 // BLENDV: Cond (MSB) ? LHS : RHS
44379 SDValue Cond = Op.getOperand(0);
44380 SDValue LHS = Op.getOperand(1);
44381 SDValue RHS = Op.getOperand(2);
44382
44383 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
44384 if (CondKnown.isNegative())
44385 return LHS;
44386 if (CondKnown.isNonNegative())
44387 return RHS;
44388 break;
44389 }
44390 case X86ISD::ANDNP: {
44391 // ANDNP = (~LHS & RHS);
44392 SDValue LHS = Op.getOperand(0);
44393 SDValue RHS = Op.getOperand(1);
44394
44395 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
44396 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
44397
44398 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
44399 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
44400 // this context, so return RHS.
44401 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
44402 return RHS;
44403 break;
44404 }
44405 }
44406
44407 APInt ShuffleUndef, ShuffleZero;
44408 SmallVector<int, 16> ShuffleMask;
44409 SmallVector<SDValue, 2> ShuffleOps;
44410 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
44411 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
44412 // If all the demanded elts are from one operand and are inline,
44413 // then we can use the operand directly.
44414 int NumOps = ShuffleOps.size();
44415 if (ShuffleMask.size() == (unsigned)NumElts &&
44416 llvm::all_of(ShuffleOps, [VT](SDValue V) {
44417 return VT.getSizeInBits() == V.getValueSizeInBits();
44418 })) {
44419
44420 if (DemandedElts.isSubsetOf(ShuffleUndef))
44421 return DAG.getUNDEF(VT);
44422 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
44423 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
44424
44425 // Bitmask that indicates which ops have only been accessed 'inline'.
44426 APInt IdentityOp = APInt::getAllOnes(NumOps);
44427 for (int i = 0; i != NumElts; ++i) {
44428 int M = ShuffleMask[i];
44429 if (!DemandedElts[i] || ShuffleUndef[i])
44430 continue;
44431 int OpIdx = M / NumElts;
44432 int EltIdx = M % NumElts;
44433 if (M < 0 || EltIdx != i) {
44434 IdentityOp.clearAllBits();
44435 break;
44436 }
44437 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
44438 if (IdentityOp == 0)
44439 break;
44440 }
44441 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
44442 "Multiple identity shuffles detected");
44443
44444 if (IdentityOp != 0)
44445 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
44446 }
44447 }
44448
44449 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44450 Op, DemandedBits, DemandedElts, DAG, Depth);
44451}
44452
44453 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44454 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44455 bool PoisonOnly, unsigned Depth) const {
44456 unsigned NumElts = DemandedElts.getBitWidth();
44457
44458 switch (Op.getOpcode()) {
44459 case X86ISD::PSHUFD:
44460 case X86ISD::VPERMILPI:
44461 case X86ISD::VPERMV3: {
44462 SmallVector<SDValue, 2> Ops;
44463 SmallVector<int, 8> Mask;
44464 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
44465 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
44466 APInt::getZero(NumElts));
44467 for (auto M : enumerate(Mask)) {
44468 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
44469 continue;
44470 if (M.value() == SM_SentinelUndef)
44471 return false;
44472 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
44473 "Shuffle mask index out of range");
44474 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
44475 }
44476 for (auto Op : enumerate(Ops))
44477 if (!DemandedSrcElts[Op.index()].isZero() &&
44478 !DAG.isGuaranteedNotToBeUndefOrPoison(
44479 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
44480 return false;
44481 return true;
44482 }
44483 break;
44484 }
44485 }
44486 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44487 Op, DemandedElts, DAG, PoisonOnly, Depth);
44488}
44489
44490 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
44491 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44492 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44493
44494 switch (Op.getOpcode()) {
44495 // SSE vector multiplies are either inbounds or saturate.
44496 case X86ISD::VPMADDUBSW:
44497 case X86ISD::VPMADDWD:
44498 // SSE vector shifts handle out of bounds shift amounts.
44499 case X86ISD::VSHLI:
44500 case X86ISD::VSRLI:
44501 case X86ISD::VSRAI:
44502 return false;
44503 case X86ISD::PSHUFD:
44504 case X86ISD::VPERMILPI:
44505 case X86ISD::VPERMV3:
44506 case X86ISD::UNPCKH:
44507 case X86ISD::UNPCKL:
44508 return false;
44509 // SSE comparisons handle all fcmp cases.
44510 // TODO: Add PCMPEQ/GT and CMPM/MM with test coverage.
44511 case X86ISD::CMPP:
44512 return false;
44513 case ISD::INTRINSIC_WO_CHAIN:
44514 switch (Op->getConstantOperandVal(0)) {
44515 case Intrinsic::x86_sse2_pmadd_wd:
44516 case Intrinsic::x86_avx2_pmadd_wd:
44517 case Intrinsic::x86_avx512_pmaddw_d_512:
44518 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
44519 case Intrinsic::x86_avx2_pmadd_ub_sw:
44520 case Intrinsic::x86_avx512_pmaddubs_w_512:
44521 return false;
44522 }
44523 }
44524 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
44525 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44526}
44527
44528 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
44529 const APInt &DemandedElts,
44530 APInt &UndefElts,
44531 const SelectionDAG &DAG,
44532 unsigned Depth) const {
44533 unsigned NumElts = DemandedElts.getBitWidth();
44534 unsigned Opc = Op.getOpcode();
44535
44536 switch (Opc) {
44537 case X86ISD::VBROADCAST:
44538 case X86ISD::VBROADCAST_LOAD:
44539 UndefElts = APInt::getZero(NumElts);
44540 return true;
44541 }
44542
44543 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44544 DAG, Depth);
44545}
44546
44547// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44548// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
44549static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44550 bool AllowTruncate) {
44551 switch (Src.getOpcode()) {
44552 case ISD::TRUNCATE:
44553 if (!AllowTruncate)
44554 return false;
44555 [[fallthrough]];
44556 case ISD::SETCC:
44557 return Src.getOperand(0).getValueSizeInBits() == Size;
44558 case ISD::FREEZE:
44559 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate);
44560 case ISD::AND:
44561 case ISD::XOR:
44562 case ISD::OR:
44563 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44564 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44565 case ISD::SELECT:
44566 case ISD::VSELECT:
44567 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44568 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44569 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44570 case ISD::BUILD_VECTOR:
44571 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44572 ISD::isBuildVectorAllOnes(Src.getNode());
44573 }
44574 return false;
44575}
44576
44577// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44578static unsigned getAltBitOpcode(unsigned Opcode) {
44579 switch(Opcode) {
44580 // clang-format off
44581 case ISD::AND: return X86ISD::FAND;
44582 case ISD::OR: return X86ISD::FOR;
44583 case ISD::XOR: return X86ISD::FXOR;
44584 case X86ISD::ANDNP: return X86ISD::FANDN;
44585 // clang-format on
44586 }
44587 llvm_unreachable("Unknown bitwise opcode");
44588}
44589
44590// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44591 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44592 const SDLoc &DL) {
44593 EVT SrcVT = Src.getValueType();
44594 if (SrcVT != MVT::v4i1)
44595 return SDValue();
44596
44597 switch (Src.getOpcode()) {
44598 case ISD::SETCC:
44599 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44600 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44601 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44602 SDValue Op0 = Src.getOperand(0);
44603 if (ISD::isNormalLoad(Op0.getNode()))
44604 return DAG.getBitcast(MVT::v4f32, Op0);
44605 if (Op0.getOpcode() == ISD::BITCAST &&
44606 Op0.getOperand(0).getValueType() == MVT::v4f32)
44607 return Op0.getOperand(0);
44608 }
44609 break;
44610 case ISD::AND:
44611 case ISD::XOR:
44612 case ISD::OR: {
44613 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44614 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44615 if (Op0 && Op1)
44616 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44617 Op1);
44618 break;
44619 }
44620 }
44621 return SDValue();
44622}
44623
44624// Helper to push sign extension of vXi1 SETCC result through bitops.
44625 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44626 SDValue Src, const SDLoc &DL) {
44627 switch (Src.getOpcode()) {
44628 case ISD::SETCC:
44629 case ISD::FREEZE:
44630 case ISD::TRUNCATE:
44631 case ISD::BUILD_VECTOR:
44632 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44633 case ISD::AND:
44634 case ISD::XOR:
44635 case ISD::OR:
44636 return DAG.getNode(
44637 Src.getOpcode(), DL, SExtVT,
44638 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44639 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44640 case ISD::SELECT:
44641 case ISD::VSELECT:
44642 return DAG.getSelect(
44643 DL, SExtVT, Src.getOperand(0),
44644 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44645 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44646 }
44647 llvm_unreachable("Unexpected node type for vXi1 sign extension");
44648}
44649
44650// Try to match patterns such as
44651// (i16 bitcast (v16i1 x))
44652// ->
44653// (i16 movmsk (16i8 sext (v16i1 x)))
44654// before the illegal vector is scalarized on subtargets that don't have legal
44655// vxi1 types.
44656 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44657 const SDLoc &DL,
44658 const X86Subtarget &Subtarget) {
44659 EVT SrcVT = Src.getValueType();
44660 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44661 return SDValue();
44662
44663 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44664 // legalization destroys the v4i32 type.
44665 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44666 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44667 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44668 DAG.getBitcast(MVT::v4f32, V));
44669 return DAG.getZExtOrTrunc(V, DL, VT);
44670 }
44671 }
44672
44673 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44674 // movmskb even with avx512. This will be better than truncating to vXi1 and
44675 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44676 // vpcmpeqb/vpcmpgtb.
44677 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44678 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44679 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44680 Src.getOperand(0).getValueType() == MVT::v64i8);
44681
44682 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44683 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44684 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44685 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44686 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44687 EVT CmpVT = Src.getOperand(0).getValueType();
44688 EVT EltVT = CmpVT.getVectorElementType();
44689 if (CmpVT.getSizeInBits() <= 256 &&
44690 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44691 PreferMovMsk = true;
44692 }
44693
44694 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44695 // MOVMSK is supported in SSE2 or later.
44696 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44697 return SDValue();
44698
44699 // If the upper ops of a concatenation are undef, then try to bitcast the
44700 // lower op and extend.
44701 SmallVector<SDValue, 4> SubSrcOps;
44702 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44703 SubSrcOps.size() >= 2) {
44704 SDValue LowerOp = SubSrcOps[0];
44705 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44706 if (LowerOp.getOpcode() == ISD::SETCC &&
44707 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44708 EVT SubVT = VT.getIntegerVT(
44709 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44710 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44711 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44712 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44713 }
44714 }
44715 }
44716
44717 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44718 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44719 // v8i16 and v16i16.
44720 // For these two cases, we can shuffle the upper element bytes to a
44721 // consecutive sequence at the start of the vector and treat the results as
44722 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44723 // for v16i16 this is not the case, because the shuffle is expensive, so we
44724 // avoid sign-extending to this type entirely.
44725 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44726 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44727 MVT SExtVT;
44728 bool PropagateSExt = false;
44729 switch (SrcVT.getSimpleVT().SimpleTy) {
44730 default:
44731 return SDValue();
44732 case MVT::v2i1:
44733 SExtVT = MVT::v2i64;
44734 break;
44735 case MVT::v4i1:
44736 SExtVT = MVT::v4i32;
44737 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44738 // sign-extend to a 256-bit operation to avoid truncation.
44739 if (Subtarget.hasAVX() &&
44740 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44741 SExtVT = MVT::v4i64;
44742 PropagateSExt = true;
44743 }
44744 break;
44745 case MVT::v8i1:
44746 SExtVT = MVT::v8i16;
44747 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44748 // sign-extend to a 256-bit operation to match the compare.
44749 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44750 // 256-bit because the shuffle is cheaper than sign extending the result of
44751 // the compare.
44752 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44753 checkBitcastSrcVectorSize(Src, 512, true))) {
44754 SExtVT = MVT::v8i32;
44755 PropagateSExt = true;
44756 }
44757 break;
44758 case MVT::v16i1:
44759 SExtVT = MVT::v16i8;
44760 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44761 // it is not profitable to sign-extend to 256-bit because this will
44762 // require an extra cross-lane shuffle which is more expensive than
44763 // truncating the result of the compare to 128-bits.
44764 break;
44765 case MVT::v32i1:
44766 SExtVT = MVT::v32i8;
44767 break;
44768 case MVT::v64i1:
44769 // If we have AVX512F but not AVX512BW, and the input was truncated from
44770 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
44771 if (Subtarget.hasAVX512()) {
44772 if (Subtarget.hasBWI())
44773 return SDValue();
44774 SExtVT = MVT::v64i8;
44775 break;
44776 }
44777 // Split if this is a <64 x i8> comparison result.
44778 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44779 SExtVT = MVT::v64i8;
44780 break;
44781 }
44782 return SDValue();
44783 };
44784
44785 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44786 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44787
44788 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44789 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44790 } else {
44791 if (SExtVT == MVT::v8i16) {
44792 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
44793 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
44794 }
44795 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44796 }
44797
44798 EVT IntVT =
44799 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44800 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44801 return DAG.getBitcast(VT, V);
44802}
44803
44804// Convert a vXi1 constant build vector to the same width scalar integer.
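// e.g. (v4i1 <1,0,1,1>) becomes the i4 constant 0b1101, with element 0
// mapping to bit 0.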
44805 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44806 EVT SrcVT = Op.getValueType();
44807 assert(SrcVT.getVectorElementType() == MVT::i1 &&
44808 "Expected a vXi1 vector");
44810 "Expected a constant build vector");
44811
44812 APInt Imm(SrcVT.getVectorNumElements(), 0);
44813 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44814 SDValue In = Op.getOperand(Idx);
44815 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
44816 Imm.setBit(Idx);
44817 }
44818 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44819 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44820}
44821
44822 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44823 TargetLowering::DAGCombinerInfo &DCI,
44824 const X86Subtarget &Subtarget) {
44825 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44826
44827 if (!DCI.isBeforeLegalizeOps())
44828 return SDValue();
44829
44830 // Only do this if we have k-registers.
44831 if (!Subtarget.hasAVX512())
44832 return SDValue();
44833
44834 EVT DstVT = N->getValueType(0);
44835 SDValue Op = N->getOperand(0);
44836 EVT SrcVT = Op.getValueType();
44837
44838 if (!Op.hasOneUse())
44839 return SDValue();
44840
44841 // Look for logic ops.
44842 if (Op.getOpcode() != ISD::AND &&
44843 Op.getOpcode() != ISD::OR &&
44844 Op.getOpcode() != ISD::XOR)
44845 return SDValue();
44846
44847 // Make sure we have a bitcast between mask registers and a scalar type.
44848 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44849 DstVT.isScalarInteger()) &&
44850 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44851 SrcVT.isScalarInteger()))
44852 return SDValue();
44853
44854 SDValue LHS = Op.getOperand(0);
44855 SDValue RHS = Op.getOperand(1);
44856
44857 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44858 LHS.getOperand(0).getValueType() == DstVT)
44859 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44860 DAG.getBitcast(DstVT, RHS));
44861
44862 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44863 RHS.getOperand(0).getValueType() == DstVT)
44864 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44865 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44866
44867 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44868 // Most of these have to move a constant from the scalar domain anyway.
44869 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44870 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44871 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44872 DAG.getBitcast(DstVT, LHS), RHS);
44873 }
44874
44875 return SDValue();
44876}
44877
44878 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44879 const X86Subtarget &Subtarget) {
44880 SDLoc DL(BV);
44881 unsigned NumElts = BV->getNumOperands();
44882 SDValue Splat = BV->getSplatValue();
44883
44884 // Build MMX element from integer GPR or SSE float values.
44885 auto CreateMMXElement = [&](SDValue V) {
44886 if (V.isUndef())
44887 return DAG.getUNDEF(MVT::x86mmx);
44888 if (V.getValueType().isFloatingPoint()) {
44889 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44890 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44891 V = DAG.getBitcast(MVT::v2i64, V);
44892 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44893 }
44894 V = DAG.getBitcast(MVT::i32, V);
44895 } else {
44896 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44897 }
44898 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44899 };
44900
44901 // Convert build vector ops to MMX data in the bottom elements.
44902 SmallVector<SDValue, 8> Ops;
44903
44904 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44905
44906 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
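// e.g. a v8i8 splat first unpacks the byte against itself (punpcklbw) so the
// value fills a 16-bit element, then pshufw with an immediate of 0 copies
// that word into all four lanes.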
44907 if (Splat) {
44908 if (Splat.isUndef())
44909 return DAG.getUNDEF(MVT::x86mmx);
44910
44911 Splat = CreateMMXElement(Splat);
44912
44913 if (Subtarget.hasSSE1()) {
44914 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44915 if (NumElts == 8)
44916 Splat = DAG.getNode(
44917 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44918 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44919 TLI.getPointerTy(DAG.getDataLayout())),
44920 Splat, Splat);
44921
44922 // Use PSHUFW to repeat 16-bit elements.
44923 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44924 return DAG.getNode(
44925 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44926 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44927 TLI.getPointerTy(DAG.getDataLayout())),
44928 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44929 }
44930 Ops.append(NumElts, Splat);
44931 } else {
44932 for (unsigned i = 0; i != NumElts; ++i)
44933 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44934 }
44935
44936 // Use tree of PUNPCKLs to build up general MMX vector.
44937 while (Ops.size() > 1) {
44938 unsigned NumOps = Ops.size();
44939 unsigned IntrinOp =
44940 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44941 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44942 : Intrinsic::x86_mmx_punpcklbw));
44943 SDValue Intrin = DAG.getTargetConstant(
44944 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44945 for (unsigned i = 0; i != NumOps; i += 2)
44946 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44947 Ops[i], Ops[i + 1]);
44948 Ops.resize(NumOps / 2);
44949 }
44950
44951 return Ops[0];
44952}
44953
44954// Recursive function that attempts to find if a bool vector node was originally
44955// a vector/float/double that got truncated/extended/bitcast to/from a scalar
44956// integer. If so, replace the scalar ops with bool vector equivalents back down
44957// the chain.
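// Illustrative example: (v16i1 bitcast (i16 or (i16 bitcast (v16i1 X)),
//                                              (i16 bitcast (v16i1 Y))))
// can be rebuilt as (or (v16i1 X), (v16i1 Y)) without leaving the mask domain.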
44958 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44959 SelectionDAG &DAG,
44960 const X86Subtarget &Subtarget,
44961 unsigned Depth = 0) {
44962 if (Depth >= SelectionDAG::MaxRecursionDepth)
44963 return SDValue(); // Limit search depth.
44964
44965 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44966 unsigned Opc = V.getOpcode();
44967 switch (Opc) {
44968 case ISD::BITCAST: {
44969 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44970 SDValue Src = V.getOperand(0);
44971 EVT SrcVT = Src.getValueType();
44972 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44973 return DAG.getBitcast(VT, Src);
44974 break;
44975 }
44976 case ISD::Constant: {
44977 auto *C = cast<ConstantSDNode>(V);
44978 if (C->isZero())
44979 return DAG.getConstant(0, DL, VT);
44980 if (C->isAllOnes())
44981 return DAG.getAllOnesConstant(DL, VT);
44982 break;
44983 }
44984 case ISD::TRUNCATE: {
44985 // If we find a suitable source, a truncated scalar becomes a subvector.
44986 SDValue Src = V.getOperand(0);
44987 EVT NewSrcVT =
44988 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44989 if (TLI.isTypeLegal(NewSrcVT))
44990 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
44991 Subtarget, Depth + 1))
44992 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44993 DAG.getVectorIdxConstant(0, DL));
44994 break;
44995 }
44996 case ISD::ANY_EXTEND:
44997 case ISD::ZERO_EXTEND: {
44998 // If we find a suitable source, an extended scalar becomes a subvector.
44999 SDValue Src = V.getOperand(0);
45000 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45001 Src.getScalarValueSizeInBits());
45002 if (TLI.isTypeLegal(NewSrcVT))
45003 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45004 Subtarget, Depth + 1))
45005 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45006 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45007 : DAG.getConstant(0, DL, VT),
45008 N0, DAG.getVectorIdxConstant(0, DL));
45009 break;
45010 }
45011 case ISD::OR:
45012 case ISD::XOR: {
45013 // If we find suitable sources, we can just move the op to the vector
45014 // domain.
45015 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45016 Subtarget, Depth + 1))
45017 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45018 Subtarget, Depth + 1))
45019 return DAG.getNode(Opc, DL, VT, N0, N1);
45020 break;
45021 }
45022 case ISD::SHL: {
45023 // If we find a suitable source, a SHL becomes a KSHIFTL.
45024 SDValue Src0 = V.getOperand(0);
45025 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45026 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45027 break;
45028
45029 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45030 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45031 Depth + 1))
45032 return DAG.getNode(
45033 X86ISD::KSHIFTL, DL, VT, N0,
45034 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45035 break;
45036 }
45037 }
45038
45039 // Does the inner bitcast already exist?
45040 if (Depth > 0)
45041 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45042 return SDValue(Alt, 0);
45043
45044 return SDValue();
45045}
45046
45047 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45048 TargetLowering::DAGCombinerInfo &DCI,
45049 const X86Subtarget &Subtarget) {
45050 SDValue N0 = N->getOperand(0);
45051 EVT VT = N->getValueType(0);
45052 EVT SrcVT = N0.getValueType();
45053 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45054
45055 // Try to match patterns such as
45056 // (i16 bitcast (v16i1 x))
45057 // ->
45058 // (i16 movmsk (16i8 sext (v16i1 x)))
45059 // before the setcc result is scalarized on subtargets that don't have legal
45060 // vxi1 types.
45061 if (DCI.isBeforeLegalize()) {
45062 SDLoc dl(N);
45063 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45064 return V;
45065
45066 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45067 // type, widen both sides to avoid a trip through memory.
45068 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45069 Subtarget.hasAVX512()) {
45070 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45071 N0 = DAG.getBitcast(MVT::v8i1, N0);
45072 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45073 DAG.getVectorIdxConstant(0, dl));
45074 }
45075
45076 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45077 // type, widen both sides to avoid a trip through memory.
45078 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45079 Subtarget.hasAVX512()) {
45080 // Use zeros for the widening if we already have some zeroes. This can
45081 // allow SimplifyDemandedBits to remove scalar ANDs that may be
45082 // downstream of this.
45083 // FIXME: It might make sense to detect a concat_vectors with a mix of
45084 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45085 // a separate combine. What we can't do is canonicalize the operands of
45086 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45087 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45088 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45089 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45090 SrcVT = LastOp.getValueType();
45091 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45092 SmallVector<SDValue, 4> Ops(N0->ops());
45093 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45094 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45095 N0 = DAG.getBitcast(MVT::i8, N0);
45096 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45097 }
45098 }
45099
45100 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45101 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45102 Ops[0] = N0;
45103 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45104 N0 = DAG.getBitcast(MVT::i8, N0);
45105 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45106 }
45107 } else if (DCI.isAfterLegalizeDAG()) {
45108 // If we're bitcasting from iX to vXi1, see if the integer originally
45109 // began as a vXi1 and whether we can remove the bitcast entirely.
45110 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45111 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45112 if (SDValue V =
45113 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45114 return V;
45115 }
45116 }
45117
45118 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45119 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45120 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45121 // we can help with known bits propagation from the vXi1 domain to the
45122 // scalar domain.
45123 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45124 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45125 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45126 isNullConstant(N0.getOperand(1)))
45127 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45128 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45129
45130 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45131 // and the vbroadcast_load are both integer or both fp. In some cases this
45132 // will remove the bitcast entirely.
45133 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45134 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45135 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45136 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45137 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45138 // Don't swap i8/i16 since we don't have fp types of that size.
45139 if (MemSize >= 32) {
45140 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45141 : MVT::getIntegerVT(MemSize);
45142 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45143 : MVT::getIntegerVT(SrcVTSize);
45144 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45145
45146 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45147 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45148 SDValue ResNode =
45149 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45150 MemVT, BCast->getMemOperand());
45151 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45152 return DAG.getBitcast(VT, ResNode);
45153 }
45154 }
45155
45156 // Since MMX types are special and don't usually play with other vector types,
45157 // it's better to handle them early to be sure we emit efficient code by
45158 // avoiding store-load conversions.
45159 if (VT == MVT::x86mmx) {
45160 // Detect MMX constant vectors.
45161 APInt UndefElts;
45162 SmallVector<APInt, 1> EltBits;
45163 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45164 /*AllowWholeUndefs*/ true,
45165 /*AllowPartialUndefs*/ true)) {
45166 SDLoc DL(N0);
45167 // Handle zero-extension of i32 with MOVD.
45168 if (EltBits[0].countl_zero() >= 32)
45169 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45170 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45171 // Else, bitcast to a double.
45172 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45173 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45174 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45175 }
45176
45177 // Detect bitcasts to x86mmx low word.
45178 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45179 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45180 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45181 bool LowUndef = true, AllUndefOrZero = true;
45182 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45183 SDValue Op = N0.getOperand(i);
45184 LowUndef &= Op.isUndef() || (i >= e/2);
45185 AllUndefOrZero &= isNullConstantOrUndef(Op);
45186 }
45187 if (AllUndefOrZero) {
45188 SDValue N00 = N0.getOperand(0);
45189 SDLoc dl(N00);
45190 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45191 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45192 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45193 }
45194 }
45195
45196 // Detect bitcasts of 64-bit build vectors and convert to a
45197 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45198 // lowest element.
45199 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45200 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45201 SrcVT == MVT::v8i8))
45202 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45203
45204 // Detect bitcasts between element or subvector extraction to x86mmx.
45205 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45206 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45207 isNullConstant(N0.getOperand(1))) {
45208 SDValue N00 = N0.getOperand(0);
45209 if (N00.getValueType().is128BitVector())
45210 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45211 DAG.getBitcast(MVT::v2i64, N00));
45212 }
45213
45214 // Detect bitcasts from FP_TO_SINT to x86mmx.
45215 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
45216 SDLoc DL(N0);
45217 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
45218 DAG.getUNDEF(MVT::v2i32));
45219 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
45220 DAG.getBitcast(MVT::v2i64, Res));
45221 }
45222 }
45223
45224 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
45225 // most of these to scalar anyway.
45226 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
45227 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45228 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
45229 return combinevXi1ConstantToInteger(N0, DAG);
45230 }
45231
45232 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
45233 VT.getVectorElementType() == MVT::i1) {
45234 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
45235 if (C->isAllOnes())
45236 return DAG.getConstant(1, SDLoc(N0), VT);
45237 if (C->isZero())
45238 return DAG.getConstant(0, SDLoc(N0), VT);
45239 }
45240 }
45241
45242 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
45243 // Turn it into a sign bit compare that produces a k-register. This avoids
45244 // a trip through a GPR.
45245 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
45246 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
45247 isPowerOf2_32(VT.getVectorNumElements())) {
45248 unsigned NumElts = VT.getVectorNumElements();
45249 SDValue Src = N0;
45250
45251 // Peek through truncate.
45252 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
45253 Src = N0.getOperand(0);
45254
45255 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
45256 SDValue MovmskIn = Src.getOperand(0);
45257 MVT MovmskVT = MovmskIn.getSimpleValueType();
45258 unsigned MovMskElts = MovmskVT.getVectorNumElements();
45259
45260 // We allow extra bits of the movmsk to be used since they are known zero.
45261 // We can't convert a VPMOVMSKB without avx512bw.
45262 if (MovMskElts <= NumElts &&
45263 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
45264 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
45265 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
45266 SDLoc dl(N);
45267 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
45268 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
45269 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
45270 if (EVT(CmpVT) == VT)
45271 return Cmp;
45272
45273 // Pad with zeroes up to original VT to replace the zeroes that were
45274 // being used from the MOVMSK.
45275 unsigned NumConcats = NumElts / MovMskElts;
45276 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
45277 Ops[0] = Cmp;
45278 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
45279 }
45280 }
45281 }
45282
45283 // Try to remove bitcasts from input and output of mask arithmetic to
45284 // remove GPR<->K-register crossings.
45285 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
45286 return V;
45287
45288 // Convert a bitcasted integer logic operation that has one bitcasted
45289 // floating-point operand into a floating-point logic operation. This may
45290 // create a load of a constant, but that is cheaper than materializing the
45291 // constant in an integer register and transferring it to an SSE register or
45292 // transferring the SSE operand to an integer register and back.
45293 unsigned FPOpcode;
45294 switch (N0.getOpcode()) {
45295 // clang-format off
45296 case ISD::AND: FPOpcode = X86ISD::FAND; break;
45297 case ISD::OR: FPOpcode = X86ISD::FOR; break;
45298 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
45299 default: return SDValue();
45300 // clang-format on
45301 }
45302
45303 // Check if we have a bitcast from another integer type as well.
45304 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
45305 (Subtarget.hasSSE2() && VT == MVT::f64) ||
45306 (Subtarget.hasFP16() && VT == MVT::f16) ||
45307 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
45308 TLI.isTypeLegal(VT))))
45309 return SDValue();
45310
45311 SDValue LogicOp0 = N0.getOperand(0);
45312 SDValue LogicOp1 = N0.getOperand(1);
45313 SDLoc DL0(N0);
45314
45315 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
45316 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
45317 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
45318 LogicOp0.getOperand(0).getValueType() == VT &&
45319 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
45320 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
45321 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45322 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
45323 }
45324 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
45325 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
45326 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
45327 LogicOp1.getOperand(0).getValueType() == VT &&
45328 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
45329 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
45330 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45331 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
45332 }
45333
45334 return SDValue();
45335}
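// For illustration, the floating-point logic fold above rewrites patterns
// roughly of the form:
//   (f32 (bitcast (and (i32 (bitcast X:f32)), Y:i32)))
//     --> (X86ISD::FAND X, (f32 (bitcast Y)))
// so the value stays in the SSE register file instead of bouncing through a
// GPR for the integer AND/OR/XOR.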
45336
45337// (mul (zext a), (sext b))
45338static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
45339 SDValue &Op1) {
45340 Op0 = Mul.getOperand(0);
45341 Op1 = Mul.getOperand(1);
45342
45343 // Operand 1 should be the sign-extended value.
45344 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
45345 std::swap(Op0, Op1);
45346
45347 auto IsFreeTruncation = [](SDValue &Op) -> bool {
45348 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
45349 Op.getOpcode() == ISD::SIGN_EXTEND) &&
45350 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
45351 return true;
45352
45353 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
45354 return (BV && BV->isConstant());
45355 };
45356
45357 // (dpbusd (zext a), (sext b)). The first operand must be an unsigned value,
45358 // so we check that Op0 is a zero-extended value; Op1 must be a signed value,
45359 // so we just check its significant (sign) bits.
45360 if ((IsFreeTruncation(Op0) &&
45361 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
45362 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
45363 return true;
45364
45365 return false;
45366}
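// For illustration, detectExtMul accepts multiplies roughly of the form
//   (mul (zext vXi8 A), (sext vXi8 B))
// returning the unsigned (zext) operand in Op0 and the signed (sext) operand
// in Op1, matching the unsigned*signed operand order that VPDPBUSD expects.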
45367
45368 // Given an ABS node, detect the following pattern:
45369// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
45370// This is useful as it is the input into a SAD pattern.
45371static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
45372 SDValue AbsOp1 = Abs->getOperand(0);
45373 if (AbsOp1.getOpcode() != ISD::SUB)
45374 return false;
45375
45376 Op0 = AbsOp1.getOperand(0);
45377 Op1 = AbsOp1.getOperand(1);
45378
45379 // Check if the operands of the sub are zero-extended from vectors of i8.
45380 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
45381 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
45382 Op1.getOpcode() != ISD::ZERO_EXTEND ||
45383 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
45384 return false;
45385
45386 return true;
45387}
45388
45389 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
45390 unsigned &LogBias, const SDLoc &DL,
45391 const X86Subtarget &Subtarget) {
45392 // Extend or truncate to MVT::i8 first.
45393 MVT Vi8VT =
45394 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
45395 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
45396 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
45397
45398 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
45399 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
45400 // The src A, B element type is i8, but the dst C element type is i32.
45401 // When we calculate the reduction stages we use the vXi8 source vector type,
45402 // so we need a log-bias of 2 to avoid 2 extra stages.
45403 LogBias = 2;
45404
45405 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
45406 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
45407 RegSize = std::max(512u, RegSize);
45408
45409 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45410 // fill in the missing vector elements with 0.
45411 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
45412 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
45413 Ops[0] = LHS;
45414 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45415 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45416 Ops[0] = RHS;
45417 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45418
45419 // Actually build the DotProduct, split as 256/512 bits for
45420 // AVXVNNI/AVX512VNNI.
45421 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45422 ArrayRef<SDValue> Ops) {
45423 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45424 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
45425 };
45426 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
45427 SDValue Zero = DAG.getConstant(0, DL, DpVT);
45428
45429 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
45430 DpBuilder, false);
45431}
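// For illustration of the LogBias above: each VPDPBUSD lane already folds 4
// byte products into a single i32 accumulator, so a reduction over N byte
// products only needs roughly log2(N) - 2 further shuffle+add stages over the
// i32 partial sums (e.g. 2 stages instead of 4 for a 16-element reduction).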
45432
45433// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
45434// to these zexts.
45435static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
45436 const SDValue &Zext1, const SDLoc &DL,
45437 const X86Subtarget &Subtarget) {
45438 // Find the appropriate width for the PSADBW.
45439 EVT InVT = Zext0.getOperand(0).getValueType();
45440 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
45441
45442 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45443 // fill in the missing vector elements with 0.
45444 unsigned NumConcat = RegSize / InVT.getSizeInBits();
45445 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
45446 Ops[0] = Zext0.getOperand(0);
45447 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45448 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45449 Ops[0] = Zext1.getOperand(0);
45450 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45451
45452 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45453 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45454 ArrayRef<SDValue> Ops) {
45455 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45456 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
45457 };
45458 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
45459 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
45460 PSADBWBuilder);
45461}
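// For illustration, each 128-bit PSADBW sums absolute byte differences into
// two i64 lanes, roughly:
//   Res[0] = |A[0]-B[0]| + ... + |A[7]-B[7]|
//   Res[1] = |A[8]-B[8]| + ... + |A[15]-B[15]|
// which is why padding the <k x i8> inputs with zero vectors above does not
// change the accumulated sums.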
45462
45463 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
45464// PHMINPOSUW.
45465 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
45466 const X86Subtarget &Subtarget) {
45467 // Bail without SSE41.
45468 if (!Subtarget.hasSSE41())
45469 return SDValue();
45470
45471 EVT ExtractVT = Extract->getValueType(0);
45472 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
45473 return SDValue();
45474
45475 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
45476 ISD::NodeType BinOp;
45477 SDValue Src = DAG.matchBinOpReduction(
45478 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
45479 if (!Src)
45480 return SDValue();
45481
45482 EVT SrcVT = Src.getValueType();
45483 EVT SrcSVT = SrcVT.getScalarType();
45484 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
45485 return SDValue();
45486
45487 SDLoc DL(Extract);
45488 SDValue MinPos = Src;
45489
45490 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
45491 while (SrcVT.getSizeInBits() > 128) {
45492 SDValue Lo, Hi;
45493 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
45494 SrcVT = Lo.getValueType();
45495 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
45496 }
45497 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
45498 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
45499 "Unexpected value type");
45500
45501 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
45502 // to flip the value accordingly.
45503 SDValue Mask;
45504 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
45505 if (BinOp == ISD::SMAX)
45506 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
45507 else if (BinOp == ISD::SMIN)
45508 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
45509 else if (BinOp == ISD::UMAX)
45510 Mask = DAG.getAllOnesConstant(DL, SrcVT);
45511
45512 if (Mask)
45513 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45514
45515 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
45516 // shuffling each upper element down and insert zeros. This means that the
45517 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
45518 // ready for the PHMINPOS.
45519 if (ExtractVT == MVT::i8) {
45520 SDValue Upper = DAG.getVectorShuffle(
45521 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
45522 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
45523 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
45524 }
45525
45526 // Perform the PHMINPOS on a v8i16 vector.
45527 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
45528 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
45529 MinPos = DAG.getBitcast(SrcVT, MinPos);
45530
45531 if (Mask)
45532 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45533
45534 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
45535 DAG.getVectorIdxConstant(0, DL));
45536}
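// For illustration, PHMINPOSUW only computes an unsigned v8i16 minimum, so
// the other reductions are mapped onto it by XORing with a constant before
// and after, e.g. for SMAX on 16-bit lanes (roughly):
//   smax(a, b) == 0x7FFF ^ umin(a ^ 0x7FFF, b ^ 0x7FFF)
// while UMAX uses an all-ones mask, i.e. umax(a, b) == ~umin(~a, ~b).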
45537
45538// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
45539 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
45540 const X86Subtarget &Subtarget) {
45541 // Bail without SSE2.
45542 if (!Subtarget.hasSSE2())
45543 return SDValue();
45544
45545 EVT ExtractVT = Extract->getValueType(0);
45546 unsigned BitWidth = ExtractVT.getSizeInBits();
45547 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
45548 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
45549 return SDValue();
45550
45551 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
45552 ISD::NodeType BinOp;
45553 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
45554 if (!Match && ExtractVT == MVT::i1)
45555 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
45556 if (!Match)
45557 return SDValue();
45558
45559 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
45560 // which we can't support here for now.
45561 if (Match.getScalarValueSizeInBits() != BitWidth)
45562 return SDValue();
45563
45564 SDValue Movmsk;
45565 SDLoc DL(Extract);
45566 EVT MatchVT = Match.getValueType();
45567 unsigned NumElts = MatchVT.getVectorNumElements();
45568 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45569 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45570 LLVMContext &Ctx = *DAG.getContext();
45571
45572 if (ExtractVT == MVT::i1) {
45573 // Special case for (pre-legalization) vXi1 reductions.
45574 if (NumElts > 64 || !isPowerOf2_32(NumElts))
45575 return SDValue();
45576 if (Match.getOpcode() == ISD::SETCC) {
45577 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45578 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45579 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45580 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45581 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45582 X86::CondCode X86CC;
45583 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45584 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45585 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45586 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45587 DAG, X86CC))
45588 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45589 getSETCC(X86CC, V, DL, DAG));
45590 }
45591 }
45592 if (TLI.isTypeLegal(MatchVT)) {
45593 // If this is a legal AVX512 predicate type then we can just bitcast.
45594 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45595 Movmsk = DAG.getBitcast(MovmskVT, Match);
45596 } else {
45597 // Use combineBitcastvxi1 to create the MOVMSK.
45598 while (NumElts > MaxElts) {
45599 SDValue Lo, Hi;
45600 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45601 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45602 NumElts /= 2;
45603 }
45604 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45605 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45606 }
45607 if (!Movmsk)
45608 return SDValue();
45609 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45610 } else {
45611 // FIXME: Better handling of k-registers or 512-bit vectors?
45612 unsigned MatchSizeInBits = Match.getValueSizeInBits();
45613 if (!(MatchSizeInBits == 128 ||
45614 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45615 return SDValue();
45616
45617 // Make sure this isn't a vector of 1 element. The perf win from using
45618 // MOVMSK diminishes with fewer elements in the reduction, but it is
45619 // generally better to get the comparison over to the GPRs as soon as
45620 // possible to reduce the number of vector ops.
45621 if (Match.getValueType().getVectorNumElements() < 2)
45622 return SDValue();
45623
45624 // Check that we are extracting a reduction of all sign bits.
45625 if (DAG.ComputeNumSignBits(Match) != BitWidth)
45626 return SDValue();
45627
45628 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45629 SDValue Lo, Hi;
45630 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45631 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45632 MatchSizeInBits = Match.getValueSizeInBits();
45633 }
45634
45635 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45636 MVT MaskSrcVT;
45637 if (64 == BitWidth || 32 == BitWidth)
45638 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45639 MatchSizeInBits / BitWidth);
45640 else
45641 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45642
45643 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45644 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45645 NumElts = MaskSrcVT.getVectorNumElements();
45646 }
45647 assert((NumElts <= 32 || NumElts == 64) &&
45648 "Not expecting more than 64 elements");
45649
45650 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45651 if (BinOp == ISD::XOR) {
45652 // parity -> (PARITY(MOVMSK X))
45653 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45654 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45655 }
45656
45657 SDValue CmpC;
45658 ISD::CondCode CondCode;
45659 if (BinOp == ISD::OR) {
45660 // any_of -> MOVMSK != 0
45661 CmpC = DAG.getConstant(0, DL, CmpVT);
45662 CondCode = ISD::CondCode::SETNE;
45663 } else {
45664 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45665 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45666 DL, CmpVT);
45667 CondCode = ISD::CondCode::SETEQ;
45668 }
45669
45670 // The setcc produces an i8 of 0/1, so extend that to the result width and
45671 // negate to get the final 0/-1 mask value.
45672 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45673 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45674 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45675 return DAG.getNegative(Zext, DL, ExtractVT);
45676}
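// For illustration, once the MOVMSK is formed the reduction becomes a scalar
// compare, e.g. for a 4-element reduction (roughly):
//   all_of -> (movmsk == 0b1111) ? -1 : 0
//   any_of -> (movmsk != 0)      ? -1 : 0
// and the XOR (parity) case is lowered to a scalar PARITY of the mask bits.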
45677
45678 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45679 const X86Subtarget &Subtarget) {
45680 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45681 return SDValue();
45682
45683 EVT ExtractVT = Extract->getValueType(0);
45684 // Verify the type we're extracting is i32, as the output element type of
45685 // vpdpbusd is i32.
45686 if (ExtractVT != MVT::i32)
45687 return SDValue();
45688
45689 EVT VT = Extract->getOperand(0).getValueType();
45690 if (!isPowerOf2_32(VT.getVectorNumElements()))
45691 return SDValue();
45692
45693 // Match shuffle + add pyramid.
45694 ISD::NodeType BinOp;
45695 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45696
45697 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45698 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
45699 // before adding into the accumulator.
45700 // TODO:
45701 // We also need to verify that the multiply has at least 2x the number of bits
45702 // of the input. We shouldn't match
45703 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
45704 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45705 // Root = Root.getOperand(0);
45706
45707 // If there was a match, we want Root to be a mul.
45708 if (!Root || Root.getOpcode() != ISD::MUL)
45709 return SDValue();
45710
45711 // Check whether we have an extend and mul pattern
45712 SDValue LHS, RHS;
45713 if (!detectExtMul(DAG, Root, LHS, RHS))
45714 return SDValue();
45715
45716 // Create the dot product instruction.
45717 SDLoc DL(Extract);
45718 unsigned StageBias;
45719 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45720
45721 // If the original vector was wider than 4 elements, sum over the results
45722 // in the DP vector.
45723 unsigned Stages = Log2_32(VT.getVectorNumElements());
45724 EVT DpVT = DP.getValueType();
45725
45726 if (Stages > StageBias) {
45727 unsigned DpElems = DpVT.getVectorNumElements();
45728
45729 for (unsigned i = Stages - StageBias; i > 0; --i) {
45730 SmallVector<int, 16> Mask(DpElems, -1);
45731 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45732 Mask[j] = MaskEnd + j;
45733
45734 SDValue Shuffle =
45735 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45736 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45737 }
45738 }
45739
45740 // Return the lowest ExtractSizeInBits bits.
45741 EVT ResVT =
45742 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45743 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45744 DP = DAG.getBitcast(ResVT, DP);
45745 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45746 Extract->getOperand(1));
45747}
45748
45749 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45750 const X86Subtarget &Subtarget) {
45751 // PSADBW is only supported on SSE2 and up.
45752 if (!Subtarget.hasSSE2())
45753 return SDValue();
45754
45755 EVT ExtractVT = Extract->getValueType(0);
45756 // Verify the type we're extracting is either i32 or i64.
45757 // FIXME: Could support other types, but this is what we have coverage for.
45758 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45759 return SDValue();
45760
45761 EVT VT = Extract->getOperand(0).getValueType();
45762 if (!isPowerOf2_32(VT.getVectorNumElements()))
45763 return SDValue();
45764
45765 // Match shuffle + add pyramid.
45766 ISD::NodeType BinOp;
45767 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45768
45769 // The operand is expected to be zero extended from i8
45770 // (verified in detectZextAbsDiff).
45771 // In order to convert to i64 and above, an additional any/zero/sign
45772 // extend is expected.
45773 // The zero extend from 32 bits has no mathematical effect on the result.
45774 // Also, the sign extend is effectively a zero extend
45775 // (it extends the sign bit, which is zero).
45776 // So it is correct to skip the sign/zero extend instruction.
45777 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
45778 Root.getOpcode() == ISD::ZERO_EXTEND ||
45779 Root.getOpcode() == ISD::ANY_EXTEND))
45780 Root = Root.getOperand(0);
45781
45782 // If there was a match, we want Root to be the ABS node at the root of an
45783 // abs-diff pattern.
45784 if (!Root || Root.getOpcode() != ISD::ABS)
45785 return SDValue();
45786
45787 // Check whether we have an abs-diff pattern feeding into the ABS node.
45788 SDValue Zext0, Zext1;
45789 if (!detectZextAbsDiff(Root, Zext0, Zext1))
45790 return SDValue();
45791
45792 // Create the SAD instruction.
45793 SDLoc DL(Extract);
45794 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45795
45796 // If the original vector was wider than 8 elements, sum over the results
45797 // in the SAD vector.
45798 unsigned Stages = Log2_32(VT.getVectorNumElements());
45799 EVT SadVT = SAD.getValueType();
45800 if (Stages > 3) {
45801 unsigned SadElems = SadVT.getVectorNumElements();
45802
45803 for(unsigned i = Stages - 3; i > 0; --i) {
45804 SmallVector<int, 16> Mask(SadElems, -1);
45805 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45806 Mask[j] = MaskEnd + j;
45807
45808 SDValue Shuffle =
45809 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45810 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45811 }
45812 }
45813
45814 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45815 // Return the lowest ExtractSizeInBits bits.
45816 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45817 SadVT.getSizeInBits() / ExtractSizeInBits);
45818 SAD = DAG.getBitcast(ResVT, SAD);
45819 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45820 Extract->getOperand(1));
45821}
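// For illustration, the shuffle+add pyramid above halves the live partial
// sums each step, e.g. for a 16-element zext'd i8 source the PSADBW result is
// v2i64 and Stages == 4, so a single extra shuffle+add folds the upper i64
// partial sum into element 0 before the final extract.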
45822
45823// If this extract is from a loaded vector value and will be used as an
45824// integer, that requires a potentially expensive XMM -> GPR transfer.
45825// Additionally, if we can convert to a scalar integer load, that will likely
45826// be folded into a subsequent integer op.
45827// Note: SrcVec might not have a VecVT type, but it must be the same size.
45828// Note: Unlike the related fold for this in DAGCombiner, this is not limited
45829// to a single-use of the loaded vector. For the reasons above, we
45830// expect this to be profitable even if it creates an extra load.
45831static SDValue
45832 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
45833 const SDLoc &dl, SelectionDAG &DAG,
45834 TargetLowering::DAGCombinerInfo &DCI) {
45835 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45836 "Only EXTRACT_VECTOR_ELT supported so far");
45837
45838 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45839 EVT VT = N->getValueType(0);
45840
45841 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
45842 return Use->getOpcode() == ISD::STORE ||
45843 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45844 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45845 });
45846
45847 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
45848 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45849 VecVT.getVectorElementType() == VT &&
45850 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
45851 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
45852 SDValue NewPtr = TLI.getVectorElementPointer(
45853 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
45854 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
45855 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45856 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45857 SDValue Load =
45858 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45859 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45860 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45861 return Load;
45862 }
45863
45864 return SDValue();
45865}
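// For illustration, extracting element 2 of a loaded v4i32 as an i32 becomes,
// roughly, a plain 4-byte scalar load at (base + 8); reusing the original
// load's chain via makeEquivalentMemoryOrdering keeps the memory ordering
// intact while avoiding the XMM -> GPR transfer.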
45866
45867// Attempt to peek through a target shuffle and extract the scalar from the
45868// source.
45869 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
45870 TargetLowering::DAGCombinerInfo &DCI,
45871 const X86Subtarget &Subtarget) {
45872 if (DCI.isBeforeLegalizeOps())
45873 return SDValue();
45874
45875 SDLoc dl(N);
45876 SDValue Src = N->getOperand(0);
45877 SDValue Idx = N->getOperand(1);
45878
45879 EVT VT = N->getValueType(0);
45880 EVT SrcVT = Src.getValueType();
45881 EVT SrcSVT = SrcVT.getVectorElementType();
45882 unsigned SrcEltBits = SrcSVT.getSizeInBits();
45883 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45884
45885 // Don't attempt this for boolean mask vectors or unknown extraction indices.
45886 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45887 return SDValue();
45888
45889 const APInt &IdxC = N->getConstantOperandAPInt(1);
45890 if (IdxC.uge(NumSrcElts))
45891 return SDValue();
45892
45893 SDValue SrcBC = peekThroughBitcasts(Src);
45894
45895 // Handle extract(bitcast(broadcast(scalar_value))).
45896 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45897 SDValue SrcOp = SrcBC.getOperand(0);
45898 EVT SrcOpVT = SrcOp.getValueType();
45899 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45900 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45901 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45902 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45903 // TODO support non-zero offsets.
45904 if (Offset == 0) {
45905 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45906 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45907 return SrcOp;
45908 }
45909 }
45910 }
45911
45912 // If we're extracting a single element from a broadcast load and there are
45913 // no other users, just create a single load.
45914 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45915 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45916 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45917 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45918 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45919 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45920 MemIntr->getBasePtr(),
45921 MemIntr->getPointerInfo(),
45922 MemIntr->getOriginalAlign(),
45923 MemIntr->getMemOperand()->getFlags());
45924 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45925 return Load;
45926 }
45927 }
45928
45929 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45930 // TODO: Move to DAGCombine?
45931 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45932 SrcBC.getValueType().isInteger() &&
45933 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45934 SrcBC.getScalarValueSizeInBits() ==
45935 SrcBC.getOperand(0).getValueSizeInBits()) {
45936 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45937 if (IdxC.ult(Scale)) {
45938 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45939 SDValue Scl = SrcBC.getOperand(0);
45940 EVT SclVT = Scl.getValueType();
45941 if (Offset) {
45942 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
45943 DAG.getShiftAmountConstant(Offset, SclVT, dl));
45944 }
45945 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
45946 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
45947 return Scl;
45948 }
45949 }
45950
45951 // Handle extract(truncate(x)) for 0'th index.
45952 // TODO: Treat this as a faux shuffle?
45953 // TODO: When can we use this for general indices?
45954 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
45955 (SrcVT.getSizeInBits() % 128) == 0) {
45956 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
45957 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
45958 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45959 Idx);
45960 }
45961
45962 // We can only legally extract other elements from 128-bit vectors and in
45963 // certain circumstances, depending on SSE-level.
45964 // TODO: Investigate float/double extraction if it will be just stored.
45965 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
45966 unsigned Idx) {
45967 EVT VecSVT = VecVT.getScalarType();
45968 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
45969 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
45970 VecSVT == MVT::i64)) {
45971 unsigned EltSizeInBits = VecSVT.getSizeInBits();
45972 unsigned NumEltsPerLane = 128 / EltSizeInBits;
45973 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45974 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
45975 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
45976 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
45977 Idx &= (NumEltsPerLane - 1);
45978 }
45979 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
45980 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
45981 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
45982 DAG.getBitcast(VecVT, Vec),
45983 DAG.getVectorIdxConstant(Idx, dl));
45984 }
45985 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
45986 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
45987 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
45988 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
45989 DAG.getTargetConstant(Idx, dl, MVT::i8));
45990 }
45991 return SDValue();
45992 };
45993
45994 // Resolve the target shuffle inputs and mask.
45995 SmallVector<int, 16> Mask;
45996 SmallVector<SDValue, 2> Ops;
45997 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
45998 return SDValue();
45999
46000 // Shuffle inputs must be the same size as the result.
46001 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46002 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46003 }))
46004 return SDValue();
46005
46006 // Attempt to narrow/widen the shuffle mask to the correct size.
46007 if (Mask.size() != NumSrcElts) {
46008 if ((NumSrcElts % Mask.size()) == 0) {
46009 SmallVector<int, 16> ScaledMask;
46010 int Scale = NumSrcElts / Mask.size();
46011 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46012 Mask = std::move(ScaledMask);
46013 } else if ((Mask.size() % NumSrcElts) == 0) {
46014 // Simplify Mask based on demanded element.
46015 int ExtractIdx = (int)IdxC.getZExtValue();
46016 int Scale = Mask.size() / NumSrcElts;
46017 int Lo = Scale * ExtractIdx;
46018 int Hi = Scale * (ExtractIdx + 1);
46019 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46020 if (i < Lo || Hi <= i)
46021 Mask[i] = SM_SentinelUndef;
46022
46023 SmallVector<int, 16> WidenedMask;
46024 while (Mask.size() > NumSrcElts &&
46025 canWidenShuffleElements(Mask, WidenedMask))
46026 Mask = std::move(WidenedMask);
46027 }
46028 }
46029
46030 // If narrowing/widening failed, see if we can extract+zero-extend.
46031 int ExtractIdx;
46032 EVT ExtractVT;
46033 if (Mask.size() == NumSrcElts) {
46034 ExtractIdx = Mask[IdxC.getZExtValue()];
46035 ExtractVT = SrcVT;
46036 } else {
46037 unsigned Scale = Mask.size() / NumSrcElts;
46038 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46039 return SDValue();
46040 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46041 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46042 return SDValue();
46043 ExtractIdx = Mask[ScaledIdx];
46044 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46045 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46046 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46047 "Failed to widen vector type");
46048 }
46049
46050 // If the shuffle source element is undef/zero then we can just accept it.
46051 if (ExtractIdx == SM_SentinelUndef)
46052 return DAG.getUNDEF(VT);
46053
46054 if (ExtractIdx == SM_SentinelZero)
46055 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46056 : DAG.getConstant(0, dl, VT);
46057
46058 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46059 ExtractIdx = ExtractIdx % Mask.size();
46060 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46061 return DAG.getZExtOrTrunc(V, dl, VT);
46062
46063 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46064 if (SDValue V = combineExtractFromVectorLoad(
46065 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46066 return V;
46067
46068 return SDValue();
46069}
46070
46071/// Extracting a scalar FP value from vector element 0 is free, so extract each
46072/// operand first, then perform the math as a scalar op.
46073 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46074 const X86Subtarget &Subtarget,
46075 TargetLowering::DAGCombinerInfo &DCI) {
46076 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46077 SDValue Vec = ExtElt->getOperand(0);
46078 SDValue Index = ExtElt->getOperand(1);
46079 EVT VT = ExtElt->getValueType(0);
46080 EVT VecVT = Vec.getValueType();
46081
46082 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46083 // non-zero element because the shuffle+scalar op will be cheaper?
46084 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46085 return SDValue();
46086
46087 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46088 // extract, the condition code), so deal with those as a special-case.
46089 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46090 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46091 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46092 return SDValue();
46093
46094 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46095 SDLoc DL(ExtElt);
46096 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46097 Vec.getOperand(0), Index);
46098 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46099 Vec.getOperand(1), Index);
46100 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46101 }
46102
46103 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46104 VT != MVT::f64)
46105 return SDValue();
46106
46107 // Vector FP selects don't fit the pattern of FP math ops (because the
46108 // condition has a different type and we have to change the opcode), so deal
46109 // with those here.
46110 // FIXME: This is restricted to pre type legalization. If we loosen this we
46111 // need to convert vector bool to a scalar bool.
46112 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46113 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46114 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
46115 assert(Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
46116 "Unexpected cond type for combine");
46117 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46118 SDLoc DL(ExtElt);
46119 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46120 Vec.getOperand(0).getValueType().getScalarType(),
46121 Vec.getOperand(0), Index);
46122 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46123 Vec.getOperand(1), Index);
46124 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46125 Vec.getOperand(2), Index);
46126 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46127 }
46128
46129 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46130 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46131 // missed load folding and fma+fneg combining.
46132 switch (Vec.getOpcode()) {
46133 case ISD::FMA: // Begin 3 operands
46134 case ISD::FMAD:
46135 case ISD::FADD: // Begin 2 operands
46136 case ISD::FSUB:
46137 case ISD::FMUL:
46138 case ISD::FDIV:
46139 case ISD::FREM:
46140 case ISD::FCOPYSIGN:
46141 case ISD::FMINNUM:
46142 case ISD::FMAXNUM:
46143 case ISD::FMINNUM_IEEE:
46144 case ISD::FMAXNUM_IEEE:
46145 case ISD::FMAXIMUM:
46146 case ISD::FMINIMUM:
46147 case ISD::FMAXIMUMNUM:
46148 case ISD::FMINIMUMNUM:
46149 case X86ISD::FMAX:
46150 case X86ISD::FMIN:
46151 case ISD::FABS: // Begin 1 operand
46152 case ISD::FSQRT:
46153 case ISD::FRINT:
46154 case ISD::FCEIL:
46155 case ISD::FTRUNC:
46156 case ISD::FNEARBYINT:
46157 case ISD::FROUNDEVEN:
46158 case ISD::FROUND:
46159 case ISD::FFLOOR:
46160 case X86ISD::FRCP:
46161 case X86ISD::FRSQRT: {
46162 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46163 SDLoc DL(ExtElt);
46164 SmallVector<SDValue, 4> ExtOps;
46165 for (SDValue Op : Vec->ops())
46166 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46167 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46168 }
46169 default:
46170 return SDValue();
46171 }
46172 llvm_unreachable("All opcodes should return within switch");
46173}
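// For illustration, scalarizeExtEltFP rewrites, roughly:
//   (f32 (extractelt (fadd v4f32 X, Y), 0))
//     --> (fadd (f32 (extractelt X, 0)), (f32 (extractelt Y, 0)))
// since extracting lane 0 of an XMM register is free, turning a vector FP op
// into the cheaper scalar form.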
46174
46175/// Try to convert a vector reduction sequence composed of binops and shuffles
46176/// into horizontal ops.
46177 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46178 const X86Subtarget &Subtarget) {
46179 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46180
46181 // We need at least SSE2 to do anything here.
46182 if (!Subtarget.hasSSE2())
46183 return SDValue();
46184
46185 ISD::NodeType Opc;
46186 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46187 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46188 if (!Rdx)
46189 return SDValue();
46190
46191 SDValue Index = ExtElt->getOperand(1);
46192 assert(isNullConstant(Index) &&
46193 "Reduction doesn't end in an extract from index 0");
46194
46195 EVT VT = ExtElt->getValueType(0);
46196 EVT VecVT = Rdx.getValueType();
46197 if (VecVT.getScalarType() != VT)
46198 return SDValue();
46199
46200 SDLoc DL(ExtElt);
46201 unsigned NumElts = VecVT.getVectorNumElements();
46202 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46203
46204 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46205 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46206 if (V.getValueType() == MVT::v4i8) {
46207 if (ZeroExtend && Subtarget.hasSSE41()) {
46208 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46209 DAG.getConstant(0, DL, MVT::v4i32),
46210 DAG.getBitcast(MVT::i32, V),
46211 DAG.getVectorIdxConstant(0, DL));
46212 return DAG.getBitcast(MVT::v16i8, V);
46213 }
46214 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46215 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46216 : DAG.getUNDEF(MVT::v4i8));
46217 }
46218 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46219 DAG.getUNDEF(MVT::v8i8));
46220 };
46221
46222 // vXi8 mul reduction - promote to vXi16 mul reduction.
46223 if (Opc == ISD::MUL) {
46224 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
46225 return SDValue();
46226 if (VecVT.getSizeInBits() >= 128) {
46227 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
46228 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46229 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46230 Lo = DAG.getBitcast(WideVT, Lo);
46231 Hi = DAG.getBitcast(WideVT, Hi);
46232 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
46233 while (Rdx.getValueSizeInBits() > 128) {
46234 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46235 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
46236 }
46237 } else {
46238 Rdx = WidenToV16I8(Rdx, false);
46239 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
46240 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
46241 }
46242 if (NumElts >= 8)
46243 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46244 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46245 {4, 5, 6, 7, -1, -1, -1, -1}));
46246 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46247 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46248 {2, 3, -1, -1, -1, -1, -1, -1}));
46249 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46250 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46251 {1, -1, -1, -1, -1, -1, -1, -1}));
46252 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46253 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46254 }
46255
46256 // vXi8 add reduction - sub 128-bit vector.
46257 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
46258 Rdx = WidenToV16I8(Rdx, true);
46259 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46260 DAG.getConstant(0, DL, MVT::v16i8));
46261 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46262 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46263 }
46264
46265 // Must be a >=128-bit vector with pow2 elements.
46266 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
46267 return SDValue();
46268
46269 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
46270 if (VT == MVT::i8) {
46271 while (Rdx.getValueSizeInBits() > 128) {
46272 SDValue Lo, Hi;
46273 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46274 VecVT = Lo.getValueType();
46275 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
46276 }
46277 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
46278
46279 SDValue Hi = DAG.getVectorShuffle(
46280 MVT::v16i8, DL, Rdx, Rdx,
46281 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
46282 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
46283 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46284 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
46285 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46286 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46287 }
46288
46289 // See if we can use vXi8 PSADBW add reduction for larger zext types.
46290 // If the source vector values are 0-255, then we can use PSADBW to
46291 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
46292 // TODO: See if it's worth avoiding vXi16/i32 truncations?
46293 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
46294 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
46295 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
46296 Subtarget.hasAVX512())) {
46297 if (Rdx.getValueType() == MVT::v8i16) {
46298 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
46299 DAG.getUNDEF(MVT::v8i16));
46300 } else {
46301 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
46302 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
46303 if (ByteVT.getSizeInBits() < 128)
46304 Rdx = WidenToV16I8(Rdx, true);
46305 }
46306
46307 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46308 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46309 ArrayRef<SDValue> Ops) {
46310 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46311 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
46312 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
46313 };
46314 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
46315 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
46316
46317 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
46318 while (Rdx.getValueSizeInBits() > 128) {
46319 SDValue Lo, Hi;
46320 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46321 VecVT = Lo.getValueType();
46322 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
46323 }
46324 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
46325
46326 if (NumElts > 8) {
46327 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
46328 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
46329 }
46330
46331 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
46332 Rdx = DAG.getBitcast(VecVT, Rdx);
46333 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46334 }
46335
46336 // Only use (F)HADD opcodes if they aren't microcoded or if it minimizes codesize.
46337 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
46338 return SDValue();
46339
46340 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
46341
46342 // 256-bit horizontal instructions operate on 128-bit chunks rather than
46343 // across the whole vector, so we need an extract + hop preliminary stage.
46344 // This is the only step where the operands of the hop are not the same value.
46345 // TODO: We could extend this to handle 512-bit or even longer vectors.
46346 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
46347 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
46348 unsigned NumElts = VecVT.getVectorNumElements();
46349 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
46350 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
46351 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
46352 VecVT = Rdx.getValueType();
46353 }
46354 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
46355 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
46356 return SDValue();
46357
46358 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
46359 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
46360 for (unsigned i = 0; i != ReductionSteps; ++i)
46361 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
46362
46363 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46364}
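// For illustration, the PSADBW shortcut above relies on the elements being
// known to fit in a byte: e.g. a v8i16 add-reduction of values <= 255 is
// packed down to v16i8 and summed with a single PSADBW against zero rather
// than three shuffle+add stages.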
46365
46366/// Detect vector gather/scatter index generation and convert it from being a
46367/// bunch of shuffles and extracts into a somewhat faster sequence.
46368/// For i686, the best sequence is apparently storing the value and loading
46369/// scalars back, while for x64 we should use 64-bit extracts and shifts.
46370 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
46371 TargetLowering::DAGCombinerInfo &DCI,
46372 const X86Subtarget &Subtarget) {
46373 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
46374 return NewOp;
46375
46376 SDValue InputVector = N->getOperand(0);
46377 SDValue EltIdx = N->getOperand(1);
46378 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
46379
46380 EVT SrcVT = InputVector.getValueType();
46381 EVT VT = N->getValueType(0);
46382 SDLoc dl(InputVector);
46383 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
46384 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46385 unsigned NumEltBits = VT.getScalarSizeInBits();
46386 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46387
46388 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
46389 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46390
46391 // Integer Constant Folding.
46392 if (CIdx && VT.isInteger()) {
46393 APInt UndefVecElts;
46394 SmallVector<APInt, 16> EltBits;
46395 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
46396 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
46397 EltBits, /*AllowWholeUndefs*/ true,
46398 /*AllowPartialUndefs*/ false)) {
46399 uint64_t Idx = CIdx->getZExtValue();
46400 if (UndefVecElts[Idx])
46401 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46402 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
46403 }
46404
46405 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
46406 // Improves lowering of bool masks in Rust, which splits them into a byte array.
46407 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
46408 SDValue Src = peekThroughBitcasts(InputVector);
46409 if (Src.getValueType().getScalarType() == MVT::i1 &&
46410 TLI.isTypeLegal(Src.getValueType())) {
46411 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
46412 SDValue Sub = DAG.getNode(
46413 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
46414 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
46415 return DAG.getBitcast(VT, Sub);
46416 }
46417 }
46418 }
46419
46420 if (IsPextr) {
46421 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
46422 DCI))
46423 return SDValue(N, 0);
46424
46425 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
46426 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
46427 InputVector.getOpcode() == X86ISD::PINSRW) &&
46428 InputVector.getOperand(2) == EltIdx) {
46429 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
46430 "Vector type mismatch");
46431 SDValue Scl = InputVector.getOperand(1);
46432 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
46433 return DAG.getZExtOrTrunc(Scl, dl, VT);
46434 }
46435
46436 // TODO - Remove this once we can handle the implicit zero-extension of
46437 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
46438 // combineBasicSADPattern.
46439 return SDValue();
46440 }
46441
46442 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
46443 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
46444 InputVector.getOpcode() == ISD::BITCAST &&
46445 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46446 isNullConstant(EltIdx) && InputVector.hasOneUse())
46447 return DAG.getBitcast(VT, InputVector);
46448
46449 // Detect mmx to i32 conversion through a v2i32 elt extract.
46450 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
46451 InputVector.getOpcode() == ISD::BITCAST &&
46452 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46453 isNullConstant(EltIdx) && InputVector.hasOneUse())
46454 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
46455 InputVector.getOperand(0));
46456
46457 // Check whether this extract is the root of a sum of absolute differences
46458 // pattern. This has to be done here because we really want it to happen
46459 // pre-legalization.
46460 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
46461 return SAD;
46462
46463 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
46464 return VPDPBUSD;
46465
46466 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
46467 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
46468 return Cmp;
46469
46470 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
46471 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
46472 return MinMax;
46473
46474 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
46475 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
46476 return V;
46477
46478 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
46479 return V;
46480
46481 if (CIdx)
46482 if (SDValue V = combineExtractFromVectorLoad(
46483 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
46484 dl, DAG, DCI))
46485 return V;
46486
46487 // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
46488 // and then testing the relevant element.
46489 //
46490 // Note that we only combine extracts on the *same* result number, i.e.
46491 // t0 = merge_values a0, a1, a2, a3
46492 // i1 = extract_vector_elt t0, Constant:i64<2>
46493 // i1 = extract_vector_elt t0, Constant:i64<3>
46494 // but not
46495 // i1 = extract_vector_elt t0:1, Constant:i64<2>
46496 // since the latter would need its own MOVMSK.
46497 if (SrcVT.getScalarType() == MVT::i1) {
46498 bool IsVar = !CIdx;
46499 SmallVector<SDNode *, 16> BoolExtracts;
46500 unsigned ResNo = InputVector.getResNo();
46501 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
46502 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46503 Use->getOperand(0).getResNo() == ResNo &&
46504 Use->getValueType(0) == MVT::i1) {
46505 BoolExtracts.push_back(Use);
46506 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
46507 return true;
46508 }
46509 return false;
46510 };
46511 // TODO: Can we drop the oneuse check for constant extracts?
46512 if (all_of(InputVector->users(), IsBoolExtract) &&
46513 (IsVar || BoolExtracts.size() > 1)) {
46514 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
46515 if (SDValue BC =
46516 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
46517 for (SDNode *Use : BoolExtracts) {
46518 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
46519 // Mask = 1 << MaskIdx
46520 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
46521 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
46522 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
46523 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
46524 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
46525 DCI.CombineTo(Use, Res);
46526 }
46527 return SDValue(N, 0);
46528 }
46529 }
46530 }
46531
46532 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
46533 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
46534 SDValue TruncSrc = InputVector.getOperand(0);
46535 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
46536 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
46537 SDValue NewExt =
46538 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
46539 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
46540 }
46541 }
46542
46543 return SDValue();
46544}
46545
46546// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
46547// This is more or less the reverse of combineBitcastvxi1.
46548 static SDValue combineToExtendBoolVectorInReg(
46549 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
46550 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
46551 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
46552 Opcode != ISD::ANY_EXTEND)
46553 return SDValue();
46554 if (!DCI.isBeforeLegalizeOps())
46555 return SDValue();
46556 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46557 return SDValue();
46558
46559 EVT SVT = VT.getScalarType();
46560 EVT InSVT = N0.getValueType().getScalarType();
46561 unsigned EltSizeInBits = SVT.getSizeInBits();
46562
46563 // Input type must be extending a bool vector (bit-casted from a scalar
46564 // integer) to legal integer types.
46565 if (!VT.isVector())
46566 return SDValue();
46567 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
46568 return SDValue();
46569 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
46570 return SDValue();
46571
46572 SDValue N00 = N0.getOperand(0);
46573 EVT SclVT = N00.getValueType();
46574 if (!SclVT.isScalarInteger())
46575 return SDValue();
46576
46577 SDValue Vec;
46578 SmallVector<int> ShuffleMask;
46579 unsigned NumElts = VT.getVectorNumElements();
46580 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
46581
46582 // Broadcast the scalar integer to the vector elements.
46583 if (NumElts > EltSizeInBits) {
46584 // If the scalar integer is greater than the vector element size, then we
46585 // must split it down into sub-sections for broadcasting. For example:
46586 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46587 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46588 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46589 unsigned Scale = NumElts / EltSizeInBits;
46590 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
46591 bool UseBroadcast = Subtarget.hasInt256() &&
46592 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
46593 Vec = UseBroadcast
46594 ? DAG.getSplat(BroadcastVT, DL, N00)
46595 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46596 Vec = DAG.getBitcast(VT, Vec);
46597
46598 for (unsigned i = 0; i != Scale; ++i) {
46599 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
46600 ShuffleMask.append(EltSizeInBits, i + Offset);
46601 }
46602 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46603 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46604 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46605 // If we have register broadcast instructions, use the scalar size as the
46606 // element type for the shuffle. Then cast to the wider element type. The
46607 // widened bits won't be used, and this might allow the use of a broadcast
46608 // load.
46609 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46610 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
46611 (NumElts * EltSizeInBits) / NumElts);
46612 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
46613 } else {
46614 // For a smaller scalar integer, we can simply any-extend it to the vector
46615 // element size (we don't care about the upper bits) and broadcast it to all
46616 // elements.
46617 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
46618 }
46619
46620 // Now, mask the relevant bit in each element.
46621 SmallVector<SDValue, 32> Bits;
46622 for (unsigned i = 0; i != NumElts; ++i) {
46623 int BitIdx = (i % EltSizeInBits);
46624 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46625 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46626 }
46627 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46628 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46629
46630 // Compare against the bitmask and extend the result.
46631 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
46632 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46633 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46634
46635 // For SEXT, this is now done, otherwise shift the result down for
46636 // zero-extension.
46637 if (Opcode == ISD::SIGN_EXTEND)
46638 return Vec;
46639 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46640 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46641}
46642
46643/// If a vector select has an operand that is -1 or 0, try to simplify the
46644/// select to a bitwise logic operation.
46645/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
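/// E.g. with a sign-splat condition (every element all-ones or all-zeros):
///   vselect Cond, -1, X --> or Cond, X
///   vselect Cond, X,  0 --> and Cond, X
///   vselect Cond, 0,  X --> andn Cond, X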
46646static SDValue
46647 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
46648 TargetLowering::DAGCombinerInfo &DCI,
46649 const X86Subtarget &Subtarget) {
46650 SDValue Cond = N->getOperand(0);
46651 SDValue LHS = N->getOperand(1);
46652 SDValue RHS = N->getOperand(2);
46653 EVT VT = LHS.getValueType();
46654 EVT CondVT = Cond.getValueType();
46655 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46656
46657 if (N->getOpcode() != ISD::VSELECT)
46658 return SDValue();
46659
46660 assert(CondVT.isVector() && "Vector select expects a vector selector!");
46661
46662 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46663 // TODO: Can we assert that both operands are not zeros (because that should
46664 // get simplified at node creation time)?
46665 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46666 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46667
46668 // If both inputs are 0/undef, create a complete zero vector.
46669 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46670 if (TValIsAllZeros && FValIsAllZeros) {
46671 if (VT.isFloatingPoint())
46672 return DAG.getConstantFP(0.0, DL, VT);
46673 return DAG.getConstant(0, DL, VT);
46674 }
46675
46676 // To use the condition operand as a bitwise mask, it must have elements that
46677 // are the same size as the select elements. I.e., the condition operand must
46678 // have already been promoted from the IR select condition type <N x i1>.
46679 // Don't check if the types themselves are equal because that excludes
46680 // vector floating-point selects.
46681 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46682 return SDValue();
46683
46684 // Try to invert the condition if true value is not all 1s and false value is
46685 // not all 0s. Only do this if the condition has one use.
46686 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46687 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46688 // Check if the selector will be produced by CMPP*/PCMP*.
46689 Cond.getOpcode() == ISD::SETCC &&
46690 // Check if SETCC has already been promoted.
46691 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46692 CondVT) {
46693 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46694
46695 if (TValIsAllZeros || FValIsAllOnes) {
46696 SDValue CC = Cond.getOperand(2);
46697 ISD::CondCode NewCC = ISD::getSetCCInverse(
46698 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46699 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46700 NewCC);
46701 std::swap(LHS, RHS);
46702 TValIsAllOnes = FValIsAllOnes;
46703 FValIsAllZeros = TValIsAllZeros;
46704 }
46705 }
46706
46707 // Cond value must be 'sign splat' to be converted to a logical op.
46708 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46709 return SDValue();
46710
46711 // vselect Cond, 111..., 000... -> Cond
46712 if (TValIsAllOnes && FValIsAllZeros)
46713 return DAG.getBitcast(VT, Cond);
46714
46715 if (!TLI.isTypeLegal(CondVT))
46716 return SDValue();
46717
46718 // vselect Cond, 111..., X -> or Cond, X
46719 if (TValIsAllOnes) {
46720 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46721 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46722 return DAG.getBitcast(VT, Or);
46723 }
46724
46725 // vselect Cond, X, 000... -> and Cond, X
46726 if (FValIsAllZeros) {
46727 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46728 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46729 return DAG.getBitcast(VT, And);
46730 }
46731
46732 // vselect Cond, 000..., X -> andn Cond, X
46733 if (TValIsAllZeros) {
46734 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46735 SDValue AndN;
46736 // The canonical form differs for i1 vectors - x86andnp is not used
46737 if (CondVT.getScalarType() == MVT::i1)
46738 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46739 CastRHS);
46740 else
46741 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46742 return DAG.getBitcast(VT, AndN);
46743 }
46744
46745 return SDValue();
46746}
46747
46748/// If both arms of a vector select are concatenated vectors, split the select,
46749/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46750/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46751/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
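/// E.g. a v8f32 vselect whose arms are both concatenations of v4f32 halves is
/// rewritten as two v4f32 selects on the split condition, eliminating the
/// 256-bit blend when the halves are already available.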
46752 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
46753 const X86Subtarget &Subtarget) {
46754 unsigned Opcode = N->getOpcode();
46755 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46756 return SDValue();
46757
46758 // TODO: Split 512-bit vectors too?
46759 EVT VT = N->getValueType(0);
46760 if (!VT.is256BitVector())
46761 return SDValue();
46762
46763 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46764 SDValue Cond = N->getOperand(0);
46765 SDValue TVal = N->getOperand(1);
46766 SDValue FVal = N->getOperand(2);
46767 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46768 !isFreeToSplitVector(TVal.getNode(), DAG) ||
46769 !isFreeToSplitVector(FVal.getNode(), DAG))
46770 return SDValue();
46771
46772 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46773 ArrayRef<SDValue> Ops) {
46774 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46775 };
46776 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
46777 /*CheckBWI*/ false);
46778}
46779
46780 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
46781 const SDLoc &DL) {
46782 SDValue Cond = N->getOperand(0);
46783 SDValue LHS = N->getOperand(1);
46784 SDValue RHS = N->getOperand(2);
46785
46786 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46787 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46788 if (!TrueC || !FalseC)
46789 return SDValue();
46790
46791 // Don't do this for crazy integer types.
46792 EVT VT = N->getValueType(0);
46793 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46794 return SDValue();
46795
46796 // We're going to use the condition bit in math or logic ops. We could allow
46797 // this with a wider condition value (post-legalization it becomes an i8),
46798 // but if nothing is creating selects that late, it doesn't matter.
46799 if (Cond.getValueType() != MVT::i1)
46800 return SDValue();
46801
46802 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46803 // 3, 5, or 9 with i32/i64, so those get transformed too.
46804 // TODO: For constants that overflow or do not differ by power-of-2 or small
46805 // multiplier, convert to 'and' + 'add'.
46806 const APInt &TrueVal = TrueC->getAPIntValue();
46807 const APInt &FalseVal = FalseC->getAPIntValue();
46808
46809 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46810 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46811 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46812 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46813 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46814 return SDValue();
46815 }
46816
46817 bool OV;
46818 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46819 if (OV)
46820 return SDValue();
46821
46822 APInt AbsDiff = Diff.abs();
46823 if (AbsDiff.isPowerOf2() ||
46824 ((VT == MVT::i32 || VT == MVT::i64) &&
46825 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46826
46827 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46828 // of the condition can usually be folded into a compare predicate, but even
46829 // without that, the sequence should be cheaper than a CMOV alternative.
46830 if (TrueVal.slt(FalseVal)) {
46831 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46832 std::swap(TrueC, FalseC);
46833 }
46834
46835 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
46836 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46837
46838 // Multiply condition by the difference if non-one.
46839 if (!AbsDiff.isOne())
46840 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46841
46842 // Add the base if non-zero.
46843 if (!FalseC->isZero())
46844 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46845
46846 return R;
46847 }
46848
46849 return SDValue();
46850}
46851
46852/// If this is a *dynamic* select (non-constant condition) and we can match
46853/// this node with one of the variable blend instructions, restructure the
46854/// condition so that blends can use the high (sign) bit of each element.
46855/// This function will also call SimplifyDemandedBits on already created
46856/// BLENDV to perform additional simplifications.
46857 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46858 const SDLoc &DL,
46859 TargetLowering::DAGCombinerInfo &DCI,
46860 const X86Subtarget &Subtarget) {
46861 SDValue Cond = N->getOperand(0);
46862 if ((N->getOpcode() != ISD::VSELECT &&
46863 N->getOpcode() != X86ISD::BLENDV) ||
46864 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46865 return SDValue();
46866
46867 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46868 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46869 EVT VT = N->getValueType(0);
46870
46871 // We can only handle the cases where VSELECT is directly legal on the
46872 // subtarget. We custom lower VSELECT nodes with constant conditions and
46873 // this makes it hard to see whether a dynamic VSELECT will correctly
46874 // lower, so we both check the operation's status and explicitly handle the
46875 // cases where a *dynamic* blend will fail even though a constant-condition
46876 // blend could be custom lowered.
46877 // FIXME: We should find a better way to handle this class of problems.
46878 // Potentially, we should combine constant-condition vselect nodes
46879 // pre-legalization into shuffles and not mark as many types as custom
46880 // lowered.
46881 if (TLI.getOperationAction(ISD::VSELECT, VT) != TargetLowering::Legal)
46882 return SDValue();
46883 // FIXME: We don't support i16-element blends currently. We could and
46884 // should support them by making *all* the bits in the condition be set
46885 // rather than just the high bit and using an i8-element blend.
46886 if (VT.getVectorElementType() == MVT::i16)
46887 return SDValue();
46888 // Dynamic blending was only available from SSE4.1 onward.
46889 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46890 return SDValue();
46891 // Byte blends are only available in AVX2
46892 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46893 return SDValue();
46894 // There are no 512-bit blend instructions that use sign bits.
46895 if (VT.is512BitVector())
46896 return SDValue();
46897
46898 // Don't optimize before the condition has been transformed to a legal type
46899 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46900 if (BitWidth < 8 || BitWidth > 64)
46901 return SDValue();
46902
46903 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46904 for (SDUse &Use : Cond->uses())
46905 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
46906 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
46907 Use.getOperandNo() != 0)
46908 return false;
46909
46910 return true;
46911 };
46912
46913 APInt DemandedBits(APInt::getSignMask(BitWidth));
46914
46915 if (OnlyUsedAsSelectCond(Cond)) {
46916 KnownBits Known;
46917 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46918 !DCI.isBeforeLegalizeOps());
46919 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46920 return SDValue();
46921
46922 // If we changed the computation somewhere in the DAG, this change will
46923 // affect all users of Cond. Update all the nodes so that we do not use
46924 // the generic VSELECT anymore. Otherwise, we may perform wrong
46925 // optimizations as we messed with the actual expectation for the vector
46926 // boolean values.
46927 for (SDNode *U : Cond->users()) {
46928 if (U->getOpcode() == X86ISD::BLENDV)
46929 continue;
46930
46931 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46932 Cond, U->getOperand(1), U->getOperand(2));
46933 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46934 DCI.AddToWorklist(U);
46935 }
46936 DCI.CommitTargetLoweringOpt(TLO);
46937 return SDValue(N, 0);
46938 }
46939
46940 // Otherwise we can still at least try to simplify multiple use bits.
46941 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46942 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
46943 N->getOperand(1), N->getOperand(2));
46944
46945 return SDValue();
46946}
46947
46948// Try to match:
46949// (or (and (M, (sub 0, X)), (pandn M, X)))
46950// which is a special case of:
46951// (select M, (sub 0, X), X)
46952// Per:
46953// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46954// We know that, if fNegate is 0 or 1:
46955// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46956//
46957// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46958// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46959// ( M ? -X : X) == ((X ^ M ) + (M & 1))
46960// This lets us transform our vselect to:
46961// (add (xor X, M), (and M, 1))
46962// And further to:
46963// (sub (xor X, M), M)
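// Worked example per element: with X = 5 and M = all-ones,
//   xor(5, -1) = -6 and sub(-6, -1) = -5, i.e. -X;
// with M = 0, xor(5, 0) = 5 and sub(5, 0) = 5, i.e. X unchanged.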
46964 static SDValue combineLogicBlendIntoConditionalNegate(
46965 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46966 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46967 EVT MaskVT = Mask.getValueType();
46968 assert(MaskVT.isInteger() &&
46969 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46970 "Mask must be zero/all-bits");
46971
46972 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46973 return SDValue();
46974 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
46975 return SDValue();
46976
46977 auto IsNegV = [](SDNode *N, SDValue V) {
46978 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46979 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46980 };
46981
46982 SDValue V;
46983 if (IsNegV(Y.getNode(), X))
46984 V = X;
46985 else if (IsNegV(X.getNode(), Y))
46986 V = Y;
46987 else
46988 return SDValue();
46989
46990 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46991 SDValue SubOp2 = Mask;
46992
46993 // If the negate was on the false side of the select, then
46994 // the operands of the SUB need to be swapped. PR 27251.
46995 // This is because the pattern being matched above is
46996 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
46997 // but if the pattern matched was
46998 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
46999 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47000 // pattern also needs to be a negation of the replacement pattern above.
47001 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47002 // sub accomplishes the negation of the replacement pattern.
47003 if (V == Y)
47004 std::swap(SubOp1, SubOp2);
47005
47006 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47007 return DAG.getBitcast(VT, Res);
47008}
47009
47010 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47011 const X86Subtarget &Subtarget) {
47012 if (!Subtarget.hasAVX512())
47013 return SDValue();
47014 if (N->getOpcode() != ISD::VSELECT)
47015 return SDValue();
47016
47017 SDValue Cond = N->getOperand(0);
47018 SDValue LHS = N->getOperand(1);
47019 SDValue RHS = N->getOperand(2);
47020
47021 if (canCombineAsMaskOperation(LHS, Subtarget))
47022 return SDValue();
47023
47024 if (!canCombineAsMaskOperation(RHS, Subtarget))
47025 return SDValue();
47026
47027 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
47028 return SDValue();
47029
47030 // Commute LHS and RHS to create opportunity to select mask instruction.
47031 // (vselect M, L, R) -> (vselect ~M, R, L)
47032 ISD::CondCode NewCC =
47033 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
47034 Cond.getOperand(0).getValueType());
47035 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
47036 Cond.getOperand(1), NewCC);
47037 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47038}
47039
47040/// Do target-specific dag combines on SELECT and VSELECT nodes.
47041 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47042 TargetLowering::DAGCombinerInfo &DCI,
47043 const X86Subtarget &Subtarget) {
47044 SDLoc DL(N);
47045 SDValue Cond = N->getOperand(0);
47046 SDValue LHS = N->getOperand(1);
47047 SDValue RHS = N->getOperand(2);
47048
47049 // Try simplification again because we use this function to optimize
47050 // BLENDV nodes that are not handled by the generic combiner.
47051 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47052 return V;
47053
47054 // When avx512 is available the lhs operand of select instruction can be
47055 // folded with mask instruction, while the rhs operand can't. Commute the
47056 // lhs and rhs of the select instruction to create the opportunity of
47057 // folding.
47058 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47059 return V;
47060
47061 EVT VT = LHS.getValueType();
47062 EVT CondVT = Cond.getValueType();
47063 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47064 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47065
47066 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47067 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47068 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47069 if (CondVT.isVector() && CondVT.isInteger() &&
47070 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47071 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47072 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47073 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47074 DL, DAG, Subtarget))
47075 return V;
47076
47077 // Convert vselects with constant condition into shuffles.
47078 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
47079 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
47080 SmallVector<int, 64> Mask;
47081 if (createShuffleMaskFromVSELECT(Mask, Cond,
47082 N->getOpcode() == X86ISD::BLENDV))
47083 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
47084 }
47085
47086 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47087 // by forcing the unselected elements to zero.
47088 // TODO: Can we handle more shuffles with this?
47089 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
47090 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
47091 LHS.hasOneUse() && RHS.hasOneUse()) {
47092 MVT SimpleVT = VT.getSimpleVT();
47093 SmallVector<SDValue, 1> LHSOps, RHSOps;
47094 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
47095 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
47096 getTargetShuffleMask(LHS, true, LHSOps, LHSMask) &&
47097 getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) {
47098 int NumElts = VT.getVectorNumElements();
47099 for (int i = 0; i != NumElts; ++i) {
47100 // getConstVector sets negative shuffle mask values as undef, so ensure
47101 // we hardcode SM_SentinelZero values to zero (0x80).
47102 if (CondMask[i] < NumElts) {
47103 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
47104 RHSMask[i] = 0x80;
47105 } else {
47106 LHSMask[i] = 0x80;
47107 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
47108 }
47109 }
47110 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
47111 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
47112 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
47113 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
47114 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
47115 }
47116 }
47117
47118 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47119 // instructions match the semantics of the common C idiom x<y?x:y but not
47120 // x<=y?x:y, because of how they handle negative zero (which can be
47121 // ignored in unsafe-math mode).
47122 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
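// E.g. (x < y ? x : y) maps directly onto MINPS, but (x <= y ? x : y) does not:
// MINPS returns its second operand when the operands are +0.0 and -0.0 (they
// compare equal) and when either operand is NaN, so the condcode handling below
// must rule out signed zeros and NaNs before forming FMIN/FMAX.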
47123 if ((Cond.getOpcode() == ISD::SETCC ||
47124 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47125 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47126 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47127 (Subtarget.hasSSE2() ||
47128 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47129 bool IsStrict = Cond->isStrictFPOpcode();
47130 ISD::CondCode CC =
47131 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47132 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47133 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47134
47135 unsigned Opcode = 0;
47136 // Check for x CC y ? x : y.
47137 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47138 switch (CC) {
47139 default: break;
47140 case ISD::SETULT:
47141 // Converting this to a min would handle NaNs incorrectly, and swapping
47142 // the operands would cause it to handle comparisons between positive
47143 // and negative zero incorrectly.
47144 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47145 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47146 !(DAG.isKnownNeverZeroFloat(LHS) ||
47147 DAG.isKnownNeverZeroFloat(RHS)))
47148 break;
47149 std::swap(LHS, RHS);
47150 }
47151 Opcode = X86ISD::FMIN;
47152 break;
47153 case ISD::SETOLE:
47154 // Converting this to a min would handle comparisons between positive
47155 // and negative zero incorrectly.
47156 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47157 !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)))
47158 break;
47159 Opcode = X86ISD::FMIN;
47160 break;
47161 case ISD::SETULE:
47162 // Converting this to a min would handle both negative zeros and NaNs
47163 // incorrectly, but we can swap the operands to fix both.
47164 std::swap(LHS, RHS);
47165 [[fallthrough]];
47166 case ISD::SETOLT:
47167 case ISD::SETLT:
47168 case ISD::SETLE:
47169 Opcode = X86ISD::FMIN;
47170 break;
47171
47172 case ISD::SETOGE:
47173 // Converting this to a max would handle comparisons between positive
47174 // and negative zero incorrectly.
47175 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47176 !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)))
47177 break;
47178 Opcode = X86ISD::FMAX;
47179 break;
47180 case ISD::SETUGT:
47181 // Converting this to a max would handle NaNs incorrectly, and swapping
47182 // the operands would cause it to handle comparisons between positive
47183 // and negative zero incorrectly.
47184 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47185 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47186 !(DAG.isKnownNeverZeroFloat(LHS) ||
47187 DAG.isKnownNeverZeroFloat(RHS)))
47188 break;
47189 std::swap(LHS, RHS);
47190 }
47191 Opcode = X86ISD::FMAX;
47192 break;
47193 case ISD::SETUGE:
47194 // Converting this to a max would handle both negative zeros and NaNs
47195 // incorrectly, but we can swap the operands to fix both.
47196 std::swap(LHS, RHS);
47197 [[fallthrough]];
47198 case ISD::SETOGT:
47199 case ISD::SETGT:
47200 case ISD::SETGE:
47201 Opcode = X86ISD::FMAX;
47202 break;
47203 }
47204 // Check for x CC y ? y : x -- a min/max with reversed arms.
47205 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47206 switch (CC) {
47207 default: break;
47208 case ISD::SETOGE:
47209 // Converting this to a min would handle comparisons between positive
47210 // and negative zero incorrectly, and swapping the operands would
47211 // cause it to handle NaNs incorrectly.
47212 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47213 !(DAG.isKnownNeverZeroFloat(LHS) ||
47214 DAG.isKnownNeverZeroFloat(RHS))) {
47215 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47216 break;
47217 std::swap(LHS, RHS);
47218 }
47219 Opcode = X86ISD::FMIN;
47220 break;
47221 case ISD::SETUGT:
47222 // Converting this to a min would handle NaNs incorrectly.
47223 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47224 break;
47225 Opcode = X86ISD::FMIN;
47226 break;
47227 case ISD::SETUGE:
47228 // Converting this to a min would handle both negative zeros and NaNs
47229 // incorrectly, but we can swap the operands to fix both.
47230 std::swap(LHS, RHS);
47231 [[fallthrough]];
47232 case ISD::SETOGT:
47233 case ISD::SETGT:
47234 case ISD::SETGE:
47235 Opcode = X86ISD::FMIN;
47236 break;
47237
47238 case ISD::SETULT:
47239 // Converting this to a max would handle NaNs incorrectly.
47240 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47241 break;
47242 Opcode = X86ISD::FMAX;
47243 break;
47244 case ISD::SETOLE:
47245 // Converting this to a max would handle comparisons between positive
47246 // and negative zero incorrectly, and swapping the operands would
47247 // cause it to handle NaNs incorrectly.
47248 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47249 !DAG.isKnownNeverZeroFloat(LHS) &&
47250 !DAG.isKnownNeverZeroFloat(RHS)) {
47251 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47252 break;
47253 std::swap(LHS, RHS);
47254 }
47255 Opcode = X86ISD::FMAX;
47256 break;
47257 case ISD::SETULE:
47258 // Converting this to a max would handle both negative zeros and NaNs
47259 // incorrectly, but we can swap the operands to fix both.
47260 std::swap(LHS, RHS);
47261 [[fallthrough]];
47262 case ISD::SETOLT:
47263 case ISD::SETLT:
47264 case ISD::SETLE:
47265 Opcode = X86ISD::FMAX;
47266 break;
47267 }
47268 }
47269
47270 if (Opcode) {
47271 if (IsStrict) {
47272 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47273 : X86ISD::STRICT_FMAX,
47274 DL, {N->getValueType(0), MVT::Other},
47275 {Cond.getOperand(0), LHS, RHS});
47276 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47277 return Ret;
47278 }
47279 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47280 }
47281 }
47282
47283 // Some mask scalar intrinsics rely on checking if only one bit is set
47284 // and implement it in C code like this:
47285 // A[0] = (U & 1) ? A[0] : W[0];
47286 // This creates some redundant instructions that break pattern matching.
47287 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
47288 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47289 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47290 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47291 SDValue AndNode = Cond.getOperand(0);
47292 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47293 isNullConstant(Cond.getOperand(1)) &&
47294 isOneConstant(AndNode.getOperand(1))) {
47295 // LHS and RHS swapped due to
47296 // setcc outputting 1 when AND resulted in 0 and vice versa.
47297 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47298 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47299 }
47300 }
47301
47302 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47303 // lowering on KNL. In this case we convert it to
47304 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47305 // The same applies to all vectors of i8 and i16 elements without BWI.
47306 // Make sure we extend these even before type legalization gets a chance to
47307 // split wide vectors.
47308 // Since SKX these selects have a proper lowering.
47309 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47310 CondVT.getVectorElementType() == MVT::i1 &&
47311 (VT.getVectorElementType() == MVT::i8 ||
47312 VT.getVectorElementType() == MVT::i16)) {
47313 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47314 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47315 }
47316
47317 // AVX512 - Extend select to merge with target shuffle.
47318 // select(mask, extract_subvector(shuffle(x)), y) -->
47319 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47320 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47321 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47322 CondVT.getVectorElementType() == MVT::i1) {
47323 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47324 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47325 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47326 isNullConstant(Op.getOperand(1)) &&
47327 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47328 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47329 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47330 ISD::isBuildVectorAllZeros(Alt.getNode()));
47331 };
47332
47333 bool SelectableLHS = SelectableOp(LHS, RHS);
47334 bool SelectableRHS = SelectableOp(RHS, LHS);
47335 if (SelectableLHS || SelectableRHS) {
47336 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
47337 : RHS.getOperand(0).getValueType();
47338 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
47339 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
47340 VT.getSizeInBits());
47341 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
47342 VT.getSizeInBits());
47343 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
47344 DAG.getUNDEF(SrcCondVT), Cond,
47345 DAG.getVectorIdxConstant(0, DL));
47346 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
47347 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
47348 }
47349 }
47350
47351 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
47352 return V;
47353
47354 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47355 Cond.hasOneUse()) {
47356 EVT CondVT = Cond.getValueType();
47357 SDValue Cond0 = Cond.getOperand(0);
47358 SDValue Cond1 = Cond.getOperand(1);
47359 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47360
47361 // Canonicalize min/max:
47362 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47363 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47364 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
47365 // the need for an extra compare against zero. e.g.
47366 // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
47367 // subl %esi, %edi
47368 // testl %edi, %edi
47369 // movl $0, %eax
47370 // cmovgl %edi, %eax
47371 // =>
47372 // xorl %eax, %eax
47373 // subl %esi, $edi
47374 // cmovsl %eax, %edi
47375 //
47376 // We can also canonicalize
47377 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
47378 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
47379 // This allows the use of a test instruction for the compare.
47380 if (LHS == Cond0 && RHS == Cond1) {
47381 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
47382 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
47383 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
47384 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47385 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47386 }
47387 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
47388 ISD::CondCode NewCC = ISD::SETUGE;
47389 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47390 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47391 }
47392 }
47393
47394 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
47395 // fold eq + gt/lt nested selects into ge/le selects
47396 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
47397 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47398 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
47399 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47400 // .. etc ..
47401 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
47402 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
47403 SDValue InnerSetCC = RHS.getOperand(0);
47404 ISD::CondCode InnerCC =
47405 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
47406 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
47407 Cond0 == InnerSetCC.getOperand(0) &&
47408 Cond1 == InnerSetCC.getOperand(1)) {
47409 ISD::CondCode NewCC;
47410 switch (CC == ISD::SETEQ ? InnerCC : CC) {
47411 // clang-format off
47412 case ISD::SETGT: NewCC = ISD::SETGE; break;
47413 case ISD::SETLT: NewCC = ISD::SETLE; break;
47414 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
47415 case ISD::SETULT: NewCC = ISD::SETULE; break;
47416 default: NewCC = ISD::SETCC_INVALID; break;
47417 // clang-format on
47418 }
47419 if (NewCC != ISD::SETCC_INVALID) {
47420 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
47421 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
47422 }
47423 }
47424 }
47425 }
47426
47427 // Check if the first operand is all zeros and Cond type is vXi1.
47428 // If this an avx512 target we can improve the use of zero masking by
47429 // swapping the operands and inverting the condition.
47430 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
47431 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
47432 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
47433 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
47434 // Invert the cond to not(cond) : xor(op,allones)=not(op)
47435 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
47436 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
47437 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
47438 }
47439
47440 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
47441 // get split by legalization.
47442 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
47443 CondVT.getVectorElementType() == MVT::i1 &&
47444 TLI.isTypeLegal(VT.getScalarType())) {
47445 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
47446 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
47447 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
47448 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
47449 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
47450 }
47451 }
47452
47453 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
47454 // with out-of-bounds clamping.
47455
47456 // Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle
47457 // shift amounts exceeding the element bitwidth themselves: any element whose
47458 // shift amount is bitwidth or larger produces zero, for both the variable
47459 // left shift and the variable unsigned right shift, which is exactly the
47460 // value the selects below supply for out-of-range amounts.
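// E.g. select (icmp ult Amt, 32), (shl X, Amt), 0 on v8i32 becomes a single
// VPSLLVD, and the equivalent icmp_uge form with swapped arms becomes the same
// node via the second pattern below.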
47461 if (N->getOpcode() == ISD::VSELECT) {
47462 using namespace llvm::SDPatternMatch;
47463 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
47464 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
47465 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
47466 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
47467 ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
47468 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
47469 m_SpecificInt(VT.getScalarSizeInBits()),
47470 m_SpecificCondCode(ISD::SETULT)))) {
47471 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
47472 : X86ISD::VSHLV,
47473 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
47474 }
47475 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
47476 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
47477 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
47478 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
47479 ISD::isConstantSplatVectorAllZeros(LHS.getNode()) &&
47480 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
47481 m_SpecificInt(VT.getScalarSizeInBits()),
47482 m_SpecificCondCode(ISD::SETUGE)))) {
47483 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
47484 : X86ISD::VSHLV,
47485 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
47486 }
47487 }
47488
47489 // Early exit check
47490 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
47491 return SDValue();
47492
47493 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DL, DCI, Subtarget))
47494 return V;
47495
47496 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
47497 return V;
47498
47499 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
47500 return V;
47501
47502 // select(~Cond, X, Y) -> select(Cond, Y, X)
47503 if (CondVT.getScalarType() != MVT::i1) {
47504 if (SDValue CondNot = IsNOT(Cond, DAG))
47505 return DAG.getNode(N->getOpcode(), DL, VT,
47506 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
47507
47508 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
47509 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
47510 Cond.getOperand(0).getOpcode() == ISD::AND &&
47511 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
47512 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
47513 Cond.getScalarValueSizeInBits(),
47514 /*AllowUndefs=*/true) &&
47515 Cond.hasOneUse()) {
47516 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
47517 Cond.getOperand(0).getOperand(1));
47518 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47519 }
47520
47521 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
47522 // signbit.
47523 if (Cond.getOpcode() == X86ISD::PCMPGT &&
47524 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
47525 Cond.hasOneUse()) {
47526 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
47527 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
47528 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47529 }
47530 }
47531
47532 // Try to optimize vXi1 selects if both operands are either all constants or
47533 // bitcasts from scalar integer type. In that case we can convert the operands
47534 // to integer and use an integer select which will be converted to a CMOV.
47535 // We need to take a little bit of care to avoid creating an i64 type after
47536 // type legalization.
47537 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
47538 VT.getVectorElementType() == MVT::i1 &&
47539 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
47540 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
47541 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
47542 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
47543 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
47544
47545 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
47546 LHS.getOperand(0).getValueType() == IntVT)) &&
47547 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
47548 RHS.getOperand(0).getValueType() == IntVT))) {
47549 if (LHSIsConst)
47550 LHS = combinevXi1ConstantToInteger(LHS, DAG);
47551 else
47552 LHS = LHS.getOperand(0);
47553
47554 if (RHSIsConst)
47555 RHS = combinevXi1ConstantToInteger(RHS, DAG);
47556 else
47557 RHS = RHS.getOperand(0);
47558
47559 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
47560 return DAG.getBitcast(VT, Select);
47561 }
47562 }
47563 }
47564
47565 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
47566 // single bits, then invert the predicate and swap the select operands.
47567 // This can lower using a vector shift bit-hack rather than mask and compare.
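// E.g. for v4i32 with C = <1,2,4,8>, shifting element i left by 31 - log2(C[i])
// moves the tested bit into the sign bit, so the following blend can key off
// the sign bit directly instead of needing an equality compare against zero.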
47568 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
47569 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
47570 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
47571 Cond.getOperand(0).getOpcode() == ISD::AND &&
47572 isNullOrNullSplat(Cond.getOperand(1)) &&
47573 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
47574 Cond.getOperand(0).getValueType() == VT) {
47575 // The 'and' mask must be composed of power-of-2 constants.
47576 SDValue And = Cond.getOperand(0);
47577 auto *C = isConstOrConstSplat(And.getOperand(1));
47578 if (C && C->getAPIntValue().isPowerOf2()) {
47579 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
47580 SDValue NotCond =
47581 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
47582 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
47583 }
47584
47585 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
47586 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
47587 // 16-bit lacks a proper blendv.
47588 unsigned EltBitWidth = VT.getScalarSizeInBits();
47589 bool CanShiftBlend =
47590 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
47591 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
47592 (Subtarget.hasXOP()));
47593 if (CanShiftBlend &&
47594 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
47595 return C->getAPIntValue().isPowerOf2();
47596 })) {
47597 // Create a left-shift constant to get the mask bits over to the sign-bit.
47598 SDValue Mask = And.getOperand(1);
47599 SmallVector<int, 32> ShlVals;
47600 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
47601 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
47602 ShlVals.push_back(EltBitWidth - 1 -
47603 MaskVal->getAPIntValue().exactLogBase2());
47604 }
47605 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
47606 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
47607 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
47608 SDValue NewCond =
47609 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
47610 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
47611 }
47612 }
47613
47614 return SDValue();
47615}
47616
47617/// Combine:
47618/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
47619/// to:
47620/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
47621/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
47622/// Note that this is only legal for some op/cc combinations.
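/// E.g. for "if (atomic_fetch_add(&x, 1) < 0)", the separate compare of the
/// returned value can be dropped: "lock add $1, (x)" already sets EFLAGS for
/// old+1, and testing the old value for "< 0" corresponds to testing those
/// flags with COND_LE (the signed condition codes account for overflow).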
47623 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
47624 SelectionDAG &DAG,
47625 const X86Subtarget &Subtarget) {
47626 // This combine only operates on CMP-like nodes.
47627 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47628 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47629 return SDValue();
47630
47631 // Can't replace the cmp if it has more uses than the one we're looking at.
47632 // FIXME: We would like to be able to handle this, but would need to make sure
47633 // all uses were updated.
47634 if (!Cmp.hasOneUse())
47635 return SDValue();
47636
47637 // This only applies to variations of the common case:
47638 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
47639 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
47640 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
47641 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
47642 // Using the proper condcodes (see below), overflow is checked for.
47643
47644 // FIXME: We can generalize both constraints:
47645 // - XOR/OR/AND (if they were made to survive AtomicExpand)
47646 // - LHS != 1
47647 // if the result is compared.
47648
47649 SDValue CmpLHS = Cmp.getOperand(0);
47650 SDValue CmpRHS = Cmp.getOperand(1);
47651 EVT CmpVT = CmpLHS.getValueType();
47652
47653 if (!CmpLHS.hasOneUse())
47654 return SDValue();
47655
47656 unsigned Opc = CmpLHS.getOpcode();
47657 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
47658 return SDValue();
47659
47660 SDValue OpRHS = CmpLHS.getOperand(2);
47661 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
47662 if (!OpRHSC)
47663 return SDValue();
47664
47665 APInt Addend = OpRHSC->getAPIntValue();
47666 if (Opc == ISD::ATOMIC_LOAD_SUB)
47667 Addend = -Addend;
47668
47669 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
47670 if (!CmpRHSC)
47671 return SDValue();
47672
47673 APInt Comparison = CmpRHSC->getAPIntValue();
47674 APInt NegAddend = -Addend;
47675
47676 // See if we can adjust the CC to make the comparison match the negated
47677 // addend.
47678 if (Comparison != NegAddend) {
47679 APInt IncComparison = Comparison + 1;
47680 if (IncComparison == NegAddend) {
47681 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
47682 Comparison = IncComparison;
47683 CC = X86::COND_AE;
47684 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
47685 Comparison = IncComparison;
47686 CC = X86::COND_L;
47687 }
47688 }
47689 APInt DecComparison = Comparison - 1;
47690 if (DecComparison == NegAddend) {
47691 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
47692 Comparison = DecComparison;
47693 CC = X86::COND_A;
47694 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
47695 Comparison = DecComparison;
47696 CC = X86::COND_LE;
47697 }
47698 }
47699 }
47700
47701 // If the addend is the negation of the comparison value, then we can do
47702 // a full comparison by emitting the atomic arithmetic as a locked sub.
47703 if (Comparison == NegAddend) {
47704 // The CC is fine, but we need to rewrite the LHS of the comparison as an
47705 // atomic sub.
47706 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
47707 auto AtomicSub = DAG.getAtomic(
47708 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
47709 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
47710 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
47711 AN->getMemOperand());
47712 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47713 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47714 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47715 return LockOp;
47716 }
47717
47718 // We can handle comparisons with zero in a number of cases by manipulating
47719 // the CC used.
47720 if (!Comparison.isZero())
47721 return SDValue();
47722
47723 if (CC == X86::COND_S && Addend == 1)
47724 CC = X86::COND_LE;
47725 else if (CC == X86::COND_NS && Addend == 1)
47726 CC = X86::COND_G;
47727 else if (CC == X86::COND_G && Addend == -1)
47728 CC = X86::COND_GE;
47729 else if (CC == X86::COND_LE && Addend == -1)
47730 CC = X86::COND_L;
47731 else
47732 return SDValue();
47733
47734 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47735 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47736 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47737 return LockOp;
47738}
47739
47740// Check whether we're just testing the signbit, and whether we can simplify
47741// this by tracking where the signbit came from.
47742 static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
47743 SelectionDAG &DAG) {
47744 if (CC != X86::COND_S && CC != X86::COND_NS)
47745 return SDValue();
47746
47747 if (!Cmp.hasOneUse())
47748 return SDValue();
47749
47750 SDValue Src;
47751 if (Cmp.getOpcode() == X86ISD::CMP) {
47752 // CMP(X,0) -> signbit test
47753 if (!isNullConstant(Cmp.getOperand(1)))
47754 return SDValue();
47755 Src = Cmp.getOperand(0);
47756 // Peek through a SRA node as we just need the signbit.
47757 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
47758 // TODO: Use SimplifyDemandedBits instead of just SRA?
47759 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
47760 return SDValue();
47761 Src = Src.getOperand(0);
47762 } else if (Cmp.getOpcode() == X86ISD::OR) {
47763 // OR(X,Y) -> see if only one operand contributes to the signbit.
47764 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
47765 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
47766 Src = Cmp.getOperand(1);
47767 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
47768 Src = Cmp.getOperand(0);
47769 else
47770 return SDValue();
47771 } else {
47772 return SDValue();
47773 }
47774
47775 // Replace with a TEST on the MSB.
47776 SDLoc DL(Cmp);
47777 MVT SrcVT = Src.getSimpleValueType();
47778 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
47779
47780 // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
47781 // peek through and adjust the TEST bit.
47782 if (Src.getOpcode() == ISD::SHL) {
47783 if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) {
47784 Src = Src.getOperand(0);
47785 BitMask.lshrInPlace(*ShiftAmt);
47786 }
47787 }
47788
47789 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
47790 DAG.getConstant(BitMask, DL, SrcVT));
47791 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
47792 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
47793 DAG.getConstant(0, DL, SrcVT));
47794}
47795
47796// Check whether a boolean test is testing a boolean value generated by
47797// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47798// code.
47799//
47800// Simplify the following patterns:
47801// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47802// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47803// to (Op EFLAGS Cond)
47804//
47805// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47806// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47807// to (Op EFLAGS !Cond)
47808//
47809// where Op could be BRCOND or CMOV.
47810//
47811 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47812 // This combine only operates on CMP-like nodes.
47813 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47814 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47815 return SDValue();
47816
47817 // Quit if not used as a boolean value.
47818 if (CC != X86::COND_E && CC != X86::COND_NE)
47819 return SDValue();
47820
47821 // Check CMP operands. One of them should be 0 or 1 and the other should be
47822 // an SetCC or extended from it.
47823 SDValue Op1 = Cmp.getOperand(0);
47824 SDValue Op2 = Cmp.getOperand(1);
47825
47826 SDValue SetCC;
47827 const ConstantSDNode* C = nullptr;
47828 bool needOppositeCond = (CC == X86::COND_E);
47829 bool checkAgainstTrue = false; // Is it a comparison against 1?
47830
47831 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47832 SetCC = Op2;
47833 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47834 SetCC = Op1;
47835 else // Quit if all operands are not constants.
47836 return SDValue();
47837
47838 if (C->getZExtValue() == 1) {
47839 needOppositeCond = !needOppositeCond;
47840 checkAgainstTrue = true;
47841 } else if (C->getZExtValue() != 0)
47842 // Quit if the constant is neither 0 nor 1.
47843 return SDValue();
47844
47845 bool truncatedToBoolWithAnd = false;
47846 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47847 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47848 SetCC.getOpcode() == ISD::TRUNCATE ||
47849 SetCC.getOpcode() == ISD::AND) {
47850 if (SetCC.getOpcode() == ISD::AND) {
47851 int OpIdx = -1;
47852 if (isOneConstant(SetCC.getOperand(0)))
47853 OpIdx = 1;
47854 if (isOneConstant(SetCC.getOperand(1)))
47855 OpIdx = 0;
47856 if (OpIdx < 0)
47857 break;
47858 SetCC = SetCC.getOperand(OpIdx);
47859 truncatedToBoolWithAnd = true;
47860 } else
47861 SetCC = SetCC.getOperand(0);
47862 }
47863
47864 switch (SetCC.getOpcode()) {
47865 case X86ISD::SETCC_CARRY:
47866 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47867 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47868 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47869 // truncated to i1 using 'and'.
47870 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47871 break;
47872 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47873 "Invalid use of SETCC_CARRY!");
47874 [[fallthrough]];
47875 case X86ISD::SETCC:
47876 // Set the condition code or opposite one if necessary.
47877 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47878 if (needOppositeCond)
47879 CC = X86::GetOppositeBranchCondition(CC);
47880 return SetCC.getOperand(1);
47881 case X86ISD::CMOV: {
47882 // Check whether false/true value has canonical one, i.e. 0 or 1.
47883 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47884 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47885 // Quit if true value is not a constant.
47886 if (!TVal)
47887 return SDValue();
47888 // Quit if false value is not a constant.
47889 if (!FVal) {
47890 SDValue Op = SetCC.getOperand(0);
47891 // Skip 'zext' or 'trunc' node.
47892 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47893 Op.getOpcode() == ISD::TRUNCATE)
47894 Op = Op.getOperand(0);
47895 // A special case for rdrand/rdseed, where 0 is set if false cond is
47896 // found.
47897 if ((Op.getOpcode() != X86ISD::RDRAND &&
47898 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47899 return SDValue();
47900 }
47901 // Quit if false value is not the constant 0 or 1.
47902 bool FValIsFalse = true;
47903 if (FVal && FVal->getZExtValue() != 0) {
47904 if (FVal->getZExtValue() != 1)
47905 return SDValue();
47906 // If FVal is 1, opposite cond is needed.
47907 needOppositeCond = !needOppositeCond;
47908 FValIsFalse = false;
47909 }
47910 // Quit if TVal is not the constant opposite of FVal.
47911 if (FValIsFalse && TVal->getZExtValue() != 1)
47912 return SDValue();
47913 if (!FValIsFalse && TVal->getZExtValue() != 0)
47914 return SDValue();
47915 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47916 if (needOppositeCond)
47917 CC = X86::GetOppositeBranchCondition(CC);
47918 return SetCC.getOperand(3);
47919 }
47920 }
47921
47922 return SDValue();
47923}
47924
47925/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47926/// Match:
47927/// (X86or (X86setcc) (X86setcc))
47928/// (X86cmp (and (X86setcc) (X86setcc)), 0)
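/// E.g. (X86or (setb EFLAGS) (sete EFLAGS)) over a single EFLAGS value yields
/// CC0 = COND_B, CC1 = COND_E and isAnd = false, letting the caller rebuild the
/// logic as two flag tests without materializing the boolean values.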
47929 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47930 X86::CondCode &CC1, SDValue &Flags,
47931 bool &isAnd) {
47932 if (Cond->getOpcode() == X86ISD::CMP) {
47933 if (!isNullConstant(Cond->getOperand(1)))
47934 return false;
47935
47936 Cond = Cond->getOperand(0);
47937 }
47938
47939 isAnd = false;
47940
47941 SDValue SetCC0, SetCC1;
47942 switch (Cond->getOpcode()) {
47943 default: return false;
47944 case ISD::AND:
47945 case X86ISD::AND:
47946 isAnd = true;
47947 [[fallthrough]];
47948 case ISD::OR:
47949 case X86ISD::OR:
47950 SetCC0 = Cond->getOperand(0);
47951 SetCC1 = Cond->getOperand(1);
47952 break;
47953 };
47954
47955 // Make sure we have SETCC nodes, using the same flags value.
47956 if (SetCC0.getOpcode() != X86ISD::SETCC ||
47957 SetCC1.getOpcode() != X86ISD::SETCC ||
47958 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47959 return false;
47960
47961 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47962 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47963 Flags = SetCC0->getOperand(1);
47964 return true;
47965}
47966
47967// When legalizing carry, we create carries via add X, -1
47968// If that comes from an actual carry, via setcc, we use the
47969// carry directly.
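// E.g. EFLAGS = (X86add (setcc COND_B, Flags), -1): adding -1 to the 0/1 setcc
// result produces a carry exactly when the setcc was 1, so the original Flags
// value can be consumed directly and the materialized boolean disappears.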
47970 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47971 if (EFLAGS.getOpcode() == X86ISD::ADD) {
47972 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
47973 bool FoundAndLSB = false;
47974 SDValue Carry = EFLAGS.getOperand(0);
47975 while (Carry.getOpcode() == ISD::TRUNCATE ||
47976 Carry.getOpcode() == ISD::ZERO_EXTEND ||
47977 (Carry.getOpcode() == ISD::AND &&
47978 isOneConstant(Carry.getOperand(1)))) {
47979 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
47980 Carry = Carry.getOperand(0);
47981 }
47982 if (Carry.getOpcode() == X86ISD::SETCC ||
47983 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
47984 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47985 uint64_t CarryCC = Carry.getConstantOperandVal(0);
47986 SDValue CarryOp1 = Carry.getOperand(1);
47987 if (CarryCC == X86::COND_B)
47988 return CarryOp1;
47989 if (CarryCC == X86::COND_A) {
47990 // Try to convert COND_A into COND_B in an attempt to facilitate
47991 // materializing "setb reg".
47992 //
47993 // Do not flip "e > c", where "c" is a constant, because Cmp
47994 // instruction cannot take an immediate as its first operand.
47995 //
47996 if (CarryOp1.getOpcode() == X86ISD::SUB &&
47997 CarryOp1.getNode()->hasOneUse() &&
47998 CarryOp1.getValueType().isInteger() &&
47999 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48000 SDValue SubCommute =
48001 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48002 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48003 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48004 }
48005 }
48006 // If this is a check of the z flag of an add with 1, switch to the
48007 // C flag.
48008 if (CarryCC == X86::COND_E &&
48009 CarryOp1.getOpcode() == X86ISD::ADD &&
48010 isOneConstant(CarryOp1.getOperand(1)))
48011 return CarryOp1;
48012 } else if (FoundAndLSB) {
48013 SDLoc DL(Carry);
48014 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48015 if (Carry.getOpcode() == ISD::SRL) {
48016 BitNo = Carry.getOperand(1);
48017 Carry = Carry.getOperand(0);
48018 }
48019 return getBT(Carry, BitNo, DL, DAG);
48020 }
48021 }
48022 }
48023
48024 return SDValue();
48025}
48026
48027/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48028/// to avoid the inversion.
48029 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48030 SelectionDAG &DAG,
48031 const X86Subtarget &Subtarget) {
48032 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48033 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48034 EFLAGS.getOpcode() != X86ISD::TESTP)
48035 return SDValue();
48036
48037 // PTEST/TESTP sets EFLAGS as:
48038 // TESTZ: ZF = (Op0 & Op1) == 0
48039 // TESTC: CF = (~Op0 & Op1) == 0
48040 // TESTNZC: ZF == 0 && CF == 0
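// Consequently an inverted operand just swaps which flag observes the value,
// e.g. TESTZ(~X, Y) computes (~X & Y) == 0, which is precisely TESTC(X, Y);
// the CC rewrites below exploit this to avoid materializing the NOT.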
48041 MVT VT = EFLAGS.getSimpleValueType();
48042 SDValue Op0 = EFLAGS.getOperand(0);
48043 SDValue Op1 = EFLAGS.getOperand(1);
48044 MVT OpVT = Op0.getSimpleValueType();
48045 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48046
48047 // TEST*(~X,Y) == TEST*(X,Y)
48048 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48049 X86::CondCode InvCC;
48050 switch (CC) {
48051 case X86::COND_B:
48052 // testc -> testz.
48053 InvCC = X86::COND_E;
48054 break;
48055 case X86::COND_AE:
48056 // !testc -> !testz.
48057 InvCC = X86::COND_NE;
48058 break;
48059 case X86::COND_E:
48060 // testz -> testc.
48061 InvCC = X86::COND_B;
48062 break;
48063 case X86::COND_NE:
48064 // !testz -> !testc.
48065 InvCC = X86::COND_AE;
48066 break;
48067 case X86::COND_A:
48068 case X86::COND_BE:
48069 // testnzc -> testnzc (no change).
48070 InvCC = CC;
48071 break;
48072 default:
48073 InvCC = X86::COND_INVALID;
48074 break;
48075 }
48076
48077 if (InvCC != X86::COND_INVALID) {
48078 CC = InvCC;
48079 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48080 DAG.getBitcast(OpVT, NotOp0), Op1);
48081 }
48082 }
48083
48084 if (CC == X86::COND_B || CC == X86::COND_AE) {
48085 // TESTC(X,~X) == TESTC(X,-1)
48086 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48087 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48088 SDLoc DL(EFLAGS);
48089 return DAG.getNode(
48090 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48091 DAG.getBitcast(OpVT,
48092 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48093 }
48094 }
48095 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48096 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48097 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48098 SDValue BC0 = peekThroughBitcasts(Op0);
48099 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48100 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48101 SDLoc DL(EFLAGS);
48102 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48103 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48104 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48105 }
48106 }
48107 }
48108
48109 if (CC == X86::COND_E || CC == X86::COND_NE) {
48110 // TESTZ(X,~Y) == TESTC(Y,X)
48111 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48112 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48113 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48114 DAG.getBitcast(OpVT, NotOp1), Op0);
48115 }
48116
48117 if (Op0 == Op1) {
48118 SDValue BC = peekThroughBitcasts(Op0);
48119 EVT BCVT = BC.getValueType();
48120
48121 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48122 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48123 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48124 DAG.getBitcast(OpVT, BC.getOperand(0)),
48125 DAG.getBitcast(OpVT, BC.getOperand(1)));
48126 }
48127
48128 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48129 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48130 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48131 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48132 DAG.getBitcast(OpVT, BC.getOperand(0)),
48133 DAG.getBitcast(OpVT, BC.getOperand(1)));
48134 }
48135
48136 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48137 // to more efficiently extract the sign bits and compare that.
48138 // TODO: Handle TESTC with comparison inversion.
48139 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48140 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
48141 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48142 unsigned EltBits = BCVT.getScalarSizeInBits();
48143 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48144 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48145 APInt SignMask = APInt::getSignMask(EltBits);
48146 if (SDValue Res =
48147 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48148 // For vXi16 cases we need to use pmovmskb and extract every other
48149 // sign bit.
48150 SDLoc DL(EFLAGS);
48151 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48152 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48153 MVT FloatVT =
48154 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48155 Res = DAG.getBitcast(FloatVT, Res);
48156 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48157 } else if (EltBits == 16) {
48158 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48159 Res = DAG.getBitcast(MovmskVT, Res);
48160 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48161 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48162 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48163 } else {
48164 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48165 }
48166 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48167 DAG.getConstant(0, DL, MVT::i32));
48168 }
48169 }
48170 }
48171 }
48172
48173 // TESTZ(-1,X) == TESTZ(X,X)
48174 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48175 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48176
48177 // TESTZ(X,-1) == TESTZ(X,X)
48178 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48179 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48180
48181 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48182 // TODO: Add COND_NE handling?
48183 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48184 SDValue Src0 = peekThroughBitcasts(Op0);
48185 SDValue Src1 = peekThroughBitcasts(Op1);
48186 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48187 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48188 peekThroughBitcasts(Src0.getOperand(1)), true);
48189 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48190 peekThroughBitcasts(Src1.getOperand(1)), true);
48191 if (Src0 && Src1) {
48192 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48193 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48194 DAG.getBitcast(OpVT2, Src0),
48195 DAG.getBitcast(OpVT2, Src1));
48196 }
48197 }
48198 }
48199 }
48200
48201 return SDValue();
48202}
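// For reference, the flag identities behind the CC swaps above: TESTC sets
// CF = ((~Op0 & Op1) == 0), so applying it to (NOT(X), Y) yields
// CF = ((X & Y) == 0), which is exactly the ZF that TESTZ(X, Y) produces.
// Folding away the NOT therefore trades COND_B/COND_AE (CF checks) for
// COND_E/COND_NE (ZF checks) and vice versa, while COND_A/COND_BE depend on
// both flags symmetrically and are left unchanged.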
48203
48204// Attempt to simplify the MOVMSK input based on the comparison type.
48205static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48206 SelectionDAG &DAG,
48207 const X86Subtarget &Subtarget) {
48208 // Handle eq/ne against zero (any_of).
48209 // Handle eq/ne against -1 (all_of).
48210 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48211 return SDValue();
48212 if (EFLAGS.getValueType() != MVT::i32)
48213 return SDValue();
48214 unsigned CmpOpcode = EFLAGS.getOpcode();
48215 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48216 return SDValue();
48217 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48218 if (!CmpConstant)
48219 return SDValue();
48220 const APInt &CmpVal = CmpConstant->getAPIntValue();
48221
48222 SDValue CmpOp = EFLAGS.getOperand(0);
48223 unsigned CmpBits = CmpOp.getValueSizeInBits();
48224 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48225
48226 // Peek through any truncate.
48227 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48228 CmpOp = CmpOp.getOperand(0);
48229
48230 // Bail if we don't find a MOVMSK.
48231 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48232 return SDValue();
48233
48234 SDValue Vec = CmpOp.getOperand(0);
48235 MVT VecVT = Vec.getSimpleValueType();
48236 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48237 "Unexpected MOVMSK operand");
48238 unsigned NumElts = VecVT.getVectorNumElements();
48239 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48240
48241 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48242 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48243 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48244 if (!IsAnyOf && !IsAllOf)
48245 return SDValue();
48246
48247 // TODO: Check more combining cases.
48248 // We use the number of uses of the CMP operand to decide whether to combine.
48249 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" patterns
48250 // are known to benefit from this one-use constraint.
48251 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48252
48253 // See if we can peek through to a vector with a wider element type, if the
48254 // signbits extend down to all the sub-elements as well.
48255 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48256 // potential SimplifyDemandedBits/Elts cases.
48257 // If we looked through a truncate that discards bits, we can't do this
48258 // transform.
48259 // FIXME: We could do this transform for truncates that discarded bits by
48260 // inserting an AND mask between the new MOVMSK and the CMP.
48261 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48262 SDValue BC = peekThroughBitcasts(Vec);
48263 MVT BCVT = BC.getSimpleValueType();
48264 unsigned BCNumElts = BCVT.getVectorNumElements();
48265 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48266 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48267 BCNumEltBits > NumEltBits &&
48268 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48269 SDLoc DL(EFLAGS);
48270 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48271 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48272 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48273 DAG.getConstant(CmpMask, DL, MVT::i32));
48274 }
48275 }
48276
48277 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48278 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48279 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48280 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48281 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48282 SmallVector<SDValue> Ops;
48283 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48284 Ops.size() == 2) {
48285 SDLoc DL(EFLAGS);
48286 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48287 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48288 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48289 DAG.getBitcast(SubVT, Ops[0]),
48290 DAG.getBitcast(SubVT, Ops[1]));
48291 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48292 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48293 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48294 DAG.getConstant(CmpMask, DL, MVT::i32));
48295 }
48296 }
48297
48298 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48299 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48300 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48301 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48302 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48303 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48304 SDValue BC = peekThroughBitcasts(Vec);
48305 // Ensure MOVMSK was testing every signbit of BC.
48306 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48307 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48308 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48309 BC.getOperand(0), BC.getOperand(1));
48310 V = DAG.getBitcast(TestVT, V);
48311 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48312 }
48313 // Check for 256-bit split vector cases.
48314 if (BC.getOpcode() == ISD::AND &&
48315 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48316 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48317 SDValue LHS = BC.getOperand(0);
48318 SDValue RHS = BC.getOperand(1);
48319 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48320 LHS.getOperand(0), LHS.getOperand(1));
48321 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48322 RHS.getOperand(0), RHS.getOperand(1));
48323 LHS = DAG.getBitcast(TestVT, LHS);
48324 RHS = DAG.getBitcast(TestVT, RHS);
48325 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48326 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48327 }
48328 }
48329 }
48330
48331 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
48332 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
48333 // sign bits prior to the comparison with zero unless we know that
48334 // the vXi16 splats the sign bit down to the lower i8 half.
48335 // TODO: Handle all_of patterns.
48336 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
48337 SDValue VecOp0 = Vec.getOperand(0);
48338 SDValue VecOp1 = Vec.getOperand(1);
48339 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
48340 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
48341 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48342 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
48343 SDLoc DL(EFLAGS);
48344 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
48345 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48346 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
48347 if (!SignExt0) {
48348 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
48349 DAG.getConstant(0xAAAA, DL, MVT::i16));
48350 }
48351 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48352 DAG.getConstant(0, DL, MVT::i16));
48353 }
48354 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
48355 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48356 if (CmpBits >= 16 && Subtarget.hasInt256() &&
48357 (IsAnyOf || (SignExt0 && SignExt1))) {
48358 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
48359 SDLoc DL(EFLAGS);
48360 SDValue Result = peekThroughBitcasts(Src);
48361 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
48362 Result.getValueType().getVectorNumElements() <= NumElts) {
48363 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
48364 Result.getOperand(0), Result.getOperand(1));
48365 V = DAG.getBitcast(MVT::v4i64, V);
48366 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48367 }
48368 Result = DAG.getBitcast(MVT::v32i8, Result);
48369 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48370 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
48371 if (!SignExt0 || !SignExt1) {
48372 assert(IsAnyOf &&
48373 "Only perform v16i16 signmasks for any_of patterns");
48374 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
48375 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48376 }
48377 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48378 DAG.getConstant(CmpMask, DL, MVT::i32));
48379 }
48380 }
48381 }
48382
48383 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
48384 // Since we peek through a bitcast, we need to be careful if the base vector
48385 // type has smaller elements than the MOVMSK type. In that case, even if
48386 // all the elements are demanded by the shuffle mask, only the "high"
48387 // elements which have highbits that align with highbits in the MOVMSK vec
48388 // elements are actually demanded. A simplification of spurious operations
48389 // on the "low" elements takes place during other simplifications.
48390 //
48391 // For example:
48392 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
48393 // demanded, the result can change because we are swapping the elements around.
48394 //
48395 // To address this, we check that we can scale the shuffle mask to MOVMSK
48396 // element width (this will ensure "high" elements match). It's slightly overly
48397 // conservative, but fine for an edge case fold.
48398 SmallVector<int, 32> ShuffleMask;
48399 SmallVector<SDValue, 2> ShuffleInputs;
48400 if (NumElts <= CmpBits &&
48401 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
48402 ShuffleMask, DAG) &&
48403 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
48404 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
48405 canScaleShuffleElements(ShuffleMask, NumElts)) {
48406 SDLoc DL(EFLAGS);
48407 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
48408 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48409 Result =
48410 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
48411 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
48412 }
48413
48414 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
48415 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
48416 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
48417 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
48418 // iff every element is referenced.
48419 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
48420 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
48421 (NumEltBits == 32 || NumEltBits == 64)) {
48422 SDLoc DL(EFLAGS);
48423 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
48424 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
48425 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
48426 SDValue LHS = Vec;
48427 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
48428 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48429 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
48430 DAG.getBitcast(FloatVT, LHS),
48431 DAG.getBitcast(FloatVT, RHS));
48432 }
48433
48434 return SDValue();
48435}
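// Recap of the any_of/all_of forms handled above: MOVMSK packs one sign bit
// per element into the low NumElts bits of a scalar, so comparing that scalar
// for equality against 0 tests whether no element has its sign bit set (the
// any_of family), while comparing it against the NumElts-bit all-ones mask
// tests whether every element does (the all_of family). Each rewrite keeps
// one of those scalar comparisons intact while shrinking or bypassing the
// vector computation feeding it.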
48436
48437/// Optimize an EFLAGS definition used according to the condition code \p CC
48438/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
48439/// uses of chain values.
48440static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
48441 SelectionDAG &DAG,
48442 const X86Subtarget &Subtarget) {
48443 if (CC == X86::COND_B)
48444 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
48445 return Flags;
48446
48447 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
48448 return R;
48449
48450 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
48451 return R;
48452
48453 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
48454 return R;
48455
48456 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
48457 return R;
48458
48459 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
48460}
48461
48462/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
48463static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
48464 TargetLowering::DAGCombinerInfo &DCI,
48465 const X86Subtarget &Subtarget) {
48466 SDLoc DL(N);
48467 EVT VT = N->getValueType(0);
48468 SDValue FalseOp = N->getOperand(0);
48469 SDValue TrueOp = N->getOperand(1);
48470 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
48471 SDValue Cond = N->getOperand(3);
48472
48473 // cmov X, X, ?, ? --> X
48474 if (TrueOp == FalseOp)
48475 return TrueOp;
48476
48477 // Try to simplify the EFLAGS and condition code operands.
48478 // We can't always do this as FCMOV only supports a subset of X86 cond.
48479 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
48480 if (!(FalseOp.getValueType() == MVT::f80 ||
48481 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
48482 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
48483 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
48484 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
48485 Flags};
48486 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
48487 }
48488 }
48489
48490 // If this is a select between two integer constants, try to do some
48491 // optimizations. Note that the operands are ordered the opposite of SELECT
48492 // operands.
48493 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
48494 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
48495 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
48496 // larger than FalseC (the false value).
48497 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
48498 CC = X86::GetOppositeBranchCondition(CC);
48499 std::swap(TrueC, FalseC);
48500 std::swap(TrueOp, FalseOp);
48501 }
48502
48503 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
48504 // This is efficient for any integer data type (including i8/i16) and
48505 // shift amount.
48506 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
48507 Cond = getSETCC(CC, Cond, DL, DAG);
48508
48509 // Zero extend the condition if needed.
48510 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
48511
48512 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
48513 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
48514 DAG.getConstant(ShAmt, DL, MVT::i8));
48515 return Cond;
48516 }
48517
48518 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
48519 // for any integer data type, including i8/i16.
48520 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
48521 Cond = getSETCC(CC, Cond, DL, DAG);
48522
48523 // Zero extend the condition if needed.
48524 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
48525 FalseC->getValueType(0), Cond);
48526 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48527 SDValue(FalseC, 0));
48528 return Cond;
48529 }
48530
48531 // Optimize cases that will turn into an LEA instruction. This requires
48532 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
48533 if (VT == MVT::i32 || VT == MVT::i64) {
48534 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
48535 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
48536 "Implicit constant truncation");
48537
48538 bool isFastMultiplier = false;
48539 if (Diff.ult(10)) {
48540 switch (Diff.getZExtValue()) {
48541 default: break;
48542 case 1: // result = add base, cond
48543 case 2: // result = lea base( , cond*2)
48544 case 3: // result = lea base(cond, cond*2)
48545 case 4: // result = lea base( , cond*4)
48546 case 5: // result = lea base(cond, cond*4)
48547 case 8: // result = lea base( , cond*8)
48548 case 9: // result = lea base(cond, cond*8)
48549 isFastMultiplier = true;
48550 break;
48551 }
48552 }
48553
48554 if (isFastMultiplier) {
48555 Cond = getSETCC(CC, Cond, DL ,DAG);
48556 // Zero extend the condition if needed.
48557 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
48558 Cond);
48559 // Scale the condition by the difference.
48560 if (Diff != 1)
48561 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
48562 DAG.getConstant(Diff, DL, Cond.getValueType()));
48563
48564 // Add the base if non-zero.
48565 if (FalseC->getAPIntValue() != 0)
48566 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48567 SDValue(FalseC, 0));
48568 return Cond;
48569 }
48570 }
48571 }
48572 }
48573
48574 // Handle these cases:
48575 // (select (x != c), e, c) -> select (x != c), e, x),
48576 // (select (x == c), c, e) -> select (x == c), x, e)
48577 // where the c is an integer constant, and the "select" is the combination
48578 // of CMOV and CMP.
48579 //
48580 // The rationale for this change is that the conditional-move from a constant
48581 // needs two instructions, however, conditional-move from a register needs
48582 // only one instruction.
48583 //
48584 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
48585 // some instruction-combining opportunities. This opt needs to be
48586 // postponed as late as possible.
48587 //
48588 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
48589 // the DCI.xxxx conditions are provided to postpone the optimization as
48590 // late as possible.
48591
48592 ConstantSDNode *CmpAgainst = nullptr;
48593 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
48594 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
48595 !isa<ConstantSDNode>(Cond.getOperand(0))) {
48596
48597 if (CC == X86::COND_NE &&
48598 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
48599 CC = X86::COND_E;
48600 std::swap(TrueOp, FalseOp);
48601 }
48602
48603 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
48604 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
48605 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
48606 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
48607 }
48608 }
48609 }
48610
48611 // Transform:
48612 //
48613 // (cmov 1 T (uge T 2))
48614 //
48615 // to:
48616 //
48617 // (adc T 0 (sub T 1))
48618 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
48619 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
48620 SDValue Cond0 = Cond.getOperand(0);
48621 if (Cond0.getOpcode() == ISD::TRUNCATE)
48622 Cond0 = Cond0.getOperand(0);
48623 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
48624 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
48625 EVT CondVT = Cond->getValueType(0);
48626 // Subtract 1 and generate a carry.
48627 SDValue NewSub =
48628 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
48629 DAG.getConstant(1, DL, CondVT));
48630 SDValue EFLAGS(NewSub.getNode(), 1);
48631 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
48632 DAG.getConstant(0, DL, VT), EFLAGS);
48633 }
48634 }
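// Quick case analysis of the rewrite above (FalseOp == 1, Cond == SUB(T, 2)):
//   T == 0: SUB T, 1 borrows, CF = 1, so ADC T, 0 = 0 + 1 = 1  (cmov: T < 2 -> 1)
//   T == 1: SUB T, 1 = 0, CF = 0,  so ADC T, 0 = 1 + 0 = 1     (cmov: T < 2 -> 1)
//   T >= 2: SUB T, 1 does not borrow, so ADC T, 0 = T          (cmov: T >= 2 -> T)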
48635
48636 // Fold and/or of setcc's to double CMOV:
48637 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
48638 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
48639 //
48640 // This combine lets us generate:
48641 // cmovcc1 (jcc1 if we don't have CMOV)
48642 // cmovcc2 (same)
48643 // instead of:
48644 // setcc1
48645 // setcc2
48646 // and/or
48647 // cmovne (jne if we don't have CMOV)
48648 // When we can't use the CMOV instruction, it might increase branch
48649 // mispredicts.
48650 // When we can use CMOV, or when there is no mispredict, this improves
48651 // throughput and reduces register pressure.
48652 //
48653 if (CC == X86::COND_NE) {
48654 SDValue Flags;
48655 X86::CondCode CC0, CC1;
48656 bool isAndSetCC;
48657 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
48658 if (isAndSetCC) {
48659 std::swap(FalseOp, TrueOp);
48660 CC0 = X86::GetOppositeBranchCondition(CC0);
48661 CC1 = X86::GetOppositeBranchCondition(CC1);
48662 }
48663
48664 SDValue LOps[] = {FalseOp, TrueOp,
48665 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
48666 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
48667 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
48668 Flags};
48669 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
48670 return CMOV;
48671 }
48672 }
48673
48674 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
48675 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
48676 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
48677 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
48678 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
48679 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
48680 SDValue Add = TrueOp;
48681 SDValue Const = FalseOp;
48682 // Canonicalize the condition code for easier matching and output.
48683 if (CC == X86::COND_E)
48684 std::swap(Add, Const);
48685
48686 // We might have replaced the constant in the cmov with the LHS of the
48687 // compare. If so change it to the RHS of the compare.
48688 if (Const == Cond.getOperand(0))
48689 Const = Cond.getOperand(1);
48690
48691 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
48692 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
48693 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
48694 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
48695 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
48696 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
48697 // This should constant fold.
48698 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
48699 SDValue CMov =
48700 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
48701 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
48702 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
48703 }
48704 }
48705
48706 return SDValue();
48707}
48708
48709/// Different mul shrinking modes.
48710enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
48711
48712static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
48713 EVT VT = N->getOperand(0).getValueType();
48714 if (VT.getScalarSizeInBits() != 32)
48715 return false;
48716
48717 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
48718 unsigned SignBits[2] = {1, 1};
48719 bool IsPositive[2] = {false, false};
48720 for (unsigned i = 0; i < 2; i++) {
48721 SDValue Opd = N->getOperand(i);
48722
48723 SignBits[i] = DAG.ComputeNumSignBits(Opd);
48724 IsPositive[i] = DAG.SignBitIsZero(Opd);
48725 }
48726
48727 bool AllPositive = IsPositive[0] && IsPositive[1];
48728 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
48729 // When ranges are from -128 ~ 127, use MULS8 mode.
48730 if (MinSignBits >= 25)
48731 Mode = ShrinkMode::MULS8;
48732 // When ranges are from 0 ~ 255, use MULU8 mode.
48733 else if (AllPositive && MinSignBits >= 24)
48734 Mode = ShrinkMode::MULU8;
48735 // When ranges are from -32768 ~ 32767, use MULS16 mode.
48736 else if (MinSignBits >= 17)
48737 Mode = ShrinkMode::MULS16;
48738 // When ranges are from 0 ~ 65535, use MULU16 mode.
48739 else if (AllPositive && MinSignBits >= 16)
48740 Mode = ShrinkMode::MULU16;
48741 else
48742 return false;
48743 return true;
48744}
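// The sign-bit thresholds above follow from the 32-bit element width: at least
// 25 sign bits leaves at most 32 - 25 + 1 = 8 significant signed bits
// (-128..127); at least 24 sign bits together with a known-zero sign bit
// leaves 8 unsigned bits (0..255); the 17 and 16 cases bound the values to 16
// signed/unsigned bits in the same way.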
48745
48746/// When the operands of vector mul are extended from smaller size values,
48747/// like i8 and i16, the type of the mul may be shrunk to generate more
48748/// efficient code. Two typical patterns are handled:
48749/// Pattern1:
48750/// %2 = sext/zext <N x i8> %1 to <N x i32>
48751/// %4 = sext/zext <N x i8> %3 to <N x i32>
48752/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48753/// %5 = mul <N x i32> %2, %4
48754///
48755/// Pattern2:
48756/// %2 = zext/sext <N x i16> %1 to <N x i32>
48757/// %4 = zext/sext <N x i16> %3 to <N x i32>
48758/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48759/// %5 = mul <N x i32> %2, %4
48760///
48761/// There are four mul shrinking modes:
48762/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
48763/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
48764/// generate pmullw+sext32 for it (MULS8 mode).
48765/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
48766/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
48767/// generate pmullw+zext32 for it (MULU8 mode).
48768/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
48769/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
48770/// generate pmullw+pmulhw for it (MULS16 mode).
48771/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
48772/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
48773/// generate pmullw+pmulhuw for it (MULU16 mode).
48774static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
48775 const X86Subtarget &Subtarget) {
48776 // Check for legality
48777 // pmullw/pmulhw are not supported by SSE.
48778 if (!Subtarget.hasSSE2())
48779 return SDValue();
48780
48781 // Check for profitability
48782 // pmulld is supported since SSE41. It is better to use pmulld
48783 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
48784 // the expansion.
48785 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
48786 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
48787 return SDValue();
48788
48789 ShrinkMode Mode;
48790 if (!canReduceVMulWidth(N, DAG, Mode))
48791 return SDValue();
48792
48793 SDValue N0 = N->getOperand(0);
48794 SDValue N1 = N->getOperand(1);
48795 EVT VT = N->getOperand(0).getValueType();
48796 unsigned NumElts = VT.getVectorNumElements();
48797 if ((NumElts % 2) != 0)
48798 return SDValue();
48799
48800 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
48801
48802 // Shrink the operands of mul.
48803 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
48804 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
48805
48806 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
48807 // lower part is needed.
48808 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48809 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48810 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48811 : ISD::SIGN_EXTEND,
48812 DL, VT, MulLo);
48813
48814 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48815 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48816 // the higher part is also needed.
48817 SDValue MulHi =
48818 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48819 ReducedVT, NewN0, NewN1);
48820
48821 // Repack the lower part and higher part result of mul into a wider
48822 // result.
48823 // Generate shuffle functioning as punpcklwd.
48824 SmallVector<int, 16> ShuffleMask(NumElts);
48825 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48826 ShuffleMask[2 * i] = i;
48827 ShuffleMask[2 * i + 1] = i + NumElts;
48828 }
48829 SDValue ResLo =
48830 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48831 ResLo = DAG.getBitcast(ResVT, ResLo);
48832 // Generate shuffle functioning as punpckhwd.
48833 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48834 ShuffleMask[2 * i] = i + NumElts / 2;
48835 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48836 }
48837 SDValue ResHi =
48838 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48839 ResHi = DAG.getBitcast(ResVT, ResHi);
48840 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48841}
48842
48843static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48844 EVT VT, const SDLoc &DL) {
48845
48846 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48847 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48848 DAG.getConstant(Mult, DL, VT));
48849 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48850 DAG.getConstant(Shift, DL, MVT::i8));
48851 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48852 N->getOperand(0));
48853 return Result;
48854 };
48855
48856 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48857 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48858 DAG.getConstant(Mul1, DL, VT));
48859 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48860 DAG.getConstant(Mul2, DL, VT));
48861 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48862 N->getOperand(0));
48863 return Result;
48864 };
48865
48866 switch (MulAmt) {
48867 default:
48868 break;
48869 case 11:
48870 // mul x, 11 => add ((shl (mul x, 5), 1), x)
48871 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48872 case 21:
48873 // mul x, 21 => add ((shl (mul x, 5), 2), x)
48874 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48875 case 41:
48876 // mul x, 41 => add ((shl (mul x, 5), 3), x)
48877 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48878 case 22:
48879 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48880 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48881 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48882 case 19:
48883 // mul x, 19 => add ((shl (mul x, 9), 1), x)
48884 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48885 case 37:
48886 // mul x, 37 => add ((shl (mul x, 9), 2), x)
48887 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48888 case 73:
48889 // mul x, 73 => add ((shl (mul x, 9), 3), x)
48890 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48891 case 13:
48892 // mul x, 13 => add ((shl (mul x, 3), 2), x)
48893 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48894 case 23:
48895 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48896 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48897 case 26:
48898 // mul x, 26 => add ((mul (mul x, 5), 5), x)
48899 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48900 case 28:
48901 // mul x, 28 => add ((mul (mul x, 9), 3), x)
48902 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48903 case 29:
48904 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48905 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48906 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48907 }
48908
48909 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
48910 // by a single LEA.
48911 // First check if this is a sum of two powers of 2 because that's easy. Then
48912 // count the trailing zeros up to the first set bit.
48913 // TODO: We can do this even without LEA at a cost of two shifts and an add.
48914 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48915 unsigned ScaleShift = llvm::countr_zero(MulAmt);
48916 if (ScaleShift >= 1 && ScaleShift < 4) {
48917 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48918 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48919 DAG.getConstant(ShiftAmt, DL, MVT::i8));
48920 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48921 DAG.getConstant(ScaleShift, DL, MVT::i8));
48922 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48923 }
48924 }
48925
48926 return SDValue();
48927}
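// Sanity arithmetic for a few of the decompositions above:
//   mul x, 11 -> ((x * 5) << 1) + x = 10x + x
//   mul x, 29 -> ((x * 9) * 3) + x + x = 27x + 2x
// A constant with exactly two set bits, such as 40 = 32 + 8, takes the final
// branch: (x << 5) + (x << 3), where the scale-by-8 term can fold into an LEA.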
48928
48929// If the upper 17 bits of one operand are zero and the other operand's upper
48930// bits are all zero/sign bits, then we can use PMADDWD, which is always at least as quick as
48931// PMULLD, except on KNL.
48932static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
48933 SelectionDAG &DAG,
48934 const X86Subtarget &Subtarget) {
48935 if (!Subtarget.hasSSE2())
48936 return SDValue();
48937
48938 if (Subtarget.isPMADDWDSlow())
48939 return SDValue();
48940
48941 EVT VT = N->getValueType(0);
48942
48943 // Only support vXi32 vectors.
48944 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48945 return SDValue();
48946
48947 // Make sure the type is legal or can split/widen to a legal type.
48948 // With AVX512 but without BWI, we would need to split v32i16.
48949 unsigned NumElts = VT.getVectorNumElements();
48950 if (NumElts == 1 || !isPowerOf2_32(NumElts))
48951 return SDValue();
48952
48953 // With AVX512 but without BWI, we would need to split v32i16.
48954 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48955 return SDValue();
48956
48957 SDValue N0 = N->getOperand(0);
48958 SDValue N1 = N->getOperand(1);
48959
48960 // If we are zero/sign extending two steps without SSE4.1, it's better to
48961 // reduce the vmul width instead.
48962 if (!Subtarget.hasSSE41() &&
48963 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48964 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48965 (N1.getOpcode() == ISD::ZERO_EXTEND &&
48966 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48967 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48968 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48969 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48970 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48971 return SDValue();
48972
48973 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
48974 // the vmul width instead.
48975 if (!Subtarget.hasSSE41() &&
48976 (N0.getOpcode() == ISD::SIGN_EXTEND &&
48977 N0.getOperand(0).getValueSizeInBits() > 128) &&
48978 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48979 N1.getOperand(0).getValueSizeInBits() > 128))
48980 return SDValue();
48981
48982 // Sign bits must extend down to the lowest i16.
48983 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48984 DAG.ComputeMaxSignificantBits(N0) > 16)
48985 return SDValue();
48986
48987 // At least one of the elements must be zero in the upper 17 bits, or can be
48988 // safely made zero without altering the final result.
48989 auto GetZeroableOp = [&](SDValue Op) {
48990 APInt Mask17 = APInt::getHighBitsSet(32, 17);
48991 if (DAG.MaskedValueIsZero(Op, Mask17))
48992 return Op;
48993 // Mask off upper 16-bits of sign-extended constants.
48994 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48995 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
48996 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48997 SDValue Src = Op.getOperand(0);
48998 // Convert sext(vXi16) to zext(vXi16).
48999 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49000 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49001 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49002 // which will expand the extension.
49003 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49004 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49005 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49006 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49007 }
49008 }
49009 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49010 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49011 N->isOnlyUserOf(Op.getNode())) {
49012 SDValue Src = Op.getOperand(0);
49013 if (Src.getScalarValueSizeInBits() == 16)
49014 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49015 }
49016 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49017 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49018 N->isOnlyUserOf(Op.getNode())) {
49019 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49020 Op.getOperand(1));
49021 }
49022 return SDValue();
49023 };
49024 SDValue ZeroN0 = GetZeroableOp(N0);
49025 SDValue ZeroN1 = GetZeroableOp(N1);
49026 if (!ZeroN0 && !ZeroN1)
49027 return SDValue();
49028 N0 = ZeroN0 ? ZeroN0 : N0;
49029 N1 = ZeroN1 ? ZeroN1 : N1;
49030
49031 // Use SplitOpsAndApply to handle AVX splitting.
49032 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49033 ArrayRef<SDValue> Ops) {
49034 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49035 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49036 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49037 DAG.getBitcast(OpVT, Ops[0]),
49038 DAG.getBitcast(OpVT, Ops[1]));
49039 };
49040 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49041}
49042
49043static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49044 const X86Subtarget &Subtarget) {
49045 if (!Subtarget.hasSSE2())
49046 return SDValue();
49047
49048 EVT VT = N->getValueType(0);
49049
49050 // Only support vXi64 vectors.
49051 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49052 VT.getVectorNumElements() < 2 ||
49053 !isPowerOf2_32(VT.getVectorNumElements()))
49054 return SDValue();
49055
49056 SDValue N0 = N->getOperand(0);
49057 SDValue N1 = N->getOperand(1);
49058
49059 // MULDQ returns the 64-bit result of the signed multiplication of the lower
49060 // 32-bits. We can lower with this if the sign bits stretch that far.
49061 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49062 DAG.ComputeNumSignBits(N1) > 32) {
49063 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49064 ArrayRef<SDValue> Ops) {
49065 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49066 };
49067 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49068 /*CheckBWI*/ false);
49069 }
49070
49071 // If the upper bits are zero we can use a single pmuludq.
49072 APInt Mask = APInt::getHighBitsSet(64, 32);
49073 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49074 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49075 ArrayRef<SDValue> Ops) {
49076 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49077 };
49078 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49079 /*CheckBWI*/ false);
49080 }
49081
49082 return SDValue();
49083}
49084
49085static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49086 TargetLowering::DAGCombinerInfo &DCI,
49087 const X86Subtarget &Subtarget) {
49088 EVT VT = N->getValueType(0);
49089 SDLoc DL(N);
49090
49091 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49092 return V;
49093
49094 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49095 return V;
49096
49097 if (DCI.isBeforeLegalize() && VT.isVector())
49098 return reduceVMULWidth(N, DL, DAG, Subtarget);
49099
49100 if (VT != MVT::i64 && VT != MVT::i32 &&
49101 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49102 return SDValue();
49103
49104 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49105 if (!Known1.isConstant())
49106 return SDValue();
49107
49108 const APInt &C = Known1.getConstant();
49109 if (C.isZero())
49110 return DAG.getConstant(0, DL, VT);
49111
49112 if (C.isAllOnes())
49113 return DAG.getNegative(N->getOperand(0), DL, VT);
49114
49115 if (isPowerOf2_64(C.getZExtValue()))
49116 return SDValue();
49117
49118 // Optimize a single multiply with constant into two operations in order to
49119 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49120 if (!MulConstantOptimization)
49121 return SDValue();
49122
49123 // An imul is usually smaller than the alternative sequence.
49124 if (DAG.getMachineFunction().getFunction().hasMinSize())
49125 return SDValue();
49126
49127 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49128 return SDValue();
49129
49130 int64_t SignMulAmt = C.getSExtValue();
49131 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49132 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49133
49134 SDValue NewMul = SDValue();
49135 if (VT == MVT::i64 || VT == MVT::i32) {
49136 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49137 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49138 DAG.getConstant(AbsMulAmt, DL, VT));
49139 if (SignMulAmt < 0)
49140 NewMul = DAG.getNegative(NewMul, DL, VT);
49141
49142 return NewMul;
49143 }
49144
49145 uint64_t MulAmt1 = 0;
49146 uint64_t MulAmt2 = 0;
49147 if ((AbsMulAmt % 9) == 0) {
49148 MulAmt1 = 9;
49149 MulAmt2 = AbsMulAmt / 9;
49150 } else if ((AbsMulAmt % 5) == 0) {
49151 MulAmt1 = 5;
49152 MulAmt2 = AbsMulAmt / 5;
49153 } else if ((AbsMulAmt % 3) == 0) {
49154 MulAmt1 = 3;
49155 MulAmt2 = AbsMulAmt / 3;
49156 }
49157
49158 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49159 if (MulAmt2 &&
49160 (isPowerOf2_64(MulAmt2) ||
49161 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49162
49163 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49164 N->user_begin()->getOpcode() == ISD::ADD))
49165 // If the second multiplier is pow2, issue it first. We want the multiply
49166 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49167 // use is an add. Only do this for positive multiply amounts since the
49168 // negate would prevent it from being used as an address mode anyway.
49169 std::swap(MulAmt1, MulAmt2);
49170
49171 if (isPowerOf2_64(MulAmt1))
49172 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49173 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49174 else
49175 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49176 DAG.getConstant(MulAmt1, DL, VT));
49177
49178 if (isPowerOf2_64(MulAmt2))
49179 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49180 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49181 else
49182 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49183 DAG.getConstant(MulAmt2, DL, VT));
49184
49185 // Negate the result.
49186 if (SignMulAmt < 0)
49187 NewMul = DAG.getNegative(NewMul, DL, VT);
49188 } else if (!Subtarget.slowLEA())
49189 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49190 }
49191 if (!NewMul) {
49192 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49193 if (isPowerOf2_64(AbsMulAmt - 1)) {
49194 // (mul x, 2^N + 1) => (add (shl x, N), x)
49195 NewMul = DAG.getNode(
49196 ISD::ADD, DL, VT, N->getOperand(0),
49197 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49198 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49199 if (SignMulAmt < 0)
49200 NewMul = DAG.getNegative(NewMul, DL, VT);
49201 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49202 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49203 NewMul =
49204 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49205 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49206 // To negate, reverse the operands of the subtract.
49207 if (SignMulAmt < 0)
49208 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49209 else
49210 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49211 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49212 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49213 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49214 NewMul =
49215 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49216 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49217 NewMul = DAG.getNode(
49218 ISD::ADD, DL, VT, NewMul,
49219 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49220 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49221 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49222 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49223 NewMul =
49224 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49225 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49226 NewMul = DAG.getNode(
49227 ISD::SUB, DL, VT, NewMul,
49228 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49229 } else if (SignMulAmt >= 0 && VT.isVector() &&
49230 Subtarget.fastImmVectorShift()) {
49231 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49232 uint64_t ShiftAmt1;
49233 std::optional<unsigned> Opc;
49234 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49235 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49236 Opc = ISD::ADD;
49237 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49238 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49239 Opc = ISD::SUB;
49240 }
49241
49242 if (Opc) {
49243 SDValue Shift1 =
49244 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49245 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49246 SDValue Shift2 =
49247 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49248 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49249 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49250 }
49251 }
49252 }
49253
49254 return NewMul;
49255}
49256
49257// Try to form a MULHU or MULHS node by looking for
49258// (srl (mul ext, ext), 16)
49259// TODO: This is X86 specific because we want to be able to handle wide types
49260// before type legalization. But we can only do it if the vector will be
49261// legalized via widening/splitting. Type legalization can't handle promotion
49262// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49263// combiner.
49264static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49265 const SDLoc &DL,
49266 const X86Subtarget &Subtarget) {
49267 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49268 "SRL or SRA node is required here!");
49269
49270 if (!Subtarget.hasSSE2())
49271 return SDValue();
49272
49273 // The operation feeding into the shift must be a multiply.
49274 SDValue ShiftOperand = N->getOperand(0);
49275 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
49276 return SDValue();
49277
49278 // Input type should be at least vXi32.
49279 EVT VT = N->getValueType(0);
49280 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49281 return SDValue();
49282
49283 // Need a shift by 16.
49284 APInt ShiftAmt;
49285 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
49286 ShiftAmt != 16)
49287 return SDValue();
49288
49289 SDValue LHS = ShiftOperand.getOperand(0);
49290 SDValue RHS = ShiftOperand.getOperand(1);
49291
49292 unsigned ExtOpc = LHS.getOpcode();
49293 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49294 RHS.getOpcode() != ExtOpc)
49295 return SDValue();
49296
49297 // Peek through the extends.
49298 LHS = LHS.getOperand(0);
49299 RHS = RHS.getOperand(0);
49300
49301 // Ensure the input types match.
49302 EVT MulVT = LHS.getValueType();
49303 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49304 return SDValue();
49305
49306 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49307 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49308
49309 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49310 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49311}
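// The identity being matched: for i16 values a and b, the i32 product of
// sext(a) * sext(b) (resp. zext(a) * zext(b)) carries mulhs(a, b) (resp.
// mulhu(a, b)) in its upper 16 bits, so (srl/sra (mul (ext a), (ext b)), 16)
// is precisely the zero/sign extension of mulh[su](a, b).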
49312
49313static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
49314 const X86Subtarget &Subtarget) {
49315 using namespace llvm::SDPatternMatch;
49316 SDValue N0 = N->getOperand(0);
49317 SDValue N1 = N->getOperand(1);
49318 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49319 EVT VT = N0.getValueType();
49320 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49321 SDLoc DL(N);
49322
49323 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49324 // with out-of-bounds clamping.
49325 if (N0.getOpcode() == ISD::VSELECT &&
49326 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
49327 SDValue Cond = N0.getOperand(0);
49328 SDValue N00 = N0.getOperand(1);
49329 SDValue N01 = N0.getOperand(2);
49330 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
49331 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
49332 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49333 m_SpecificCondCode(ISD::SETULT)))) {
49334 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
49335 }
49336 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
49337 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
49338 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49339 m_SpecificCondCode(ISD::SETUGE)))) {
49340 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
49341 }
49342 }
49343
49344 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
49345 // since the result of setcc_c is all zero's or all ones.
49346 if (VT.isInteger() && !VT.isVector() &&
49347 N1C && N0.getOpcode() == ISD::AND &&
49348 N0.getOperand(1).getOpcode() == ISD::Constant) {
49349 SDValue N00 = N0.getOperand(0);
49350 APInt Mask = N0.getConstantOperandAPInt(1);
49351 Mask <<= N1C->getAPIntValue();
49352 bool MaskOK = false;
49353 // We can handle cases concerning bit-widening nodes containing setcc_c if
49354 // we carefully interrogate the mask to make sure we are semantics
49355 // preserving.
49356 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
49357 // of the underlying setcc_c operation if the setcc_c was zero extended.
49358 // Consider the following example:
49359 // zext(setcc_c) -> i32 0x0000FFFF
49360 // c1 -> i32 0x0000FFFF
49361 // c2 -> i32 0x00000001
49362 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
49363 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
49364 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
49365 MaskOK = true;
49366 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
49367 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49368 MaskOK = true;
49369 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
49370 N00.getOpcode() == ISD::ANY_EXTEND) &&
49371 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49372 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
49373 }
49374 if (MaskOK && Mask != 0)
49375 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
49376 }
49377
49378 return SDValue();
49379}
49380
49381static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
49382 const X86Subtarget &Subtarget) {
49383 using namespace llvm::SDPatternMatch;
49384 SDValue N0 = N->getOperand(0);
49385 SDValue N1 = N->getOperand(1);
49386 EVT VT = N0.getValueType();
49387 unsigned Size = VT.getSizeInBits();
49388 SDLoc DL(N);
49389
49390 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
49391 return V;
49392
49393 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
49394 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
49395 SDValue ShrAmtVal;
49396 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
49397 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
49398 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
49399 }
49400
49401 // fold (SRA (SHL X, ShlConst), SraConst)
49402 // into (SHL (sext_in_reg X), ShlConst - SraConst)
49403 // or (sext_in_reg X)
49404 // or (SRA (sext_in_reg X), SraConst - ShlConst)
49405 // depending on relation between SraConst and ShlConst.
49406 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
49407 // us to do the sext_in_reg from corresponding bit.
49408
49409 // sexts in X86 are MOVs. The MOVs have the same code size
49410 // as the SHIFTs above (only a shift by 1 has smaller code size).
49411 // However the MOVs have 2 advantages over a SHIFT:
49412 // 1. MOVs can write to a register that differs from the source
49413 // 2. MOVs accept memory operands
49414
49415 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
49416 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
49417 N0.getOperand(1).getOpcode() != ISD::Constant)
49418 return SDValue();
49419
49420 SDValue N00 = N0.getOperand(0);
49421 SDValue N01 = N0.getOperand(1);
49422 APInt ShlConst = N01->getAsAPIntVal();
49423 APInt SraConst = N1->getAsAPIntVal();
49424 EVT CVT = N1.getValueType();
49425
49426 if (CVT != N01.getValueType())
49427 return SDValue();
49428 if (SraConst.isNegative())
49429 return SDValue();
49430
49431 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
49432 unsigned ShiftSize = SVT.getSizeInBits();
49433 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
49434 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
49435 continue;
49436 SDValue NN =
49437 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
49438 if (SraConst.eq(ShlConst))
49439 return NN;
49440 if (SraConst.ult(ShlConst))
49441 return DAG.getNode(ISD::SHL, DL, VT, NN,
49442 DAG.getConstant(ShlConst - SraConst, DL, CVT));
49443 return DAG.getNode(ISD::SRA, DL, VT, NN,
49444 DAG.getConstant(SraConst - ShlConst, DL, CVT));
49445 }
49446 return SDValue();
49447}
49448
49449static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
49450 TargetLowering::DAGCombinerInfo &DCI,
49451 const X86Subtarget &Subtarget) {
49452 using namespace llvm::SDPatternMatch;
49453 SDValue N0 = N->getOperand(0);
49454 SDValue N1 = N->getOperand(1);
49455 EVT VT = N0.getValueType();
49456 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49457 SDLoc DL(N);
49458
49459 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
49460 return V;
49461
49462 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49463 // with out-of-bounds clamping.
49464 if (N0.getOpcode() == ISD::VSELECT &&
49465 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
49466 SDValue Cond = N0.getOperand(0);
49467 SDValue N00 = N0.getOperand(1);
49468 SDValue N01 = N0.getOperand(2);
49469 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
49470 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
49471 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49472 m_SpecificCondCode(ISD::SETULT)))) {
49473 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
49474 }
49475 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
49476 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
49477 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49478 m_SpecificCondCode(ISD::SETUGE)))) {
49479 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
49480 }
49481 }
49482
49483 // Only do this on the last DAG combine as it can interfere with other
49484 // combines.
49485 if (!DCI.isAfterLegalizeDAG())
49486 return SDValue();
49487
49488 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
49489 // TODO: This is a generic DAG combine that became an x86-only combine to
49490 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
49491 // and-not ('andn').
49492 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
49493 return SDValue();
49494
49495 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
49496 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
49497 if (!ShiftC || !AndC)
49498 return SDValue();
49499
49500 // If we can shrink the constant mask below 8-bits or 32-bits, then this
49501 // transform should reduce code size. It may also enable secondary transforms
49502 // from improved known-bits analysis or instruction selection.
49503 APInt MaskVal = AndC->getAPIntValue();
49504
49505 // If this can be matched by a zero extend, don't optimize.
49506 if (MaskVal.isMask()) {
49507 unsigned TO = MaskVal.countr_one();
49508 if (TO >= 8 && isPowerOf2_32(TO))
49509 return SDValue();
49510 }
49511
49512 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
49513 unsigned OldMaskSize = MaskVal.getSignificantBits();
49514 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
49515 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
49516 (OldMaskSize > 32 && NewMaskSize <= 32)) {
49517 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
49518 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
49519 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
49520 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
49521 }
49522 return SDValue();
49523}
49524
49525static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
49526 const X86Subtarget &Subtarget) {
49527 unsigned Opcode = N->getOpcode();
49528 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
49529
49530 SDLoc DL(N);
49531 EVT VT = N->getValueType(0);
49532 SDValue N0 = N->getOperand(0);
49533 SDValue N1 = N->getOperand(1);
49534 EVT SrcVT = N0.getValueType();
49535
49536 SDValue BC0 =
49537 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
49538 SDValue BC1 =
49539 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
49540
49541 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
49542 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))). This is mainly for
49543 // truncation trees that help us avoid lane crossing shuffles.
49544 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
49545 // TODO: We don't handle vXf64 shuffles yet.
49546 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
49547 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
49548 SmallVector<SDValue> ShuffleOps;
49549 SmallVector<int> ShuffleMask, ScaledMask;
49550 SDValue Vec = peekThroughBitcasts(BCSrc);
49551 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
49552 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
49553 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
49554 // shuffle to a v4X64 width - we can probably relax this in the future.
49555 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
49556 ShuffleOps[0].getValueType().is256BitVector() &&
49557 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
49558 SDValue Lo, Hi;
49559 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
49560 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
49561 Lo = DAG.getBitcast(SrcVT, Lo);
49562 Hi = DAG.getBitcast(SrcVT, Hi);
49563 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
49564 Res = DAG.getBitcast(ShufVT, Res);
49565 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
49566 return DAG.getBitcast(VT, Res);
49567 }
49568 }
49569 }
49570 }
49571
49572 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
49573 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
49574 // If either/both ops are a shuffle that can scale to v2x64,
49575 // then see if we can perform this as a v4x32 post shuffle.
49576 SmallVector<SDValue> Ops0, Ops1;
49577 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
49578 bool IsShuf0 =
49579 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
49580 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
49581 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
49582 bool IsShuf1 =
49583 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
49584 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
49585 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
49586 if (IsShuf0 || IsShuf1) {
49587 if (!IsShuf0) {
49588 Ops0.assign({BC0});
49589 ScaledMask0.assign({0, 1});
49590 }
49591 if (!IsShuf1) {
49592 Ops1.assign({BC1});
49593 ScaledMask1.assign({0, 1});
49594 }
49595
49596 SDValue LHS, RHS;
49597 int PostShuffle[4] = {-1, -1, -1, -1};
49598 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
49599 if (M < 0)
49600 return true;
49601 Idx = M % 2;
49602 SDValue Src = Ops[M / 2];
49603 if (!LHS || LHS == Src) {
49604 LHS = Src;
49605 return true;
49606 }
49607 if (!RHS || RHS == Src) {
49608 Idx += 2;
49609 RHS = Src;
49610 return true;
49611 }
49612 return false;
49613 };
49614 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
49615 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
49616 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
49617 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
49618 LHS = DAG.getBitcast(SrcVT, LHS);
49619 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
49620 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
49621 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
49622 Res = DAG.getBitcast(ShufVT, Res);
49623 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
49624 return DAG.getBitcast(VT, Res);
49625 }
49626 }
49627 }
49628
49629 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
49630 if (VT.is256BitVector() && Subtarget.hasInt256()) {
49631 SmallVector<int> Mask0, Mask1;
49632 SmallVector<SDValue> Ops0, Ops1;
49633 SmallVector<int, 2> ScaledMask0, ScaledMask1;
49634 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
49635 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
49636 !Ops0.empty() && !Ops1.empty() &&
49637 all_of(Ops0,
49638 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49639 all_of(Ops1,
49640 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49641 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
49642 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
49643 SDValue Op00 = peekThroughBitcasts(Ops0.front());
49644 SDValue Op10 = peekThroughBitcasts(Ops1.front());
49645 SDValue Op01 = peekThroughBitcasts(Ops0.back());
49646 SDValue Op11 = peekThroughBitcasts(Ops1.back());
49647 if ((Op00 == Op11) && (Op01 == Op10)) {
49648 std::swap(Op10, Op11);
49649 ShuffleVectorSDNode::commuteMask(ScaledMask1);
49650 }
49651 if ((Op00 == Op10) && (Op01 == Op11)) {
49652 const int Map[4] = {0, 2, 1, 3};
49653 SmallVector<int, 4> ShuffleMask(
49654 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
49655 Map[ScaledMask1[1]]});
49656 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
49657 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
49658 DAG.getBitcast(SrcVT, Op01));
49659 Res = DAG.getBitcast(ShufVT, Res);
49660 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
49661 return DAG.getBitcast(VT, Res);
49662 }
49663 }
49664 }
49665
49666 return SDValue();
49667}
49668
49669static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
49670 TargetLowering::DAGCombinerInfo &DCI,
49671 const X86Subtarget &Subtarget) {
49672 unsigned Opcode = N->getOpcode();
49673 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
49674 "Unexpected pack opcode");
49675
49676 EVT VT = N->getValueType(0);
49677 SDValue N0 = N->getOperand(0);
49678 SDValue N1 = N->getOperand(1);
49679 unsigned NumDstElts = VT.getVectorNumElements();
49680 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
49681 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
49682 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
49683 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
49684 "Unexpected PACKSS/PACKUS input type");
49685
49686 bool IsSigned = (X86ISD::PACKSS == Opcode);
49687
49688 // Constant Folding.
49689 APInt UndefElts0, UndefElts1;
49690 SmallVector<APInt, 32> EltBits0, EltBits1;
49691 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
49692 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
49693 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
49694 /*AllowWholeUndefs*/ true,
49695 /*AllowPartialUndefs*/ true) &&
49696 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
49697 /*AllowWholeUndefs*/ true,
49698 /*AllowPartialUndefs*/ true)) {
49699 unsigned NumLanes = VT.getSizeInBits() / 128;
49700 unsigned NumSrcElts = NumDstElts / 2;
49701 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
49702 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
49703
49704 APInt Undefs(NumDstElts, 0);
49705 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
49706 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
49707 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
49708 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
49709 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
49710 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
49711
49712 if (UndefElts[SrcIdx]) {
49713 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
49714 continue;
49715 }
49716
49717 APInt &Val = EltBits[SrcIdx];
49718 if (IsSigned) {
49719 // PACKSS: Truncate signed value with signed saturation.
49720 // Source values less than dst minint are saturated to minint.
49721 // Source values greater than dst maxint are saturated to maxint.
49722 Val = Val.truncSSat(DstBitsPerElt);
49723 } else {
49724 // PACKUS: Truncate signed value with unsigned saturation.
49725 // Source values less than zero are saturated to zero.
49726 // Source values greater than dst maxuint are saturated to maxuint.
49727 // NOTE: This is different from APInt::truncUSat.
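 // E.g. when packing i16 -> i8: 0xFF80 (-128) packs to 0x00 (where
 // APInt::truncUSat on the raw bits would give 0xFF), and 0x0150 (336)
 // packs to 0xFF.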
49728 if (Val.isIntN(DstBitsPerElt))
49729 Val = Val.trunc(DstBitsPerElt);
49730 else if (Val.isNegative())
49731 Val = APInt::getZero(DstBitsPerElt);
49732 else
49733 Val = APInt::getAllOnes(DstBitsPerElt);
49734 }
49735 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
49736 }
49737 }
49738
49739 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
49740 }
49741
49742 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
49743 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49744 return V;
49745
49746 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
49747 // Currently limit this to allsignbits cases only.
49748 if (IsSigned &&
49749 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
49750 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
49751 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
49752 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
49753 if (Not0 && Not1) {
49754 SDLoc DL(N);
49755 MVT SrcVT = N0.getSimpleValueType();
49756 SDValue Pack =
49757 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
49758 DAG.getBitcast(SrcVT, Not1));
49759 return DAG.getNOT(DL, Pack, VT);
49760 }
49761 }
49762
49763 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
49764 // truncate to create a larger truncate.
49765 if (Subtarget.hasAVX512() &&
49766 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
49767 N0.getOperand(0).getValueType() == MVT::v8i32) {
49768 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
49769 (!IsSigned &&
49770 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
49771 if (Subtarget.hasVLX())
49772 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
49773
49774 // Widen input to v16i32 so we can truncate that.
49775 SDLoc dl(N);
49776 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
49777 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
49778 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
49779 }
49780 }
49781
49782 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
49783 if (VT.is128BitVector()) {
49784 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49785 SDValue Src0, Src1;
49786 if (N0.getOpcode() == ExtOpc &&
49787 N0.getOperand(0).getValueType().is64BitVector() &&
49788 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49789 Src0 = N0.getOperand(0);
49790 }
49791 if (N1.getOpcode() == ExtOpc &&
49792 N1.getOperand(0).getValueType().is64BitVector() &&
49793 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49794 Src1 = N1.getOperand(0);
49795 }
49796 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
49797 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
49798 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
49799 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
49800 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
49801 }
49802
49803 // Try again with pack(*_extend_vector_inreg, undef).
49804 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
49805 : ISD::ZERO_EXTEND_VECTOR_INREG;
49806 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
49807 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
49808 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
49809 DAG);
49810 }
49811
49812 // Attempt to combine as shuffle.
49813 SDValue Op(N, 0);
49814 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49815 return Res;
49816
49817 return SDValue();
49818}
49819
49820static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
49821 TargetLowering::DAGCombinerInfo &DCI,
49822 const X86Subtarget &Subtarget) {
49823 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
49824 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
49825 "Unexpected horizontal add/sub opcode");
49826
49827 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
49828 MVT VT = N->getSimpleValueType(0);
49829 SDValue LHS = N->getOperand(0);
49830 SDValue RHS = N->getOperand(1);
49831
49832 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
49833 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
49834 LHS.getOpcode() == RHS.getOpcode() &&
49835 LHS.getValueType() == RHS.getValueType() &&
49836 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
49837 SDValue LHS0 = LHS.getOperand(0);
49838 SDValue LHS1 = LHS.getOperand(1);
49839 SDValue RHS0 = RHS.getOperand(0);
49840 SDValue RHS1 = RHS.getOperand(1);
49841 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
49842 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
49843 SDLoc DL(N);
49844 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
49845 LHS0.isUndef() ? LHS1 : LHS0,
49846 RHS0.isUndef() ? RHS1 : RHS0);
49847 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
49848 Res = DAG.getBitcast(ShufVT, Res);
49849 SDValue NewLHS =
49850 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49851 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
49852 SDValue NewRHS =
49853 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49854 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
49855 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
49856 DAG.getBitcast(VT, NewRHS));
49857 }
49858 }
49859 }
49860
49861 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49862 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49863 return V;
49864
49865 return SDValue();
49866}
49867
49868static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
49869 TargetLowering::DAGCombinerInfo &DCI,
49870 const X86Subtarget &Subtarget) {
49871 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49872 X86ISD::VSRL == N->getOpcode()) &&
49873 "Unexpected shift opcode");
49874 EVT VT = N->getValueType(0);
49875 SDValue N0 = N->getOperand(0);
49876 SDValue N1 = N->getOperand(1);
49877
49878 // Shift zero -> zero.
49879 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49880 return DAG.getConstant(0, SDLoc(N), VT);
49881
49882 // Detect constant shift amounts.
49883 APInt UndefElts;
49884 SmallVector<APInt, 32> EltBits;
49885 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
49886 /*AllowWholeUndefs*/ true,
49887 /*AllowPartialUndefs*/ false)) {
49888 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49889 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
49890 EltBits[0].getZExtValue(), DAG);
49891 }
49892
49893 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49894 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
49895 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
49896 return SDValue(N, 0);
49897
49898 return SDValue();
49899}
49900
49901static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
49902 TargetLowering::DAGCombinerInfo &DCI,
49903 const X86Subtarget &Subtarget) {
49904 unsigned Opcode = N->getOpcode();
49905 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
49906 X86ISD::VSRLI == Opcode) &&
49907 "Unexpected shift opcode");
49908 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49909 EVT VT = N->getValueType(0);
49910 SDValue N0 = N->getOperand(0);
49911 SDValue N1 = N->getOperand(1);
49912 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49913 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49914 "Unexpected value type");
49915 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49916
49917 // (shift undef, X) -> 0
49918 if (N0.isUndef())
49919 return DAG.getConstant(0, SDLoc(N), VT);
49920
49921 // Out of range logical bit shifts are guaranteed to be zero.
49922 // Out of range arithmetic bit shifts splat the sign bit.
49923 unsigned ShiftVal = N->getConstantOperandVal(1);
49924 if (ShiftVal >= NumBitsPerElt) {
49925 if (LogicalShift)
49926 return DAG.getConstant(0, SDLoc(N), VT);
49927 ShiftVal = NumBitsPerElt - 1;
49928 }
49929
49930 // (shift X, 0) -> X
49931 if (!ShiftVal)
49932 return N0;
49933
49934 // (shift 0, C) -> 0
49935 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49936 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49937 // result are all zeros, not undef.
49938 return DAG.getConstant(0, SDLoc(N), VT);
49939
49940 // (VSRAI -1, C) -> -1
49941 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49942 // N0 is all ones or undef. We guarantee that the bits shifted into the
49943 // result are all ones, not undef.
49944 return DAG.getAllOnesConstant(SDLoc(N), VT);
49945
49946 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49947 unsigned NewShiftVal = Amt0 + Amt1;
49948 if (NewShiftVal >= NumBitsPerElt) {
49949 // Out of range logical bit shifts are guaranteed to be zero.
49950 // Out of range arithmetic bit shifts splat the sign bit.
49951 if (LogicalShift)
49952 return DAG.getConstant(0, SDLoc(N), VT);
49953 NewShiftVal = NumBitsPerElt - 1;
49954 }
49955 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49956 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49957 };
49958
49959 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
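 // E.g. (vpsllw $3, (vpsllw $2, X)) -> vpsllw $5, X.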
49960 if (Opcode == N0.getOpcode())
49961 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49962
49963 // (shl (add X, X), C) -> (shl X, (C + 1))
49964 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49965 N0.getOperand(0) == N0.getOperand(1))
49966 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49967
49968 // We can decode 'whole byte' logical bit shifts as shuffles.
49969 if (LogicalShift && (ShiftVal % 8) == 0) {
49970 SDValue Op(N, 0);
49971 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49972 return Res;
49973 }
49974
49975 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
49976 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
49977 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
49978 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
49979 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
49980 N0.getOpcode() == X86ISD::PSHUFD &&
49981 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
49982 N0->hasOneUse()) {
49983 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
49984 if (BC.getOpcode() == X86ISD::VSHLI &&
49985 BC.getScalarValueSizeInBits() == 64 &&
49986 BC.getConstantOperandVal(1) == 63) {
49987 SDLoc DL(N);
49988 SDValue Src = BC.getOperand(0);
49989 Src = DAG.getBitcast(VT, Src);
49990 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
49991 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
49992 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
49993 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
49994 return Src;
49995 }
49996 }
49997
49998 auto TryConstantFold = [&](SDValue V) {
49999 APInt UndefElts;
50000 SmallVector<APInt, 32> EltBits;
50001 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50002 /*AllowWholeUndefs*/ true,
50003 /*AllowPartialUndefs*/ true))
50004 return SDValue();
50005 assert(EltBits.size() == VT.getVectorNumElements() &&
50006 "Unexpected shift value type");
50007 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50008 // created an undef input due to no input bits being demanded, but user
50009 // still expects 0 in other bits.
50010 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50011 APInt &Elt = EltBits[i];
50012 if (UndefElts[i])
50013 Elt = 0;
50014 else if (X86ISD::VSHLI == Opcode)
50015 Elt <<= ShiftVal;
50016 else if (X86ISD::VSRAI == Opcode)
50017 Elt.ashrInPlace(ShiftVal);
50018 else
50019 Elt.lshrInPlace(ShiftVal);
50020 }
50021 // Reset undef elements since they were zeroed above.
50022 UndefElts = 0;
50023 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50024 };
50025
50026 // Constant Folding.
50027 if (N->isOnlyUserOf(N0.getNode())) {
50028 if (SDValue C = TryConstantFold(N0))
50029 return C;
50030
50031 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50032 // Don't break NOT patterns.
50033 SDValue BC = peekThroughOneUseBitcasts(N0);
50034 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50035 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50036 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50037 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50038 SDLoc DL(N);
50039 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50040 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50041 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50042 }
50043 }
50044 }
50045
50046 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50047 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50048 DCI))
50049 return SDValue(N, 0);
50050
50051 return SDValue();
50052}
50053
50054static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50055 TargetLowering::DAGCombinerInfo &DCI,
50056 const X86Subtarget &Subtarget) {
50057 EVT VT = N->getValueType(0);
50058 unsigned Opcode = N->getOpcode();
50059 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50060 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50061 Opcode == ISD::INSERT_VECTOR_ELT) &&
50062 "Unexpected vector insertion");
50063
50064 SDValue Vec = N->getOperand(0);
50065 SDValue Scl = N->getOperand(1);
50066 SDValue Idx = N->getOperand(2);
50067
50068 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50069 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50070 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50071
50072 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50073 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50074 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50075 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50076 APInt::getAllOnes(NumBitsPerElt), DCI))
50077 return SDValue(N, 0);
50078 }
50079
50080 // Attempt to combine insertion patterns to a shuffle.
50081 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50082 SDValue Op(N, 0);
50083 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50084 return Res;
50085 }
50086
50087 return SDValue();
50088}
50089
50090/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50091/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50092/// OR -> CMPNEQSS.
50093static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50094 TargetLowering::DAGCombinerInfo &DCI,
50095 const X86Subtarget &Subtarget) {
50096 unsigned opcode;
50097
50098 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50099 // we're requiring SSE2 for both.
50100 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50101 SDValue N0 = N->getOperand(0);
50102 SDValue N1 = N->getOperand(1);
50103 SDValue CMP0 = N0.getOperand(1);
50104 SDValue CMP1 = N1.getOperand(1);
50105 SDLoc DL(N);
50106
50107 // The SETCCs should both refer to the same CMP.
50108 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50109 return SDValue();
50110
50111 SDValue CMP00 = CMP0->getOperand(0);
50112 SDValue CMP01 = CMP0->getOperand(1);
50113 EVT VT = CMP00.getValueType();
50114
50115 if (VT == MVT::f32 || VT == MVT::f64 ||
50116 (VT == MVT::f16 && Subtarget.hasFP16())) {
50117 bool ExpectingFlags = false;
50118 // Check for any users that want flags:
50119 for (const SDNode *U : N->users()) {
50120 if (ExpectingFlags)
50121 break;
50122
50123 switch (U->getOpcode()) {
50124 default:
50125 case ISD::BR_CC:
50126 case ISD::BRCOND:
50127 case ISD::SELECT:
50128 ExpectingFlags = true;
50129 break;
50130 case ISD::CopyToReg:
50131 case ISD::SIGN_EXTEND:
50132 case ISD::ZERO_EXTEND:
50133 case ISD::ANY_EXTEND:
50134 break;
50135 }
50136 }
50137
50138 if (!ExpectingFlags) {
50139 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50140 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50141
50142 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50143 X86::CondCode tmp = cc0;
50144 cc0 = cc1;
50145 cc1 = tmp;
50146 }
50147
50148 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50149 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50150 // FIXME: need symbolic constants for these magic numbers.
50151 // See X86ATTInstPrinter.cpp:printSSECC().
50152 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50153 if (Subtarget.hasAVX512()) {
50154 SDValue FSetCC =
50155 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50156 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50157 // Need to fill with zeros to ensure the bitcast will produce zeroes
50158 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50159 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50160 DAG.getConstant(0, DL, MVT::v16i1),
50161 FSetCC, DAG.getVectorIdxConstant(0, DL));
50162 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50163 N->getSimpleValueType(0));
50164 }
50165 SDValue OnesOrZeroesF =
50166 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50167 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50168
50169 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50170 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50171
50172 if (is64BitFP && !Subtarget.is64Bit()) {
50173 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50174 // 64-bit integer, since that's not a legal type. Since
50175 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50176 // bits, but can do this little dance to extract the lowest 32 bits
50177 // and work with those going forward.
50178 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50179 MVT::v2f64, OnesOrZeroesF);
50180 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50181 OnesOrZeroesF =
50182 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50183 DAG.getVectorIdxConstant(0, DL));
50184 IntVT = MVT::i32;
50185 }
50186
50187 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50188 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50189 DAG.getConstant(1, DL, IntVT));
50190 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50191 ANDed);
50192 return OneBitOfTruth;
50193 }
50194 }
50195 }
50196 }
50197 return SDValue();
50198}
50199
50200/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50201static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
50202 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50203
50204 MVT VT = N->getSimpleValueType(0);
50205 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50206 return SDValue();
50207
50208 SDValue X, Y;
50209 SDValue N0 = N->getOperand(0);
50210 SDValue N1 = N->getOperand(1);
50211
50212 if (SDValue Not = IsNOT(N0, DAG)) {
50213 X = Not;
50214 Y = N1;
50215 } else if (SDValue Not = IsNOT(N1, DAG)) {
50216 X = Not;
50217 Y = N0;
50218 } else
50219 return SDValue();
50220
50221 X = DAG.getBitcast(VT, X);
50222 Y = DAG.getBitcast(VT, Y);
50223 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
50224}
50225
50226/// Try to fold:
50227/// and (vector_shuffle<Z,...,Z>
50228/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50229/// ->
50230/// andnp (vector_shuffle<Z,...,Z>
50231/// (insert_vector_elt undef, X, Z), undef), Y
50232static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50233 const X86Subtarget &Subtarget) {
50234 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50235
50236 EVT VT = N->getValueType(0);
50237 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50238 // value and require extra moves.
50239 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50240 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50241 return SDValue();
50242
50243 auto GetNot = [&DAG](SDValue V) {
50244 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50245 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50246 // end-users are ISD::AND including cases
50247 // (and(extract_vector_element(SVN), Y)).
50248 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50249 !SVN->getOperand(1).isUndef()) {
50250 return SDValue();
50251 }
50252 SDValue IVEN = SVN->getOperand(0);
50253 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50254 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50255 return SDValue();
50256 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50257 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50258 return SDValue();
50259 SDValue Src = IVEN.getOperand(1);
50260 if (SDValue Not = IsNOT(Src, DAG)) {
50261 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50262 SDValue NotIVEN =
50263 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50264 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50265 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50266 SVN->getOperand(1), SVN->getMask());
50267 }
50268 return SDValue();
50269 };
50270
50271 SDValue X, Y;
50272 SDValue N0 = N->getOperand(0);
50273 SDValue N1 = N->getOperand(1);
50274 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50275
50276 if (SDValue Not = GetNot(N0)) {
50277 X = Not;
50278 Y = N1;
50279 } else if (SDValue Not = GetNot(N1)) {
50280 X = Not;
50281 Y = N0;
50282 } else
50283 return SDValue();
50284
50285 X = DAG.getBitcast(VT, X);
50286 Y = DAG.getBitcast(VT, Y);
50287 SDLoc DL(N);
50288
50289 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50290 // AVX2.
50291 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50292 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50293 SDValue LoX, HiX;
50294 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50295 SDValue LoY, HiY;
50296 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50297 EVT SplitVT = LoX.getValueType();
50298 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50299 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50300 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50301 }
50302
50303 if (TLI.isTypeLegal(VT))
50304 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50305
50306 return SDValue();
50307}
50308
50309// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50310// logical operations, like in the example below.
50311// or (and (truncate x, truncate y)),
50312// (xor (truncate z, build_vector (constants)))
50313// Given a target type \p VT, we generate
50314// or (and x, y), (xor z, zext(build_vector (constants)))
50315// given x, y and z are of type \p VT. We can do so, if operands are either
50316// truncates from VT types, the second operand is a vector of constants or can
50317// be recursively promoted.
50318static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
50319 SelectionDAG &DAG, unsigned Depth) {
50320 // Limit recursion to avoid excessive compile times.
50321 if (Depth >= SelectionDAG::MaxRecursionDepth)
50322 return SDValue();
50323
50324 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
50325 return SDValue();
50326
50327 SDValue N0 = N.getOperand(0);
50328 SDValue N1 = N.getOperand(1);
50329
50330 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50331 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
50332 return SDValue();
50333
50334 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
50335 N0 = NN0;
50336 else {
50337 // The left side has to be a trunc.
50338 if (N0.getOpcode() != ISD::TRUNCATE)
50339 return SDValue();
50340
50341 // The type of the truncated inputs.
50342 if (N0.getOperand(0).getValueType() != VT)
50343 return SDValue();
50344
50345 N0 = N0.getOperand(0);
50346 }
50347
50348 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
50349 N1 = NN1;
50350 else {
50351 // The right side has to be a 'trunc' or a (foldable) constant.
50352 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
50353 N1.getOperand(0).getValueType() == VT;
50354 if (RHSTrunc)
50355 N1 = N1.getOperand(0);
50356 else if (SDValue Cst =
50357 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
50358 N1 = Cst;
50359 else
50360 return SDValue();
50361 }
50362
50363 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
50364}
50365
50366// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
50367// register. In most cases we actually compare or select YMM-sized registers
50368// and mixing the two types creates horrible code. This method optimizes
50369// some of the transition sequences.
50370// Even with AVX-512 this is still useful for removing casts around logical
50371// operations on vXi1 mask types.
50372static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
50373 SelectionDAG &DAG,
50374 const X86Subtarget &Subtarget) {
50375 EVT VT = N.getValueType();
50376 assert(VT.isVector() && "Expected vector type");
50377 assert((N.getOpcode() == ISD::ANY_EXTEND ||
50378 N.getOpcode() == ISD::ZERO_EXTEND ||
50379 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
50380
50381 SDValue Narrow = N.getOperand(0);
50382 EVT NarrowVT = Narrow.getValueType();
50383
50384 // Generate the wide operation.
50385 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
50386 if (!Op)
50387 return SDValue();
50388 switch (N.getOpcode()) {
50389 default: llvm_unreachable("Unexpected opcode");
50390 case ISD::ANY_EXTEND:
50391 return Op;
50392 case ISD::ZERO_EXTEND:
50393 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
50394 case ISD::SIGN_EXTEND:
50395 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
50396 Op, DAG.getValueType(NarrowVT));
50397 }
50398}
50399
50400static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
50401 unsigned FPOpcode;
50402 switch (Opcode) {
50403 // clang-format off
50404 default: llvm_unreachable("Unexpected input node for FP logic conversion");
50405 case ISD::AND: FPOpcode = X86ISD::FAND; break;
50406 case ISD::OR: FPOpcode = X86ISD::FOR; break;
50407 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
50408 // clang-format on
50409 }
50410 return FPOpcode;
50411}
50412
50413/// If both input operands of a logic op are being cast from floating-point
50414/// types or FP compares, try to convert this into a floating-point logic node
50415/// to avoid unnecessary moves from SSE to integer registers.
50416static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
50417 SDValue N0, SDValue N1,
50418 SelectionDAG &DAG,
50419 TargetLowering::DAGCombinerInfo &DCI,
50420 const X86Subtarget &Subtarget) {
50421 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50422 "Unexpected bit opcode");
50423
50424 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
50425 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
50426 return SDValue();
50427
50428 SDValue N00 = N0.getOperand(0);
50429 SDValue N10 = N1.getOperand(0);
50430 EVT N00Type = N00.getValueType();
50431 EVT N10Type = N10.getValueType();
50432
50433 // Ensure that both types are the same and are legal scalar fp types.
50434 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
50435 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
50436 (Subtarget.hasFP16() && N00Type == MVT::f16)))
50437 return SDValue();
50438
50439 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
50440 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
50441 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
50442 return DAG.getBitcast(VT, FPLogic);
50443 }
50444
50445 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
50446 !N1.hasOneUse())
50447 return SDValue();
50448
50449 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
50450 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
50451
50452 // The vector ISA for FP predicates is incomplete before AVX, so converting
50453 // COMIS* to CMPS* may not be a win before AVX.
50454 if (!Subtarget.hasAVX() &&
50455 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
50456 return SDValue();
50457
50458 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
50459 // and vector logic:
50460 // logic (setcc N00, N01), (setcc N10, N11) -->
50461 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
50462 unsigned NumElts = 128 / N00Type.getSizeInBits();
50463 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
50464 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
50465 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
50466 SDValue N01 = N0.getOperand(1);
50467 SDValue N11 = N1.getOperand(1);
50468 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
50469 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
50470 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
50471 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
50472 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
50473 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
50474 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
50475 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
50476}
50477
50478// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
50479// to reduce XMM->GPR traffic.
50480static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
50481 SDValue N1, SelectionDAG &DAG) {
50482 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50483 "Unexpected bit opcode");
50484
50485 // Both operands must be single use MOVMSK.
50486 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
50487 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
50488 return SDValue();
50489
50490 SDValue Vec0 = N0.getOperand(0);
50491 SDValue Vec1 = N1.getOperand(0);
50492 EVT VecVT0 = Vec0.getValueType();
50493 EVT VecVT1 = Vec1.getValueType();
50494
50495 // Both MOVMSK operands must be from vectors of the same size and same element
50496 // size, but it's OK for an fp/int diff.
50497 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
50498 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
50499 return SDValue();
50500
50501 unsigned VecOpc =
50502 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
50503 SDValue Result =
50504 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
50505 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
50506}
50507
50508// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
50509// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
50510// handles in InstCombine.
50511static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
50512 SDValue N0, SDValue N1,
50513 SelectionDAG &DAG) {
50514 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50515 "Unexpected bit opcode");
50516
50517 // Both operands must be single use.
50518 if (!N0.hasOneUse() || !N1.hasOneUse())
50519 return SDValue();
50520
50521 // Search for matching shifts.
50522 SDValue BC0 = peekThroughOneUseBitcasts(N0);
50523 SDValue BC1 = peekThroughOneUseBitcasts(N1);
50524
50525 unsigned BCOpc = BC0.getOpcode();
50526 EVT BCVT = BC0.getValueType();
50527 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
50528 return SDValue();
50529
50530 switch (BCOpc) {
50531 case X86ISD::VSHLI:
50532 case X86ISD::VSRLI:
50533 case X86ISD::VSRAI: {
50534 if (BC0.getOperand(1) != BC1.getOperand(1))
50535 return SDValue();
50536 SDValue BitOp =
50537 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
50538 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
50539 return DAG.getBitcast(VT, Shift);
50540 }
50541 }
50542
50543 return SDValue();
50544}
50545
50546// Attempt to fold:
50547// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
50548 // TODO: Add PACKUS handling.
50549static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
50550 SDValue N0, SDValue N1, SelectionDAG &DAG) {
50551 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50552 "Unexpected bit opcode");
50553
50554 // Both operands must be single use.
50555 if (!N0.hasOneUse() || !N1.hasOneUse())
50556 return SDValue();
50557
50558 // Search for matching packs.
50559 N0 = peekThroughOneUseBitcasts(N0);
50560 N1 = peekThroughOneUseBitcasts(N1);
50561
50562 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
50563 return SDValue();
50564
50565 MVT DstVT = N0.getSimpleValueType();
50566 if (DstVT != N1.getSimpleValueType())
50567 return SDValue();
50568
50569 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
50570 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
50571
50572 // Limit to allsignbits packing.
50573 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
50574 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
50575 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
50576 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
50577 return SDValue();
50578
50579 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
50580 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
50581 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
50582}
50583
50584/// If this is a zero/all-bits result that is bitwise-anded with a low bits
50585/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
50586/// with a shift-right to eliminate loading the vector constant mask value.
50587static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
50588 const X86Subtarget &Subtarget) {
50589 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
50590 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
50591 EVT VT = Op0.getValueType();
50592 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
50593 return SDValue();
50594
50595 // Try to convert an "is positive" signbit masking operation into arithmetic
50596 // shift and "andn". This saves a materialization of a -1 vector constant.
50597 // The "is negative" variant should be handled more generally because it only
50598 // requires "and" rather than "andn":
50599 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
50600 //
50601 // This is limited to the original type to avoid producing even more bitcasts.
50602 // If the bitcasts can't be eliminated, then it is unlikely that this fold
50603 // will be profitable.
50604 if (N->getValueType(0) == VT &&
50605 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
50606 SDValue X, Y;
50607 if (Op1.getOpcode() == X86ISD::PCMPGT &&
50608 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
50609 X = Op1.getOperand(0);
50610 Y = Op0;
50611 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
50612 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
50613 X = Op0.getOperand(0);
50614 Y = Op1;
50615 }
50616 if (X && Y) {
50617 SDLoc DL(N);
50618 SDValue Sra =
50619 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
50620 VT.getScalarSizeInBits() - 1, DAG);
50621 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
50622 }
50623 }
50624
50625 APInt SplatVal;
50626 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
50627 return SDValue();
50628
50629 // Don't prevent creation of ANDN.
50630 if (isBitwiseNot(Op0))
50631 return SDValue();
50632
50633 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
50634 return SDValue();
50635
50636 unsigned EltBitWidth = VT.getScalarSizeInBits();
50637 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
50638 return SDValue();
50639
50640 SDLoc DL(N);
50641 unsigned ShiftVal = SplatVal.countr_one();
50642 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
50643 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
50644 return DAG.getBitcast(N->getValueType(0), Shift);
50645}
50646
50647// Get the index node from the lowered DAG of a GEP IR instruction with one
50648// indexing dimension.
50649static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
50650 if (Ld->isIndexed())
50651 return SDValue();
50652
50653 SDValue Base = Ld->getBasePtr();
50654 if (Base.getOpcode() != ISD::ADD)
50655 return SDValue();
50656
50657 SDValue ShiftedIndex = Base.getOperand(0);
50658 if (ShiftedIndex.getOpcode() != ISD::SHL)
50659 return SDValue();
50660
50661 return ShiftedIndex.getOperand(0);
50662}
50663
50664static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
50665 return Subtarget.hasBMI2() &&
50666 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
50667}
50668
50669/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
50670/// This undoes the inverse fold performed in InstCombine
50671static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, SelectionDAG &DAG) {
50672
50673 using namespace llvm::SDPatternMatch;
50674 MVT VT = N->getSimpleValueType(0);
50675 SDLoc DL(N);
50676 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50677 if (!TLI.hasAndNot(SDValue(N, 0)))
50678 return SDValue();
50679
50680 SDValue X, Y, Z;
50681 if (sd_match(N, m_And(m_Value(X),
50682 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
50683 // Don't fold if Y or Z are constants to prevent infinite loops.
50684 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
50685 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
50686 return DAG.getNode(
50687 ISD::AND, DL, VT, X,
50688 DAG.getNOT(
50689 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
50690 }
50691
50692 return SDValue();
50693}
50694
50695// This function recognizes cases where the X86 bzhi instruction can replace an
50696// 'and-load' sequence.
50697// In the case of loading an integer value from an array of constants defined
50698// as follows:
50699//
50700// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
50701//
50702// then applying a bitwise and on the result with another input.
50703// It's equivalent to performing bzhi (zero high bits) on the input, with the
50704// same index of the load.
50705static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
50706 const X86Subtarget &Subtarget) {
50707 MVT VT = Node->getSimpleValueType(0);
50708 SDLoc dl(Node);
50709
50710 // Check if subtarget has BZHI instruction for the node's type
50711 if (!hasBZHI(Subtarget, VT))
50712 return SDValue();
50713
50714 // Try matching the pattern for both operands.
50715 for (unsigned i = 0; i < 2; i++) {
50716 // continue if the operand is not a load instruction
50717 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
50718 if (!Ld)
50719 continue;
50720 const Value *MemOp = Ld->getMemOperand()->getValue();
50721 if (!MemOp)
50722 continue;
50723 // Get the Node which indexes into the array.
50724 SDValue Index = getIndexFromUnindexedLoad(Ld);
50725 if (!Index)
50726 continue;
50727
50728 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
50729 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
50730 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
50731 Constant *Init = GV->getInitializer();
50732 Type *Ty = Init->getType();
50733 if (!isa<ConstantDataArray>(Init) ||
50734 !Ty->getArrayElementType()->isIntegerTy() ||
50735 Ty->getArrayElementType()->getScalarSizeInBits() !=
50736 VT.getSizeInBits() ||
50737 Ty->getArrayNumElements() >
50738 Ty->getArrayElementType()->getScalarSizeInBits())
50739 continue;
50740
50741 // Check if the array's constant elements are suitable to our case.
50742 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
50743 bool ConstantsMatch = true;
50744 for (uint64_t j = 0; j < ArrayElementCount; j++) {
50745 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
50746 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
50747 ConstantsMatch = false;
50748 break;
50749 }
50750 }
50751 if (!ConstantsMatch)
50752 continue;
50753
50754 // Do the transformation (For 32-bit type):
50755 // -> (and (load arr[idx]), inp)
50756 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
50757 // that will be replaced with one bzhi instruction.
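 // E.g. for idx == 5 the load yields arr[5] == 0x1F, so
 // (inp & arr[5]) == (inp & (0xFFFFFFFF >> (32 - 5))) == bzhi(inp, 5).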
50758 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
50759 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
50760
50761 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
50762 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
50763 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
50764
50765 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
50766 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
50767 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
50768 }
50769 }
50770 }
50771 }
50772 return SDValue();
50773}
50774
50775// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
50776// Where C is a mask containing the same number of bits as the setcc and
50777// where the setcc will freely 0 upper bits of k-register. We can replace the
50778// undef in the concat with 0s and remove the AND. This mainly helps with
50779// v2i1/v4i1 setcc being casted to scalar.
50780static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
50781 const X86Subtarget &Subtarget) {
50782 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
50783
50784 EVT VT = N->getValueType(0);
50785
50786 // Make sure this is an AND with constant. We will check the value of the
50787 // constant later.
50788 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
50789 if (!C1)
50790 return SDValue();
50791
50792 // This is implied by the ConstantSDNode.
50793 assert(!VT.isVector() && "Expected scalar VT!");
50794
50795 SDValue Src = N->getOperand(0);
50796 if (!Src.hasOneUse())
50797 return SDValue();
50798
50799 // (Optionally) peek through any_extend().
50800 if (Src.getOpcode() == ISD::ANY_EXTEND) {
50801 if (!Src.getOperand(0).hasOneUse())
50802 return SDValue();
50803 Src = Src.getOperand(0);
50804 }
50805
50806 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
50807 return SDValue();
50808
50809 Src = Src.getOperand(0);
50810 EVT SrcVT = Src.getValueType();
50811
50812 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50813 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
50814 !TLI.isTypeLegal(SrcVT))
50815 return SDValue();
50816
50817 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
50818 return SDValue();
50819
50820 // We only care about the first subvector of the concat, we expect the
50821 // other subvectors to be ignored due to the AND if we make the change.
50822 SDValue SubVec = Src.getOperand(0);
50823 EVT SubVecVT = SubVec.getValueType();
50824
50825 // The RHS of the AND should be a mask with as many bits as SubVec.
50826 if (!TLI.isTypeLegal(SubVecVT) ||
50827 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
50828 return SDValue();
50829
50830 // First subvector should be a setcc with a legal result type or a
50831 // AND containing at least one setcc with a legal result type.
50832 auto IsLegalSetCC = [&](SDValue V) {
50833 if (V.getOpcode() != ISD::SETCC)
50834 return false;
50835 EVT SetccVT = V.getOperand(0).getValueType();
50836 if (!TLI.isTypeLegal(SetccVT) ||
50837 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
50838 return false;
50839 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
50840 return false;
50841 return true;
50842 };
50843 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
50844 (IsLegalSetCC(SubVec.getOperand(0)) ||
50845 IsLegalSetCC(SubVec.getOperand(1))))))
50846 return SDValue();
50847
50848 // We passed all the checks. Rebuild the concat_vectors with zeroes
50849 // and cast it back to VT.
50850 SDLoc dl(N);
50851 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
50852 DAG.getConstant(0, dl, SubVecVT));
50853 Ops[0] = SubVec;
50854 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
50855 Ops);
50856 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
50857 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
50858}
50859
50860static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
50861 SDValue OpMustEq, SDValue Op, unsigned Depth) {
50862 // We don't want to go crazy with the recursion here. This isn't a super
50863 // important optimization.
50864 static constexpr unsigned kMaxDepth = 2;
50865
50866 // Only do this re-ordering if op has one use.
50867 if (!Op.hasOneUse())
50868 return SDValue();
50869
50870 SDLoc DL(Op);
50871 // If we hit another associative op, recurse further.
50872 if (Op.getOpcode() == Opc) {
50873 // Done recursing.
50874 if (Depth++ >= kMaxDepth)
50875 return SDValue();
50876
50877 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50878 if (SDValue R =
50879 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
50880 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
50881 Op.getOperand(1 - OpIdx));
50882
50883 } else if (Op.getOpcode() == ISD::SUB) {
50884 if (Opc == ISD::AND) {
50885 // BLSI: (and x, (sub 0, x))
50886 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
50887 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50888 }
50889 // Opc must be ISD::AND or ISD::XOR
50890 // BLSR: (and x, (sub x, 1))
50891 // BLSMSK: (xor x, (sub x, 1))
50892 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50893 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50894
50895 } else if (Op.getOpcode() == ISD::ADD) {
50896 // Opc must be ISD::AND or ISD::XOR
50897 // BLSR: (and x, (add x, -1))
50898 // BLSMSK: (xor x, (add x, -1))
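 // E.g. for x = 0b10100: BLSR gives 0b10000 (clear lowest set bit) and
 // BLSMSK gives 0b00111 (mask up to and including the lowest set bit).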
50899 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50900 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50901 }
50902 return SDValue();
50903}
50904
50905static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
50906 const X86Subtarget &Subtarget) {
50907 EVT VT = N->getValueType(0);
50908 // Make sure this node is a candidate for BMI instructions.
50909 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
50910 (VT != MVT::i32 && VT != MVT::i64))
50911 return SDValue();
50912
50913 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
50914
50915 // Try and match LHS and RHS.
50916 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50917 if (SDValue OpMatch =
50918 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
50919 N->getOperand(1 - OpIdx), 0))
50920 return OpMatch;
50921 return SDValue();
50922}
50923
50924static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
50925 SelectionDAG &DAG,
50926 TargetLowering::DAGCombinerInfo &DCI,
50927 const X86Subtarget &ST) {
50928 // cmp(setcc(cc, X), 0)
50929 // brcond ne
50930 // ->
50931 // X
50932 // brcond cc
50933
50934 // sub(setcc(cc, X), 1)
50935 // brcond ne
50936 // ->
50937 // X
50938 // brcond ~cc
50939 //
50940 // if only flag has users
50941
50942 SDValue SetCC = N->getOperand(0);
50943
50944 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
50945 return SDValue();
50946
50947 // Check the only user of flag is `brcond ne`.
50948 SDNode *BrCond = *Flag->user_begin();
50949 if (BrCond->getOpcode() != X86ISD::BRCOND)
50950 return SDValue();
50951 unsigned CondNo = 2;
50952 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
50953 X86::COND_NE)
50954 return SDValue();
50955
50956 SDValue X = SetCC.getOperand(1);
50957 // sub has two results while X only has one. DAG combine assumes the value
50958 // type matches.
50959 if (N->getOpcode() == X86ISD::SUB)
50960 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
50961
50962 SDValue CCN = SetCC.getOperand(0);
50963 X86::CondCode CC =
50964 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
50965 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
50966 // Update CC for the consumer of the flag.
50967 // The old CC is `ne`. Hence, when comparing the result with 0, we are
50968 // checking if the second condition evaluates to true. When comparing the
50969 // result with 1, we are checking if the second condition evaluates to false.
50970 SmallVector<SDValue> Ops(BrCond->op_values());
50971 if (isNullConstant(N->getOperand(1)))
50972 Ops[CondNo] = CCN;
50973 else if (isOneConstant(N->getOperand(1)))
50974 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
50975 else
50976 llvm_unreachable("expect constant 0 or 1");
50977
50978 SDValue NewBrCond =
50979 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
50980 // Avoid self-assign error b/c CC1 can be `e/ne`.
50981 if (BrCond != NewBrCond.getNode())
50982 DCI.CombineTo(BrCond, NewBrCond);
50983 return X;
50984}
50985
50986static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
50987 TargetLowering::DAGCombinerInfo &DCI,
50988 const X86Subtarget &ST) {
50989 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
50990 // ->
50991 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
50992
50993 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
50994 // ->
50995 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
50996 //
50997 // where cflags is determined by cc1.
50998
50999 if (!ST.hasCCMP())
51000 return SDValue();
51001
51002 SDValue SetCC0 = N->getOperand(0);
51003 SDValue SetCC1 = N->getOperand(1);
51004 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51005 SetCC1.getOpcode() != X86ISD::SETCC)
51006 return SDValue();
51007
51008 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51009 SDValue Op = V.getOperand(1);
51010 unsigned Opc = Op.getOpcode();
51011 if (Opc == X86ISD::SUB)
51012 return X86ISD::CCMP;
51013 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51014 return X86ISD::CTEST;
51015 return 0U;
51016 };
51017
51018 unsigned NewOpc = 0;
51019
51020 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51021 // appear on the right.
51022 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51023 std::swap(SetCC0, SetCC1);
51024 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51025 return SDValue();
51026 }
51027
51028 X86::CondCode CC0 =
51029 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51030 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51031 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51032 return SDValue();
51033
51034 bool IsOR = N->getOpcode() == ISD::OR;
51035
51036 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51037 // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51038 // operator is OR. Similar for CC1.
51039 SDValue SrcCC =
51040 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51041 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51042 : SetCC0.getOperand(0);
51043 SDValue CC1N = SetCC1.getOperand(0);
51044 X86::CondCode CC1 =
51045 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51046 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51047 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51048 SDLoc DL(N);
51049 SDValue CFlags = DAG.getTargetConstant(
51050 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51051 SDValue Sub = SetCC1.getOperand(1);
51052
51053 // Replace any uses of the old flag produced by SUB/CMP with the new one
51054 // produced by CCMP/CTEST.
51055 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51056 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51057 {Sub.getOperand(0), Sub.getOperand(1),
51058 CFlags, SrcCC, SetCC0.getOperand(1)})
51059 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51060 {Sub.getOperand(0), Sub.getOperand(0),
51061 CFlags, SrcCC, SetCC0.getOperand(1)});
51062
51063 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51064}
51065
51066 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51067 TargetLowering::DAGCombinerInfo &DCI,
51068 const X86Subtarget &Subtarget) {
51069 SDValue N0 = N->getOperand(0);
51070 SDValue N1 = N->getOperand(1);
51071 EVT VT = N->getValueType(0);
51072 SDLoc dl(N);
51073 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51074
51075 // If this is SSE1 only convert to FAND to avoid scalarization.
51076 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51077 return DAG.getBitcast(MVT::v4i32,
51078 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51079 DAG.getBitcast(MVT::v4f32, N0),
51080 DAG.getBitcast(MVT::v4f32, N1)));
51081 }
51082
51083 // Use a 32-bit and+zext if upper bits known zero.
51084 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51085 APInt HiMask = APInt::getHighBitsSet(64, 32);
51086 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51087 DAG.MaskedValueIsZero(N0, HiMask)) {
51088 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51089 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51090 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51091 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51092 }
51093 }
51094
51095 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51096 // TODO: Support multiple SrcOps.
51097 if (VT == MVT::i1) {
51098 SmallVector<SDValue, 2> SrcOps;
51099 SmallVector<APInt, 2> SrcPartials;
51100 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51101 SrcOps.size() == 1) {
51102 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51103 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51104 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51105 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51106 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51107 if (Mask) {
51108 assert(SrcPartials[0].getBitWidth() == NumElts &&
51109 "Unexpected partial reduction mask");
51110 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51111 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51112 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51113 }
51114 }
51115 }
51116
51117 // InstCombine converts:
51118 // `(-x << C0) & C1`
51119 // to
51120 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51121 // This saves an IR instruction but on x86 the neg/shift version is preferable
51122 // so undo the transform.
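// A hand-worked instance (assuming C0 = 2 and C1 = 0xF0): InstCombine emits
//   (x * 252) & 0xF0        // Pow2_Ceil(0xF0) - (1 << 2) == 256 - 4 == 252
// and the code below recovers ((0 - x) << 2) & 0xF0, since multiplying by 252
// equals multiplying by -4 modulo 256 and the mask ignores bits 8 and above.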
51123
51124 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51125 // TODO: We don't actually need a splat for this, we just need the checks to
51126 // hold for each element.
51127 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51128 /*AllowTruncation*/ false);
51129 ConstantSDNode *N01C =
51130 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51131 /*AllowTruncation*/ false);
51132 if (N1C && N01C) {
51133 const APInt &MulC = N01C->getAPIntValue();
51134 const APInt &AndC = N1C->getAPIntValue();
51135 APInt MulCLowBit = MulC & (-MulC);
51136 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51137 (MulCLowBit + MulC).isPowerOf2()) {
51138 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51139 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51140 assert(MulCLowBitLog != -1 &&
51141 "Isolated lowbit is somehow not a power of 2!");
51142 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51143 DAG.getConstant(MulCLowBitLog, dl, VT));
51144 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51145 }
51146 }
51147 }
51148
51149 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51150 return SetCC;
51151
51152 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51153 return V;
51154
51155 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51156 return R;
51157
51158 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51159 return R;
51160
51161 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51162 return R;
51163
51164 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51165 DAG, DCI, Subtarget))
51166 return FPLogic;
51167
51168 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51169 return R;
51170
51171 if (DCI.isBeforeLegalizeOps())
51172 return SDValue();
51173
51174 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51175 return R;
51176
51177 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
51178 return R;
51179
51180 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
51181 return ShiftRight;
51182
51183 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51184 return R;
51185
51187 return R;
51188
51189 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51190 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51191 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51192 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51193 unsigned Opc0 = N0.getOpcode();
51194 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51195 getTargetConstantFromNode(N0.getOperand(1)) &&
51196 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51197 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51198 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51199 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51200 }
51201 }
51202
51203 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51204 // avoids slow variable shift (moving shift amount to ECX etc.)
51205 if (isOneConstant(N1) && N0->hasOneUse()) {
51206 SDValue Src = N0;
51207 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51208 Src.getOpcode() == ISD::TRUNCATE) &&
51209 Src.getOperand(0)->hasOneUse())
51210 Src = Src.getOperand(0);
51211 bool ContainsNOT = false;
51212 X86::CondCode X86CC = X86::COND_B;
51213 // Peek through AND(NOT(SRL(X,Y)),1).
51214 if (isBitwiseNot(Src)) {
51215 Src = Src.getOperand(0);
51216 X86CC = X86::COND_AE;
51217 ContainsNOT = true;
51218 }
51219 if (Src.getOpcode() == ISD::SRL &&
51220 !isa<ConstantSDNode>(Src.getOperand(1))) {
51221 SDValue BitNo = Src.getOperand(1);
51222 Src = Src.getOperand(0);
51223 // Peek through AND(SRL(NOT(X),Y),1).
51224 if (isBitwiseNot(Src)) {
51225 Src = Src.getOperand(0);
51226 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51227 ContainsNOT = true;
51228 }
51229 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51230 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51231 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51232 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51233 }
51234 }
51235
51236 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51237 // Attempt to recursively combine a bitmask AND with shuffles.
51238 SDValue Op(N, 0);
51239 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51240 return Res;
51241
51242 // If either operand is a constant mask, then only the elements that aren't
51243 // zero are actually demanded by the other operand.
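// e.g. for AND(X, <0,-1,0,-1>) only elements 1 and 3 of X are demanded; the
// remaining lanes of the result are known to be zero regardless of X.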
51244 auto GetDemandedMasks = [&](SDValue Op) {
51245 APInt UndefElts;
51246 SmallVector<APInt> EltBits;
51247 int NumElts = VT.getVectorNumElements();
51248 int EltSizeInBits = VT.getScalarSizeInBits();
51249 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51250 APInt DemandedElts = APInt::getAllOnes(NumElts);
51251 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51252 EltBits)) {
51253 DemandedBits.clearAllBits();
51254 DemandedElts.clearAllBits();
51255 for (int I = 0; I != NumElts; ++I) {
51256 if (UndefElts[I]) {
51257 // We can't assume an undef src element gives an undef dst - the
51258 // other src might be zero.
51259 DemandedBits.setAllBits();
51260 DemandedElts.setBit(I);
51261 } else if (!EltBits[I].isZero()) {
51262 DemandedBits |= EltBits[I];
51263 DemandedElts.setBit(I);
51264 }
51265 }
51266 }
51267 return std::make_pair(DemandedBits, DemandedElts);
51268 };
51269 APInt Bits0, Elts0;
51270 APInt Bits1, Elts1;
51271 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51272 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51273
51274 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51275 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51276 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51277 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
51278 if (N->getOpcode() != ISD::DELETED_NODE)
51279 DCI.AddToWorklist(N);
51280 return SDValue(N, 0);
51281 }
51282
51283 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
51284 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
51285 if (NewN0 || NewN1)
51286 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
51287 NewN1 ? NewN1 : N1);
51288 }
51289
51290 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
51291 if ((VT.getScalarSizeInBits() % 8) == 0 &&
51292 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
51293 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
51294 SDValue BitMask = N1;
51295 SDValue SrcVec = N0.getOperand(0);
51296 EVT SrcVecVT = SrcVec.getValueType();
51297
51298 // Check that the constant bitmask masks whole bytes.
51299 APInt UndefElts;
51300 SmallVector<APInt, 64> EltBits;
51301 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
51302 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
51303 llvm::all_of(EltBits, [](const APInt &M) {
51304 return M.isZero() || M.isAllOnes();
51305 })) {
51306 unsigned NumElts = SrcVecVT.getVectorNumElements();
51307 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
51308 unsigned Idx = N0.getConstantOperandVal(1);
51309
51310 // Create a root shuffle mask from the byte mask and the extracted index.
51311 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
51312 for (unsigned i = 0; i != Scale; ++i) {
51313 if (UndefElts[i])
51314 continue;
51315 int VecIdx = Scale * Idx + i;
51316 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
51317 }
51318
51319 if (SDValue Shuffle = combineX86ShufflesRecursively(
51320 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
51321 X86::MaxShuffleCombineDepth,
51322 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
51323 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
51324 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
51325 N0.getOperand(1));
51326 }
51327 }
51328
51329 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
51330 return R;
51331
51332 return SDValue();
51333}
51334
51335// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
51336 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
51337 SelectionDAG &DAG,
51338 const X86Subtarget &Subtarget) {
51339 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51340
51341 MVT VT = N->getSimpleValueType(0);
51342 unsigned EltSizeInBits = VT.getScalarSizeInBits();
51343 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
51344 return SDValue();
51345
51346 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
51347 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
51348 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
51349 return SDValue();
51350
51351 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
51352 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
51353 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
51354 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
51355 return SDValue();
51356
51357 // Attempt to extract constant byte masks.
51358 APInt UndefElts0, UndefElts1;
51359 SmallVector<APInt, 32> EltBits0, EltBits1;
51360 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
51361 /*AllowWholeUndefs*/ false,
51362 /*AllowPartialUndefs*/ false))
51363 return SDValue();
51364 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
51365 /*AllowWholeUndefs*/ false,
51366 /*AllowPartialUndefs*/ false))
51367 return SDValue();
51368
51369 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
51370 // TODO - add UNDEF elts support.
51371 if (UndefElts0[i] || UndefElts1[i])
51372 return SDValue();
51373 if (EltBits0[i] != ~EltBits1[i])
51374 return SDValue();
51375 }
51376
51377 if (useVPTERNLOG(Subtarget, VT)) {
51378 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
51379 // VPTERNLOG is only available as vXi32/64-bit types.
51380 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
51381 MVT OpVT =
51382 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
51383 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
51384 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
51385 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
51386 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
51387 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
51388 DAG, Subtarget);
51389 return DAG.getBitcast(VT, Res);
51390 }
51391
51392 SDValue X = N->getOperand(0);
51393 SDValue Y =
51394 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
51395 DAG.getBitcast(VT, N1.getOperand(0)));
51396 return DAG.getNode(ISD::OR, DL, VT, X, Y);
51397}
51398
51399// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
51400static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
51401 if (N->getOpcode() != ISD::OR)
51402 return false;
51403
51404 SDValue N0 = N->getOperand(0);
51405 SDValue N1 = N->getOperand(1);
51406
51407 // Canonicalize AND to LHS.
51408 if (N1.getOpcode() == ISD::AND)
51409 std::swap(N0, N1);
51410
51411 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
51412 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
51413 return false;
51414
51415 Mask = N1.getOperand(0);
51416 X = N1.getOperand(1);
51417
51418 // Check to see if the mask appeared in both the AND and ANDNP.
51419 if (N0.getOperand(0) == Mask)
51420 Y = N0.getOperand(1);
51421 else if (N0.getOperand(1) == Mask)
51422 Y = N0.getOperand(0);
51423 else
51424 return false;
51425
51426 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
51427 // ANDNP combine can let other combines run first and prevent this match.
51428 return true;
51429}
51430
51431// Try to fold:
51432// (or (and (m, y), (pandn m, x)))
51433// into:
51434// (vselect m, x, y)
51435// As a special case, try to fold:
51436// (or (and (m, (sub 0, x)), (pandn m, x)))
51437// into:
51438// (sub (xor X, M), M)
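// Per-element sanity check of the special case, where M is all-ones or zero:
//   M == -1: (X ^ -1) - (-1) == ~X + 1 == 0 - X   (the blend would pick 0 - X)
//   M ==  0: (X ^  0) -   0  == X                 (the blend would pick X)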
51439 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
51440 SelectionDAG &DAG,
51441 const X86Subtarget &Subtarget) {
51442 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51443
51444 EVT VT = N->getValueType(0);
51445 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
51446 (VT.is256BitVector() && Subtarget.hasInt256())))
51447 return SDValue();
51448
51449 SDValue X, Y, Mask;
51450 if (!matchLogicBlend(N, X, Y, Mask))
51451 return SDValue();
51452
51453 // Validate that X, Y, and Mask are bitcasts, and see through them.
51454 Mask = peekThroughBitcasts(Mask);
51455 X = peekThroughBitcasts(X);
51456 Y = peekThroughBitcasts(Y);
51457
51458 EVT MaskVT = Mask.getValueType();
51459 unsigned EltBits = MaskVT.getScalarSizeInBits();
51460
51461 // TODO: Attempt to handle floating point cases as well?
51462 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
51463 return SDValue();
51464
51465 // Attempt to combine to conditional negate: (sub (xor X, M), M)
51466 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
51467 DAG, Subtarget))
51468 return Res;
51469
51470 // PBLENDVB is only available on SSE 4.1.
51471 if (!Subtarget.hasSSE41())
51472 return SDValue();
51473
51474 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
51475 if (Subtarget.hasVLX())
51476 return SDValue();
51477
51478 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
51479
51480 X = DAG.getBitcast(BlendVT, X);
51481 Y = DAG.getBitcast(BlendVT, Y);
51482 Mask = DAG.getBitcast(BlendVT, Mask);
51483 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
51484 return DAG.getBitcast(VT, Mask);
51485}
51486
51487// Helper function for combineOrCmpEqZeroToCtlzSrl
51488// Transforms:
51489// seteq(cmp x, 0)
51490// into:
51491// srl(ctlz x), log2(bitsize(x))
51492// Input pattern is checked by caller.
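// Reasoning sketch: for an i32 value x, ctlz(x) == 32 exactly when x == 0, and
// 32 is the only possible result with bit 5 set, so (ctlz(x) >> 5) is 1 iff
// x == 0 (and 0 otherwise).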
51493 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
51494 SDValue Cmp = Op.getOperand(1);
51495 EVT VT = Cmp.getOperand(0).getValueType();
51496 unsigned Log2b = Log2_32(VT.getSizeInBits());
51497 SDLoc dl(Op);
51498 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
51499 // The result of the shift is true or false, and on X86, the 32-bit
51500 // encoding of shr and lzcnt is more desirable.
51501 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
51502 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
51503 DAG.getConstant(Log2b, dl, MVT::i8));
51504 return Scc;
51505}
51506
51507// Try to transform:
51508// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
51509// into:
51510 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
51511// Will also attempt to match more generic cases, eg:
51512// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
51513// Only applies if the target supports the FastLZCNT feature.
51514 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
51515 TargetLowering::DAGCombinerInfo &DCI,
51516 const X86Subtarget &Subtarget) {
51517 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
51518 return SDValue();
51519
51520 auto isORCandidate = [](SDValue N) {
51521 return (N->getOpcode() == ISD::OR && N->hasOneUse());
51522 };
51523
51524 // Check the zero extend is extending to 32-bit or more. The code generated by
51525 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
51526 // instructions to clear the upper bits.
51527 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
51528 !isORCandidate(N->getOperand(0)))
51529 return SDValue();
51530
51531 // Check the node matches: setcc(eq, cmp 0)
51532 auto isSetCCCandidate = [](SDValue N) {
51533 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
51534 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
51535 N->getOperand(1).getOpcode() == X86ISD::CMP &&
51536 isNullConstant(N->getOperand(1).getOperand(1)) &&
51537 N->getOperand(1).getValueType().bitsGE(MVT::i32);
51538 };
51539
51540 SDNode *OR = N->getOperand(0).getNode();
51541 SDValue LHS = OR->getOperand(0);
51542 SDValue RHS = OR->getOperand(1);
51543
51544 // Save nodes matching or(or, setcc(eq, cmp 0)).
51545 SmallVector<SDNode *, 2> ORNodes;
51546 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
51547 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
51548 ORNodes.push_back(OR);
51549 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
51550 LHS = OR->getOperand(0);
51551 RHS = OR->getOperand(1);
51552 }
51553
51554 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
51555 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
51556 !isORCandidate(SDValue(OR, 0)))
51557 return SDValue();
51558
51559 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
51560 // to
51561 // or(srl(ctlz),srl(ctlz)).
51562 // The dag combiner can then fold it into:
51563 // srl(or(ctlz, ctlz)).
51564 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
51565 SDValue Ret, NewRHS;
51566 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
51567 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
51568
51569 if (!Ret)
51570 return SDValue();
51571
51572 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
51573 while (!ORNodes.empty()) {
51574 OR = ORNodes.pop_back_val();
51575 LHS = OR->getOperand(0);
51576 RHS = OR->getOperand(1);
51577 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
51578 if (RHS->getOpcode() == ISD::OR)
51579 std::swap(LHS, RHS);
51580 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
51581 if (!NewRHS)
51582 return SDValue();
51583 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
51584 }
51585
51586 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
51587}
51588
51589 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
51590 SDValue And1_L, SDValue And1_R,
51591 const SDLoc &DL, SelectionDAG &DAG) {
51592 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
51593 return SDValue();
51594 SDValue NotOp = And0_L->getOperand(0);
51595 if (NotOp == And1_R)
51596 std::swap(And1_R, And1_L);
51597 if (NotOp != And1_L)
51598 return SDValue();
51599
51600 // (~(NotOp) & And0_R) | (NotOp & And1_R)
51601 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
51602 EVT VT = And1_L->getValueType(0);
51603 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
51604 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
51605 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
51606 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
51607 return Xor1;
51608}
51609
51610/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
51611 /// equivalent `((x ^ y) & m) ^ y` pattern.
51612/// This is typically a better representation for targets without a fused
51613/// "and-not" operation. This function is intended to be called from a
51614/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
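/// Hand-worked check with m = 0xF0, x = 0xAB, y = 0xCD:
///   (m & x) | (~m & y)  == 0xA0 | 0x0D          == 0xAD
///   ((x ^ y) & m) ^ y   == (0x66 & 0xF0) ^ 0xCD == 0xAD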
51615 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
51616 // Note that masked-merge variants using XOR or ADD expressions are
51617 // normalized to OR by InstCombine so we only check for OR.
51618 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
51619 SDValue N0 = Node->getOperand(0);
51620 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
51621 return SDValue();
51622 SDValue N1 = Node->getOperand(1);
51623 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
51624 return SDValue();
51625
51626 SDLoc DL(Node);
51627 SDValue N00 = N0->getOperand(0);
51628 SDValue N01 = N0->getOperand(1);
51629 SDValue N10 = N1->getOperand(0);
51630 SDValue N11 = N1->getOperand(1);
51631 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
51632 return Result;
51633 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
51634 return Result;
51635 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
51636 return Result;
51637 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
51638 return Result;
51639 return SDValue();
51640}
51641
51642/// If this is an add or subtract where one operand is produced by a cmp+setcc,
51643/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
51644/// with CMP+{ADC, SBB}.
51645/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
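/// For instance, "x + (a < b)" with an unsigned compare can lower to
///   cmp a, b ; adc x, 0
/// because the carry flag already holds the setb result; the other condition
/// codes below are handled by swapping operands or using sbb with -1.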
51646static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
51647 SDValue X, SDValue Y,
51648 SelectionDAG &DAG,
51649 bool ZeroSecondOpOnly = false) {
51650 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
51651 return SDValue();
51652
51653 // Look through a one-use zext.
51654 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
51655 Y = Y.getOperand(0);
51656
51657 X86::CondCode CC = X86::COND_INVALID;
51658 SDValue EFLAGS;
51659 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
51660 CC = (X86::CondCode)Y.getConstantOperandVal(0);
51661 EFLAGS = Y.getOperand(1);
51662 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
51663 Y.hasOneUse()) {
51664 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
51665 }
51666
51667 if (!EFLAGS)
51668 return SDValue();
51669
51670 // If X is -1 or 0, then we have an opportunity to avoid constants required in
51671 // the general case below.
51672 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
51673 if (ConstantX && !ZeroSecondOpOnly) {
51674 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
51675 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
51676 // This is a complicated way to get -1 or 0 from the carry flag:
51677 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51678 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51679 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51680 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51681 EFLAGS);
51682 }
51683
51684 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
51685 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
51686 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
51687 EFLAGS.getValueType().isInteger() &&
51688 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51689 // Swap the operands of a SUB, and we have the same pattern as above.
51690 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
51691 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
51692 SDValue NewSub = DAG.getNode(
51693 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51694 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51695 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
51696 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51697 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51698 NewEFLAGS);
51699 }
51700 }
51701 }
51702
51703 if (CC == X86::COND_B) {
51704 // X + SETB Z --> adc X, 0
51705 // X - SETB Z --> sbb X, 0
51706 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
51707 DAG.getVTList(VT, MVT::i32), X,
51708 DAG.getConstant(0, DL, VT), EFLAGS);
51709 }
51710
51711 if (ZeroSecondOpOnly)
51712 return SDValue();
51713
51714 if (CC == X86::COND_A) {
51715 // Try to convert COND_A into COND_B in an attempt to facilitate
51716 // materializing "setb reg".
51717 //
51718 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
51719 // cannot take an immediate as its first operand.
51720 //
51721 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51722 EFLAGS.getValueType().isInteger() &&
51723 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51724 SDValue NewSub =
51725 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51726 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51727 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
51728 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
51729 DAG.getVTList(VT, MVT::i32), X,
51730 DAG.getConstant(0, DL, VT), NewEFLAGS);
51731 }
51732 }
51733
51734 if (CC == X86::COND_AE) {
51735 // X + SETAE --> sbb X, -1
51736 // X - SETAE --> adc X, -1
51737 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
51738 DAG.getVTList(VT, MVT::i32), X,
51739 DAG.getAllOnesConstant(DL, VT), EFLAGS);
51740 }
51741
51742 if (CC == X86::COND_BE) {
51743 // X + SETBE --> sbb X, -1
51744 // X - SETBE --> adc X, -1
51745 // Try to convert COND_BE into COND_AE in an attempt to facilitate
51746 // materializing "setae reg".
51747 //
51748 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
51749 // cannot take an immediate as its first operand.
51750 //
51751 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51752 EFLAGS.getValueType().isInteger() &&
51753 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51754 SDValue NewSub =
51755 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51756 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51757 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
51758 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
51759 DAG.getVTList(VT, MVT::i32), X,
51760 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
51761 }
51762 }
51763
51764 if (CC != X86::COND_E && CC != X86::COND_NE)
51765 return SDValue();
51766
51767 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
51768 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
51769 !EFLAGS.getOperand(0).getValueType().isInteger())
51770 return SDValue();
51771
51772 SDValue Z = EFLAGS.getOperand(0);
51773 EVT ZVT = Z.getValueType();
51774
51775 // If X is -1 or 0, then we have an opportunity to avoid constants required in
51776 // the general case below.
51777 if (ConstantX) {
51778 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
51779 // fake operands:
51780 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
51781 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
51782 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
51783 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
51784 SDValue Zero = DAG.getConstant(0, DL, ZVT);
51785 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
51786 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
51787 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51788 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51789 SDValue(Neg.getNode(), 1));
51790 }
51791
51792 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
51793 // with fake operands:
51794 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
51795 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
51796 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
51797 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
51798 SDValue One = DAG.getConstant(1, DL, ZVT);
51799 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
51800 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
51801 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51802 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51803 Cmp1.getValue(1));
51804 }
51805 }
51806
51807 // (cmp Z, 1) sets the carry flag if Z is 0.
51808 SDValue One = DAG.getConstant(1, DL, ZVT);
51809 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
51810 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
51811
51812 // Add the flags type for ADC/SBB nodes.
51813 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
51814
51815 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
51816 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
51817 if (CC == X86::COND_NE)
51818 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
51819 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
51820
51821 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
51822 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
51823 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
51824 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
51825}
51826
51827/// If this is an add or subtract where one operand is produced by a cmp+setcc,
51828/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
51829/// with CMP+{ADC, SBB}.
51830 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
51831 SelectionDAG &DAG) {
51832 bool IsSub = N->getOpcode() == ISD::SUB;
51833 SDValue X = N->getOperand(0);
51834 SDValue Y = N->getOperand(1);
51835 EVT VT = N->getValueType(0);
51836
51837 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
51838 return ADCOrSBB;
51839
51840 // Commute and try again (negate the result for subtracts).
51841 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
51842 if (IsSub)
51843 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
51844 return ADCOrSBB;
51845 }
51846
51847 return SDValue();
51848}
51849
51850static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
51851 SDValue N0, SDValue N1,
51852 SelectionDAG &DAG) {
51853 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
51854
51855 // Delegate to combineAddOrSubToADCOrSBB if we have:
51856 //
51857 // (xor/or (zero_extend (setcc)) imm)
51858 //
51859 // where imm is odd if and only if we have xor, in which case the XOR/OR are
51860 // equivalent to a SUB/ADD, respectively.
51861 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
51862 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
51863 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
51864 bool IsSub = Opc == ISD::XOR;
51865 bool N1COdd = N1C->getZExtValue() & 1;
51866 if (IsSub ? N1COdd : !N1COdd)
51867 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
51868 return R;
51869 }
51870 }
51871
51872 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
51873 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
51874 N0.getOperand(0).getOpcode() == ISD::AND &&
51875 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
51876 ISD::isBuildVectorAllOnes(N1.getNode()) &&
51877 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
51878 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
51879 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
51880 N0.getOperand(0).getOperand(1));
51881 }
51882
51883 return SDValue();
51884}
51885
51886 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
51887 TargetLowering::DAGCombinerInfo &DCI,
51888 const X86Subtarget &Subtarget) {
51889 SDValue N0 = N->getOperand(0);
51890 SDValue N1 = N->getOperand(1);
51891 EVT VT = N->getValueType(0);
51892 SDLoc dl(N);
51893 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51894
51895 // If this is SSE1 only convert to FOR to avoid scalarization.
51896 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51897 return DAG.getBitcast(MVT::v4i32,
51898 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
51899 DAG.getBitcast(MVT::v4f32, N0),
51900 DAG.getBitcast(MVT::v4f32, N1)));
51901 }
51902
51903 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
51904 // TODO: Support multiple SrcOps.
51905 if (VT == MVT::i1) {
51906 SmallVector<SDValue, 2> SrcOps;
51907 SmallVector<APInt, 2> SrcPartials;
51908 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
51909 SrcOps.size() == 1) {
51910 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51911 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51912 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51913 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51914 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51915 if (Mask) {
51916 assert(SrcPartials[0].getBitWidth() == NumElts &&
51917 "Unexpected partial reduction mask");
51918 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
51919 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51920 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51921 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
51922 }
51923 }
51924 }
51925
51926 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51927 return SetCC;
51928
51929 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51930 return R;
51931
51932 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51933 return R;
51934
51935 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51936 return R;
51937
51938 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51939 DAG, DCI, Subtarget))
51940 return FPLogic;
51941
51942 if (DCI.isBeforeLegalizeOps())
51943 return SDValue();
51944
51945 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51946 return R;
51947
51948 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
51949 return R;
51950
51951 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
51952 return R;
51953
51954 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
51955 if ((VT == MVT::i32 || VT == MVT::i64) &&
51956 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
51957 isNullConstant(N0.getOperand(0))) {
51958 SDValue Cond = N0.getOperand(1);
51959 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
51960 Cond = Cond.getOperand(0);
51961
51962 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
51963 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
51964 uint64_t Val = CN->getZExtValue();
51965 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
51966 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
51967 CCode = X86::GetOppositeBranchCondition(CCode);
51968 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
51969
51970 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
51971 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
51972 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
51973 return R;
51974 }
51975 }
51976 }
51977 }
51978
51979 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
51980 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
51981 // iff the upper elements of the non-shifted arg are zero.
51982 // KUNPCK require 16+ bool vector elements.
51983 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
51984 unsigned NumElts = VT.getVectorNumElements();
51985 unsigned HalfElts = NumElts / 2;
51986 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
51987 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
51988 N1.getConstantOperandAPInt(1) == HalfElts &&
51989 DAG.MaskedVectorIsZero(N0, UpperElts)) {
51990 return DAG.getNode(
51991 ISD::CONCAT_VECTORS, dl, VT,
51992 extractSubVector(N0, 0, DAG, dl, HalfElts),
51993 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
51994 }
51995 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
51996 N0.getConstantOperandAPInt(1) == HalfElts &&
51997 DAG.MaskedVectorIsZero(N1, UpperElts)) {
51998 return DAG.getNode(
51999 ISD::CONCAT_VECTORS, dl, VT,
52000 extractSubVector(N1, 0, DAG, dl, HalfElts),
52001 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52002 }
52003 }
52004
52005 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52006 // Attempt to recursively combine an OR of shuffles.
52007 SDValue Op(N, 0);
52008 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52009 return Res;
52010
52011 // If either operand is a constant mask, then only the elements that aren't
52012 // allones are actually demanded by the other operand.
52013 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52014 APInt UndefElts;
52015 SmallVector<APInt> EltBits;
52016 int NumElts = VT.getVectorNumElements();
52017 int EltSizeInBits = VT.getScalarSizeInBits();
52018 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52019 return false;
52020
52021 APInt DemandedElts = APInt::getZero(NumElts);
52022 for (int I = 0; I != NumElts; ++I)
52023 if (!EltBits[I].isAllOnes())
52024 DemandedElts.setBit(I);
52025
52026 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52027 };
52028 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52029 if (N->getOpcode() != ISD::DELETED_NODE)
52030 DCI.AddToWorklist(N);
52031 return SDValue(N, 0);
52032 }
52033 }
52034
52035 // We should fold "masked merge" patterns when `andn` is not available.
52036 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
52037 if (SDValue R = foldMaskedMerge(N, DAG))
52038 return R;
52039
52040 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52041 return R;
52042
52043 return SDValue();
52044}
52045
52046/// Try to turn tests against the signbit in the form of:
52047/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52048/// into:
52049/// SETGT(X, -1)
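/// (Shifting right by size(X)-1 leaves only the sign bit, so xor'ing that with
/// 1 computes "sign bit clear", which is exactly X > -1 in signed terms.)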
52050 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
52051 // This is only worth doing if the output type is i8 or i1.
52052 EVT ResultType = N->getValueType(0);
52053 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52054 return SDValue();
52055
52056 SDValue N0 = N->getOperand(0);
52057 SDValue N1 = N->getOperand(1);
52058
52059 // We should be performing an xor against a truncated shift.
52060 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52061 return SDValue();
52062
52063 // Make sure we are performing an xor against one.
52064 if (!isOneConstant(N1))
52065 return SDValue();
52066
52067 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52068 SDValue Shift = N0.getOperand(0);
52069 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52070 return SDValue();
52071
52072 // Make sure we are truncating from one of i16, i32 or i64.
52073 EVT ShiftTy = Shift.getValueType();
52074 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52075 return SDValue();
52076
52077 // Make sure the shift amount extracts the sign bit.
52078 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52079 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52080 return SDValue();
52081
52082 // Create a greater-than comparison against -1.
52083 // N.B. Using SETGE against 0 works but we want a canonical looking
52084 // comparison; using SETGT matches up with what TranslateX86CC does.
52085 SDLoc DL(N);
52086 SDValue ShiftOp = Shift.getOperand(0);
52087 EVT ShiftOpTy = ShiftOp.getValueType();
52088 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52089 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52090 *DAG.getContext(), ResultType);
52091 SDValue Cond =
52092 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52093 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52094 if (SetCCResultType != ResultType)
52095 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52096 return Cond;
52097}
52098
52099/// Turn vector tests of the signbit in the form of:
52100/// xor (sra X, elt_size(X)-1), -1
52101/// into:
52102/// pcmpgt X, -1
52103///
52104/// This should be called before type legalization because the pattern may not
52105/// persist after that.
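/// e.g. for v4i32, (xor (sra X, 31), -1) computes "lane is non-negative",
/// which is exactly what (pcmpgt X, -1) produces.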
52106 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52107 const X86Subtarget &Subtarget) {
52108 EVT VT = N->getValueType(0);
52109 if (!VT.isSimple())
52110 return SDValue();
52111
52112 switch (VT.getSimpleVT().SimpleTy) {
52113 // clang-format off
52114 default: return SDValue();
52115 case MVT::v16i8:
52116 case MVT::v8i16:
52117 case MVT::v4i32:
52118 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52119 case MVT::v32i8:
52120 case MVT::v16i16:
52121 case MVT::v8i32:
52122 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52123 // clang-format on
52124 }
52125
52126 // There must be a shift right algebraic before the xor, and the xor must be a
52127 // 'not' operation.
52128 SDValue Shift = N->getOperand(0);
52129 SDValue Ones = N->getOperand(1);
52130 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52131 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52132 return SDValue();
52133
52134 // The shift should be smearing the sign bit across each vector element.
52135 auto *ShiftAmt =
52136 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52137 if (!ShiftAmt ||
52138 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52139 return SDValue();
52140
52141 // Create a greater-than comparison against -1. We don't use the more obvious
52142 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52143 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52144}
52145
52146/// Detect patterns of truncation with unsigned saturation:
52147///
52148/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52149/// Return the source value x to be truncated or SDValue() if the pattern was
52150/// not matched.
52151///
52152/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52153/// where C1 >= 0 and C2 is unsigned max of destination type.
52154///
52155/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52156/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52157///
52158/// These two patterns are equivalent to:
52159/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52160/// So return the smax(x, C1) value to be truncated or SDValue() if the
52161/// pattern was not matched.
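/// For an i32 -> i8 truncate (a worked sketch), umin(x, 255) clamps x into
/// [0, 255], so the truncation can never wrap and maps onto an unsigned
/// saturating truncate/pack.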
52162 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52163 const SDLoc &DL) {
52164 using namespace llvm::SDPatternMatch;
52165 EVT InVT = In.getValueType();
52166
52167 // Saturation with truncation. We truncate from InVT to VT.
52169 "Unexpected types for truncate operation");
52170
52171 APInt C1, C2;
52172 SDValue UMin, SMin, SMax;
52173
52174 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
52175 // to the element size of the destination type.
52176 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52177 C2.isMask(VT.getScalarSizeInBits()))
52178 return UMin;
52179
52180 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52181 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52182 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52183 return SMin;
52184
52185 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52186 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52187 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52188 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52189
52190 return SDValue();
52191}
52192
52193/// Detect patterns of truncation with signed saturation:
52194/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52195/// signed_max_of_dest_type)) to dest_type)
52196/// or:
52197/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52198/// signed_min_of_dest_type)) to dest_type).
52199/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52200/// Return the source value to be truncated or SDValue() if the pattern was not
52201/// matched.
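/// For an i16 -> i8 truncate (a worked sketch), smin(smax(x, -128), 127)
/// clamps x into the signed i8 range [-128, 127], which is what a signed
/// saturating pack such as PACKSSWB produces.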
52202static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52203 using namespace llvm::SDPatternMatch;
52204 unsigned NumDstBits = VT.getScalarSizeInBits();
52205 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52206 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52207
52208 APInt SignedMax, SignedMin;
52209 if (MatchPackUS) {
52210 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52211 SignedMin = APInt::getZero(NumSrcBits);
52212 } else {
52213 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52214 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52215 }
52216
52217 SDValue SMin, SMax;
52218 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52219 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52220 return SMax;
52221
52222 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52223 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52224 return SMin;
52225
52226 return SDValue();
52227}
52228
52229 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52230 SelectionDAG &DAG,
52231 const X86Subtarget &Subtarget) {
52232 if (!Subtarget.hasSSE2() || !VT.isVector())
52233 return SDValue();
52234
52235 EVT SVT = VT.getVectorElementType();
52236 EVT InVT = In.getValueType();
52237 EVT InSVT = InVT.getVectorElementType();
52238
52239 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52240 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52241 // and concatenate at the same time. Then we can use a final vpmovuswb to
52242 // clip to 0-255.
52243 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52244 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52245 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52246 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52247 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52248 DL, DAG, Subtarget);
52249 assert(Mid && "Failed to pack!");
52250 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52251 }
52252 }
52253
52254 // vXi32 truncate instructions are available with AVX512F.
52255 // vXi16 truncate instructions are only available with AVX512BW.
52256 // For 256-bit or smaller vectors, we require VLX.
52257 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52258 // If the result type is 256 bits or larger and we have disabled 512-bit
52259 // registers, we should go ahead and use the pack instructions if possible.
52260 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52261 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52262 (InVT.getSizeInBits() > 128) &&
52263 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52264 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52265
52266 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52268 (SVT == MVT::i8 || SVT == MVT::i16) &&
52269 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52270 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52271 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52272 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52273 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52274 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52275 DAG, Subtarget);
52276 assert(Mid && "Failed to pack!");
52277 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52278 Subtarget);
52279 assert(V && "Failed to pack!");
52280 return V;
52281 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52282 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52283 Subtarget);
52284 }
52285 if (SDValue SSatVal = detectSSatPattern(In, VT))
52286 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52287 Subtarget);
52288 }
52289
52290 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52291 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52292 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52293 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52294 unsigned TruncOpc = 0;
52295 SDValue SatVal;
52296 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52297 SatVal = SSatVal;
52298 TruncOpc = X86ISD::VTRUNCS;
52299 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52300 SatVal = USatVal;
52301 TruncOpc = X86ISD::VTRUNCUS;
52302 }
52303 if (SatVal) {
52304 unsigned ResElts = VT.getVectorNumElements();
52305 // If the input type is less than 512 bits and we don't have VLX, we need
52306 // to widen to 512 bits.
52307 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52308 unsigned NumConcats = 512 / InVT.getSizeInBits();
52309 ResElts *= NumConcats;
52310 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52311 ConcatOps[0] = SatVal;
52312 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52313 NumConcats * InVT.getVectorNumElements());
52314 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52315 }
52316 // Widen the result if it's narrower than 128 bits.
52317 if (ResElts * SVT.getSizeInBits() < 128)
52318 ResElts = 128 / SVT.getSizeInBits();
52319 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52320 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52321 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52322 DAG.getVectorIdxConstant(0, DL));
52323 }
52324 }
52325
52326 return SDValue();
52327}
52328
52329 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
52330 SelectionDAG &DAG,
52331 TargetLowering::DAGCombinerInfo &DCI,
52332 const X86Subtarget &Subtarget) {
52333 auto *Ld = cast<LoadSDNode>(N);
52334 EVT RegVT = Ld->getValueType(0);
52335 SDValue Ptr = Ld->getBasePtr();
52336 SDValue Chain = Ld->getChain();
52337 ISD::LoadExtType Ext = Ld->getExtensionType();
52338
52339 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52340 return SDValue();
52341
52342 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
52343 return SDValue();
52344
52345 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
52346 if (!LdC)
52347 return SDValue();
52348
52349 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
52350 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
52351 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
52352 if (Undefs[I])
52353 continue;
52354 if (UserUndefs[I] || Bits[I] != UserBits[I])
52355 return false;
52356 }
52357 return true;
52358 };
52359
52360 // Look through all other loads/broadcasts in the chain for another constant
52361 // pool entry.
52362 for (SDNode *User : Chain->users()) {
52363 auto *UserLd = dyn_cast<MemSDNode>(User);
52364 if (User != N && UserLd &&
52365 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
52366 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
52367 ISD::isNormalLoad(User)) &&
52368 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
52369 User->getValueSizeInBits(0).getFixedValue() >
52370 RegVT.getFixedSizeInBits()) {
52371 EVT UserVT = User->getValueType(0);
52372 SDValue UserPtr = UserLd->getBasePtr();
52373 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
52374
52375 // See if we are loading a constant that matches in the lower
52376 // bits of a longer constant (but from a different constant pool ptr).
52377 if (UserC && UserPtr != Ptr) {
52378 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
52379 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
52380 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
52381 APInt Undefs, UserUndefs;
52382 SmallVector<APInt> Bits, UserBits;
52383 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
52384 UserVT.getScalarSizeInBits());
52385 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
52386 Bits) &&
52387 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
52388 UserUndefs, UserBits)) {
52389 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
52390 SDValue Extract = extractSubVector(
52391 SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
52392 Extract = DAG.getBitcast(RegVT, Extract);
52393 return DCI.CombineTo(N, Extract, SDValue(User, 1));
52394 }
52395 }
52396 }
52397 }
52398 }
52399 }
52400
52401 return SDValue();
52402}
52403
52404 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
52405 TargetLowering::DAGCombinerInfo &DCI,
52406 const X86Subtarget &Subtarget) {
52407 auto *Ld = cast<LoadSDNode>(N);
52408 EVT RegVT = Ld->getValueType(0);
52409 EVT MemVT = Ld->getMemoryVT();
52410 SDLoc dl(Ld);
52411 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52412
52413 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
52414 // into two 16-byte operations. Also split non-temporal aligned loads on
52415 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
52416 ISD::LoadExtType Ext = Ld->getExtensionType();
52417 unsigned Fast;
52418 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
52419 Ext == ISD::NON_EXTLOAD &&
52420 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
52421 Ld->getAlign() >= Align(16)) ||
52422 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
52423 *Ld->getMemOperand(), &Fast) &&
52424 !Fast))) {
52425 unsigned NumElems = RegVT.getVectorNumElements();
52426 if (NumElems < 2)
52427 return SDValue();
52428
52429 unsigned HalfOffset = 16;
52430 SDValue Ptr1 = Ld->getBasePtr();
52431 SDValue Ptr2 =
52432 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
52433 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
52434 NumElems / 2);
52435 SDValue Load1 =
52436 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
52437 Ld->getOriginalAlign(),
52438 Ld->getMemOperand()->getFlags());
52439 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
52440 Ld->getPointerInfo().getWithOffset(HalfOffset),
52441 Ld->getOriginalAlign(),
52442 Ld->getMemOperand()->getFlags());
52443 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
52444 Load1.getValue(1), Load2.getValue(1));
52445
52446 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
52447 return DCI.CombineTo(N, NewVec, TF, true);
52448 }
52449
52450 // Bool vector load - attempt to cast to an integer, as we have good
52451 // (vXiY *ext(vXi1 bitcast(iX))) handling.
52452 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
52453 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
52454 unsigned NumElts = RegVT.getVectorNumElements();
52455 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52456 if (TLI.isTypeLegal(IntVT)) {
52457 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
52458 Ld->getPointerInfo(),
52459 Ld->getOriginalAlign(),
52460 Ld->getMemOperand()->getFlags());
52461 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
52462 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
52463 }
52464 }
52465
52466 // If we also broadcast this vector to a wider type, then just extract the
52467 // lowest subvector.
52468 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
52469 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
52470 SDValue Ptr = Ld->getBasePtr();
52471 SDValue Chain = Ld->getChain();
52472 for (SDNode *User : Chain->users()) {
52473 auto *UserLd = dyn_cast<MemSDNode>(User);
52474 if (User != N && UserLd &&
52475 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
52476 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
52477 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
52478 !User->hasAnyUseOfValue(1) &&
52479 User->getValueSizeInBits(0).getFixedValue() >
52480 RegVT.getFixedSizeInBits()) {
52481 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
52482 RegVT.getSizeInBits());
52483 Extract = DAG.getBitcast(RegVT, Extract);
52484 return DCI.CombineTo(N, Extract, SDValue(User, 1));
52485 }
52486 }
52487 }
52488
52489 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
52490 return V;
52491
52492 // Cast ptr32 and ptr64 pointers to the default address space before a load.
52493 unsigned AddrSpace = Ld->getAddressSpace();
52494 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
52495 AddrSpace == X86AS::PTR32_UPTR) {
52496 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
52497 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
52498 SDValue Cast =
52499 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
52500 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
52501 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
52502 Ld->getMemOperand()->getFlags());
52503 }
52504 }
52505
52506 return SDValue();
52507}
52508
52509/// If V is a build vector of boolean constants and exactly one of those
52510/// constants is true, return the operand index of that true element.
52511/// Otherwise, return -1.
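/// For example, <i1 0, i1 0, i1 1, i1 0> returns 2, while an all-false mask
/// or a mask with more than one true element returns -1.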
52512static int getOneTrueElt(SDValue V) {
52513 // This needs to be a build vector of booleans.
52514 // TODO: Checking for the i1 type matches the IR definition for the mask,
52515 // but the mask check could be loosened to i8 or other types. That might
52516 // also require checking more than 'allOnesValue'; e.g., the x86 HW
52517 // instructions only require that the MSB is set for each mask element.
52518 // The ISD::MSTORE comments/definition do not specify how the mask operand
52519 // is formatted.
52520 auto *BV = dyn_cast<BuildVectorSDNode>(V);
52521 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
52522 return -1;
52523
52524 int TrueIndex = -1;
52525 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
52526 for (unsigned i = 0; i < NumElts; ++i) {
52527 const SDValue &Op = BV->getOperand(i);
52528 if (Op.isUndef())
52529 continue;
52530 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
52531 if (!ConstNode)
52532 return -1;
52533 if (ConstNode->getAPIntValue().countr_one() >= 1) {
52534 // If we already found a one, this is too many.
52535 if (TrueIndex >= 0)
52536 return -1;
52537 TrueIndex = i;
52538 }
52539 }
52540 return TrueIndex;
52541}
52542
52543/// Given a masked memory load/store operation, return true if it has one mask
52544/// bit set. If it has one mask bit set, then also return the memory address of
52545/// the scalar element to load/store, the vector index to insert/extract that
52546/// scalar element, and the alignment for the scalar memory access.
52547static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
52548 SelectionDAG &DAG, SDValue &Addr,
52549 SDValue &Index, Align &Alignment,
52550 unsigned &Offset) {
52551 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
52552 if (TrueMaskElt < 0)
52553 return false;
52554
52555 // Get the address of the one scalar element that is specified by the mask
52556 // using the appropriate offset from the base pointer.
52557 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
52558 Offset = 0;
52559 Addr = MaskedOp->getBasePtr();
52560 if (TrueMaskElt != 0) {
52561 Offset = TrueMaskElt * EltVT.getStoreSize();
52562 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
52563 SDLoc(MaskedOp));
52564 }
52565
52566 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
52567 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
52568 EltVT.getStoreSize());
52569 return true;
52570}
52571
52572/// If exactly one element of the mask is set for a non-extending masked load,
52573/// it is a scalar load and vector insert.
52574/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52575/// mask have already been optimized in IR, so we don't bother with those here.
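/// For example, a v4f32 masked load whose mask is <0,0,1,0> becomes a scalar
/// f32 load from (base + 8) inserted into the pass-through vector at index 2.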
52576static SDValue
52577reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
52578 TargetLowering::DAGCombinerInfo &DCI,
52579 const X86Subtarget &Subtarget) {
52580 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52581 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52582 // However, some target hooks may need to be added to know when the transform
52583 // is profitable. Endianness would also have to be considered.
52584
52585 SDValue Addr, VecIndex;
52586 Align Alignment;
52587 unsigned Offset;
52588 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
52589 return SDValue();
52590
52591 // Load the one scalar element that is specified by the mask using the
52592 // appropriate offset from the base pointer.
52593 SDLoc DL(ML);
52594 EVT VT = ML->getValueType(0);
52595 EVT EltVT = VT.getVectorElementType();
52596
52597 EVT CastVT = VT;
52598 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
52599 EltVT = MVT::f64;
52600 CastVT = VT.changeVectorElementType(EltVT);
52601 }
52602
52603 SDValue Load =
52604 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
52605 ML->getPointerInfo().getWithOffset(Offset),
52606 Alignment, ML->getMemOperand()->getFlags());
52607
52608 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
52609
52610 // Insert the loaded element into the appropriate place in the vector.
52611 SDValue Insert =
52612 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
52613 Insert = DAG.getBitcast(VT, Insert);
52614 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
52615}
52616
52617static SDValue
52618combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
52619 TargetLowering::DAGCombinerInfo &DCI) {
52620 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52621 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
52622 return SDValue();
52623
52624 SDLoc DL(ML);
52625 EVT VT = ML->getValueType(0);
52626
52627 // If we are loading the first and last elements of a vector, it is safe and
52628 // always faster to load the whole vector. Replace the masked load with a
52629 // vector load and select.
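// e.g. a v4f32 masked load with mask <1,0,0,1> becomes a full v4f32 load
// followed by a vselect of the loaded vector and the pass-through value.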
52630 unsigned NumElts = VT.getVectorNumElements();
52631 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
52632 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
52633 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
52634 if (LoadFirstElt && LoadLastElt) {
52635 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
52636 ML->getMemOperand());
52637 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
52638 ML->getPassThru());
52639 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
52640 }
52641
52642 // Convert a masked load with a constant mask into a masked load and a select.
52643 // This allows the select operation to use a faster kind of select instruction
52644 // (for example, vblendvps -> vblendps).
52645
52646 // Don't try this if the pass-through operand is already undefined. That would
52647 // cause an infinite loop because that's what we're about to create.
52648 if (ML->getPassThru().isUndef())
52649 return SDValue();
52650
52651 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
52652 return SDValue();
52653
52654 // The new masked load has an undef pass-through operand. The select uses the
52655 // original pass-through operand.
52656 SDValue NewML = DAG.getMaskedLoad(
52657 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
52658 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
52659 ML->getAddressingMode(), ML->getExtensionType());
52660 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
52661 ML->getPassThru());
52662
52663 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
52664}
52665
52666static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
52667 TargetLowering::DAGCombinerInfo &DCI,
52668 const X86Subtarget &Subtarget) {
52669 auto *Mld = cast<MaskedLoadSDNode>(N);
52670
52671 // TODO: Expanding load with constant mask may be optimized as well.
52672 if (Mld->isExpandingLoad())
52673 return SDValue();
52674
52675 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
52676 if (SDValue ScalarLoad =
52677 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
52678 return ScalarLoad;
52679
52680 // TODO: Do some AVX512 subsets benefit from this transform?
52681 if (!Subtarget.hasAVX512())
52682 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
52683 return Blend;
52684 }
52685
52686 // If the mask value has been legalized to a non-boolean vector, try to
52687 // simplify ops leading up to it. We only demand the MSB of each lane.
52688 SDValue Mask = Mld->getMask();
52689 if (Mask.getScalarValueSizeInBits() != 1) {
52690 EVT VT = Mld->getValueType(0);
52691 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52692 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
52693 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
52694 if (N->getOpcode() != ISD::DELETED_NODE)
52695 DCI.AddToWorklist(N);
52696 return SDValue(N, 0);
52697 }
52698 if (SDValue NewMask =
52699 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
52700 return DAG.getMaskedLoad(
52701 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
52702 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
52703 Mld->getAddressingMode(), Mld->getExtensionType());
52704 }
52705
52706 return SDValue();
52707}
52708
52709/// If exactly one element of the mask is set for a non-truncating masked store,
52710/// it is a vector extract and scalar store.
52711/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52712/// mask have already been optimized in IR, so we don't bother with those here.
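/// For example, a v4i32 masked store with mask <0,1,0,0> becomes an extract of
/// element 1 followed by a scalar i32 store to (base + 4).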
52713static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
52714 SelectionDAG &DAG,
52715 const X86Subtarget &Subtarget) {
52716 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52717 // However, some target hooks may need to be added to know when the transform
52718 // is profitable. Endianness would also have to be considered.
52719
52720 SDValue Addr, VecIndex;
52721 Align Alignment;
52722 unsigned Offset;
52723 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
52724 return SDValue();
52725
52726 // Extract the one scalar element that is actually being stored.
52727 SDLoc DL(MS);
52728 SDValue Value = MS->getValue();
52729 EVT VT = Value.getValueType();
52730 EVT EltVT = VT.getVectorElementType();
52731 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
52732 EltVT = MVT::f64;
52733 EVT CastVT = VT.changeVectorElementType(EltVT);
52734 Value = DAG.getBitcast(CastVT, Value);
52735 }
52736 SDValue Extract =
52737 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
52738
52739 // Store that element at the appropriate offset from the base pointer.
52740 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
52741 MS->getPointerInfo().getWithOffset(Offset),
52742 Alignment, MS->getMemOperand()->getFlags());
52743}
52744
52745static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
52746 TargetLowering::DAGCombinerInfo &DCI,
52747 const X86Subtarget &Subtarget) {
52748 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
52749 if (Mst->isCompressingStore())
52750 return SDValue();
52751
52752 EVT VT = Mst->getValue().getValueType();
52753 SDLoc dl(Mst);
52754 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52755
52756 if (Mst->isTruncatingStore())
52757 return SDValue();
52758
52759 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
52760 return ScalarStore;
52761
52762 // If the mask value has been legalized to a non-boolean vector, try to
52763 // simplify ops leading up to it. We only demand the MSB of each lane.
52764 SDValue Mask = Mst->getMask();
52765 if (Mask.getScalarValueSizeInBits() != 1) {
52766 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
52767 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
52768 if (N->getOpcode() != ISD::DELETED_NODE)
52769 DCI.AddToWorklist(N);
52770 return SDValue(N, 0);
52771 }
52772 if (SDValue NewMask =
52773 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
52774 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
52775 Mst->getBasePtr(), Mst->getOffset(), NewMask,
52776 Mst->getMemoryVT(), Mst->getMemOperand(),
52777 Mst->getAddressingMode());
52778 }
52779
52780 SDValue Value = Mst->getValue();
52781 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
52782 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
52783 Mst->getMemoryVT())) {
52784 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
52785 Mst->getBasePtr(), Mst->getOffset(), Mask,
52786 Mst->getMemoryVT(), Mst->getMemOperand(),
52787 Mst->getAddressingMode(), true);
52788 }
52789
52790 return SDValue();
52791}
52792
52793static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
52794 TargetLowering::DAGCombinerInfo &DCI,
52795 const X86Subtarget &Subtarget) {
52796 StoreSDNode *St = cast<StoreSDNode>(N);
52797 EVT StVT = St->getMemoryVT();
52798 SDLoc dl(St);
52799 SDValue StoredVal = St->getValue();
52800 EVT VT = StoredVal.getValueType();
52801 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52802
52803 // Convert a store of vXi1 into a store of iX and a bitcast.
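// e.g. (store v8i1 %x) becomes (store (i8 (bitcast %x))).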
52804 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
52805 VT.getVectorElementType() == MVT::i1) {
52806
52807 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
52808 StoredVal = DAG.getBitcast(NewVT, StoredVal);
52809
52810 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52811 St->getPointerInfo(), St->getOriginalAlign(),
52812 St->getMemOperand()->getFlags());
52813 }
52814
52815 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
52816 // This will avoid a copy to k-register.
52817 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
52818 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
52819 StoredVal.getOperand(0).getValueType() == MVT::i8) {
52820 SDValue Val = StoredVal.getOperand(0);
52821 // We must store zeros to the unused bits.
52822 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
52823 return DAG.getStore(St->getChain(), dl, Val,
52824 St->getBasePtr(), St->getPointerInfo(),
52825 St->getOriginalAlign(),
52826 St->getMemOperand()->getFlags());
52827 }
52828
52829 // Widen v2i1/v4i1 stores to v8i1.
52830 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
52831 Subtarget.hasAVX512()) {
52832 unsigned NumConcats = 8 / VT.getVectorNumElements();
52833 // We must store zeros to the unused bits.
52834 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
52835 Ops[0] = StoredVal;
52836 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
52837 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52838 St->getPointerInfo(), St->getOriginalAlign(),
52839 St->getMemOperand()->getFlags());
52840 }
52841
52842 // Turn vXi1 stores of constants into a scalar store.
52843 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
52844 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
52845 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
52846 // If it's a v64i1 store without 64-bit support, we need two stores.
52847 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
52848 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
52849 StoredVal->ops().slice(0, 32));
52850 Lo = combinevXi1ConstantToInteger(Lo, DAG);
52851 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
52852 StoredVal->ops().slice(32, 32));
52853 Hi = combinevXi1ConstantToInteger(Hi, DAG);
52854
52855 SDValue Ptr0 = St->getBasePtr();
52856 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
52857
52858 SDValue Ch0 =
52859 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
52860 St->getOriginalAlign(),
52861 St->getMemOperand()->getFlags());
52862 SDValue Ch1 =
52863 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
52864 St->getPointerInfo().getWithOffset(4),
52865 St->getOriginalAlign(),
52866 St->getMemOperand()->getFlags());
52867 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
52868 }
52869
52870 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
52871 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52872 St->getPointerInfo(), St->getOriginalAlign(),
52873 St->getMemOperand()->getFlags());
52874 }
52875
52876 // Convert scalar fabs/fneg load-store to integer equivalents.
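// e.g. for f32, (store (fneg (load p))) becomes an i32 store of
// (xor (bitcast (load p)), 0x80000000); fabs uses (and ..., 0x7fffffff).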
52877 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
52878 (StoredVal.getOpcode() == ISD::FABS ||
52879 StoredVal.getOpcode() == ISD::FNEG) &&
52880 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
52881 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
52882 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
52883 if (TLI.isTypeLegal(IntVT)) {
52884 APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
52885 unsigned SignOp = ISD::XOR;
52886 if (StoredVal.getOpcode() == ISD::FABS) {
52887 SignMask = ~SignMask;
52888 SignOp = ISD::AND;
52889 }
52890 SDValue LogicOp = DAG.getNode(
52891 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
52892 DAG.getConstant(SignMask, dl, IntVT));
52893 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
52894 St->getPointerInfo(), St->getOriginalAlign(),
52895 St->getMemOperand()->getFlags());
52896 }
52897 }
52898
52899 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
52900 // Sandy Bridge, perform two 16-byte stores.
52901 unsigned Fast;
52902 if (VT.is256BitVector() && StVT == VT &&
52903 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
52904 *St->getMemOperand(), &Fast) &&
52905 !Fast) {
52906 unsigned NumElems = VT.getVectorNumElements();
52907 if (NumElems < 2)
52908 return SDValue();
52909
52910 return splitVectorStore(St, DAG);
52911 }
52912
52913 // Split under-aligned vector non-temporal stores.
52914 if (St->isNonTemporal() && StVT == VT &&
52915 St->getAlign().value() < VT.getStoreSize()) {
52916 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
52917 // vectors or the legalizer can scalarize it to use MOVNTI.
52918 if (VT.is256BitVector() || VT.is512BitVector()) {
52919 unsigned NumElems = VT.getVectorNumElements();
52920 if (NumElems < 2)
52921 return SDValue();
52922 return splitVectorStore(St, DAG);
52923 }
52924
52925 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
52926 // to use MOVNTI.
52927 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
52928 MVT NTVT = Subtarget.hasSSE4A()
52929 ? MVT::v2f64
52930 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
52931 return scalarizeVectorStore(St, NTVT, DAG);
52932 }
52933 }
52934
52935 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
52936 // supported but AVX512F is, by extending to v16i32 and truncating.
52937 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
52938 St->getValue().getOpcode() == ISD::TRUNCATE &&
52939 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
52940 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
52941 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
52942 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
52943 St->getValue().getOperand(0));
52944 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
52945 MVT::v16i8, St->getMemOperand());
52946 }
52947
52948 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
52949 if (!St->isTruncatingStore() &&
52950 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
52951 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
52952 StoredVal.hasOneUse() &&
52953 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
52954 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
52955 return EmitTruncSStore(IsSigned, St->getChain(),
52956 dl, StoredVal.getOperand(0), St->getBasePtr(),
52957 VT, St->getMemOperand(), DAG);
52958 }
52959
52960 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
52961 if (!St->isTruncatingStore()) {
52962 auto IsExtractedElement = [](SDValue V) {
52963 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
52964 V = V.getOperand(0);
52965 unsigned Opc = V.getOpcode();
52966 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
52967 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
52968 V.getOperand(0).hasOneUse())
52969 return V.getOperand(0);
52970 return SDValue();
52971 };
52972 if (SDValue Extract = IsExtractedElement(StoredVal)) {
52973 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
52974 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
52975 SDValue Src = Trunc.getOperand(0);
52976 MVT DstVT = Trunc.getSimpleValueType();
52977 MVT SrcVT = Src.getSimpleValueType();
52978 unsigned NumSrcElts = SrcVT.getVectorNumElements();
52979 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
52980 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
52981 if (NumTruncBits == VT.getSizeInBits() &&
52982 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
52983 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
52984 TruncVT, St->getMemOperand());
52985 }
52986 }
52987 }
52988 }
52989
52990 // Optimize trunc store (of multiple scalars) to shuffle and store.
52991 // First, pack all of the elements in one place. Next, store to memory
52992 // in fewer chunks.
52993 if (St->isTruncatingStore() && VT.isVector()) {
52994 if (TLI.isTruncStoreLegal(VT, StVT)) {
52995 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
52996 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
52997 dl, Val, St->getBasePtr(),
52998 St->getMemoryVT(), St->getMemOperand(), DAG);
52999 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53000 DAG, dl))
53001 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53002 dl, Val, St->getBasePtr(),
53003 St->getMemoryVT(), St->getMemOperand(), DAG);
53004 }
53005
53006 return SDValue();
53007 }
53008
53009 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53010 unsigned AddrSpace = St->getAddressSpace();
53011 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53012 AddrSpace == X86AS::PTR32_UPTR) {
53013 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53014 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53015 SDValue Cast =
53016 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53017 return DAG.getTruncStore(
53018 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53019 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
53020 St->getAAInfo());
53021 }
53022 }
53023
53024 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53025 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
53026 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53027 Subtarget.hasCF() && St->isSimple()) {
53028 SDValue Cmov;
53029 if (StoredVal.getOpcode() == X86ISD::CMOV)
53030 Cmov = StoredVal;
53031 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53032 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53033 Cmov = StoredVal.getOperand(0);
53034 else
53035 return SDValue();
53036
53037 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53038 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53039 return SDValue();
53040
53041 bool InvertCC = false;
53042 SDValue V = SDValue(Ld, 0);
53043 if (V == Cmov.getOperand(1))
53044 InvertCC = true;
53045 else if (V != Cmov.getOperand(0))
53046 return SDValue();
53047
53048 SDVTList Tys = DAG.getVTList(MVT::Other);
53049 SDValue CC = Cmov.getOperand(2);
53050 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53051 if (InvertCC)
53052 CC = DAG.getTargetConstant(
53053 X86::GetOppositeBranchCondition(
53054 (X86::CondCode)Cmov.getConstantOperandVal(2)),
53055 dl, MVT::i8);
53056 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53057 Cmov.getOperand(3)};
53058 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53059 St->getMemOperand());
53060 }
53061
53062 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53063 // the FP state in cases where an emms may be missing.
53064 // A preferable solution to the general problem is to figure out the right
53065 // places to insert EMMS. This qualifies as a quick hack.
53066
53067 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53068 if (VT.getSizeInBits() != 64)
53069 return SDValue();
53070
53071 const Function &F = DAG.getMachineFunction().getFunction();
53072 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53073 bool F64IsLegal =
53074 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53075
53076 if (!F64IsLegal || Subtarget.is64Bit())
53077 return SDValue();
53078
53079 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53080 cast<LoadSDNode>(St->getValue())->isSimple() &&
53081 St->getChain().hasOneUse() && St->isSimple()) {
53082 auto *Ld = cast<LoadSDNode>(St->getValue());
53083
53084 if (!ISD::isNormalLoad(Ld))
53085 return SDValue();
53086
53087 // Avoid the transformation if there are multiple uses of the loaded value.
53088 if (!Ld->hasNUsesOfValue(1, 0))
53089 return SDValue();
53090
53091 SDLoc LdDL(Ld);
53092 SDLoc StDL(N);
53093 // Lower to a single movq load/store pair.
53094 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53095 Ld->getBasePtr(), Ld->getMemOperand());
53096
53097 // Make sure new load is placed in same chain order.
53098 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53099 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53100 St->getMemOperand());
53101 }
53102
53103 // This is similar to the above case, but here we handle a scalar 64-bit
53104 // integer store that is extracted from a vector on a 32-bit target.
53105 // If we have SSE2, then we can treat it like a floating-point double
53106 // to get past legalization. The execution dependencies fixup pass will
53107 // choose the optimal machine instruction for the store if this really is
53108 // an integer or v2f32 rather than an f64.
53109 if (VT == MVT::i64 &&
53110 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53111 SDValue OldExtract = St->getOperand(1);
53112 SDValue ExtOp0 = OldExtract.getOperand(0);
53113 unsigned VecSize = ExtOp0.getValueSizeInBits();
53114 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53115 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53116 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53117 BitCast, OldExtract.getOperand(1));
53118 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53119 St->getPointerInfo(), St->getOriginalAlign(),
53120 St->getMemOperand()->getFlags());
53121 }
53122
53123 return SDValue();
53124}
53125
53126static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53127 TargetLowering::DAGCombinerInfo &DCI,
53128 const X86Subtarget &Subtarget) {
53129 auto *St = cast<MemIntrinsicSDNode>(N);
53130
53131 SDValue StoredVal = N->getOperand(1);
53132 MVT VT = StoredVal.getSimpleValueType();
53133 EVT MemVT = St->getMemoryVT();
53134
53135 // Figure out which elements we demand.
53136 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53137 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53138
53139 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53140 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53141 if (N->getOpcode() != ISD::DELETED_NODE)
53142 DCI.AddToWorklist(N);
53143 return SDValue(N, 0);
53144 }
53145
53146 return SDValue();
53147}
53148
53149/// Return 'true' if this vector operation is "horizontal"
53150/// and return the operands for the horizontal operation in LHS and RHS. A
53151/// horizontal operation performs the binary operation on successive elements
53152/// of its first operand, then on successive elements of its second operand,
53153/// returning the resulting values in a vector. For example, if
53154/// A = < float a0, float a1, float a2, float a3 >
53155/// and
53156/// B = < float b0, float b1, float b2, float b3 >
53157/// then the result of doing a horizontal operation on A and B is
53158/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53159/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53160/// A horizontal-op B, for some already available A and B, and if so then LHS is
53161/// set to A, RHS to B, and the routine returns 'true'.
53162static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53163 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53164 bool IsCommutative,
53165 SmallVectorImpl<int> &PostShuffleMask,
53166 bool ForceHorizOp) {
53167 // If either operand is undef, bail out. The binop should be simplified.
53168 if (LHS.isUndef() || RHS.isUndef())
53169 return false;
53170
53171 // Look for the following pattern:
53172 // A = < float a0, float a1, float a2, float a3 >
53173 // B = < float b0, float b1, float b2, float b3 >
53174 // and
53175 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53176 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53177 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53178 // which is A horizontal-op B.
53179
53180 MVT VT = LHS.getSimpleValueType();
53181 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53182 "Unsupported vector type for horizontal add/sub");
53183 unsigned NumElts = VT.getVectorNumElements();
53184
53185 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53186 SmallVectorImpl<int> &ShuffleMask) {
53187 bool UseSubVector = false;
53188 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53189 Op.getOperand(0).getValueType().is256BitVector() &&
53190 llvm::isNullConstant(Op.getOperand(1))) {
53191 Op = Op.getOperand(0);
53192 UseSubVector = true;
53193 }
53194 SmallVector<SDValue, 2> SrcOps;
53195 SmallVector<int, 16> SrcMask, ScaledMask;
53196 SDValue BC = peekThroughBitcasts(Op);
53197 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53198 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53199 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53200 })) {
53201 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53202 if (!UseSubVector && SrcOps.size() <= 2 &&
53203 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53204 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53205 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53206 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53207 }
53208 if (UseSubVector && SrcOps.size() == 1 &&
53209 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53210 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53211 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53212 ShuffleMask.assign(Mask.begin(), Mask.end());
53213 }
53214 }
53215 };
53216
53217 // View LHS in the form
53218 // LHS = VECTOR_SHUFFLE A, B, LMask
53219 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53220 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53221 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53222 SDValue A, B;
53223 SmallVector<int, 16> LMask;
53224 GetShuffle(LHS, A, B, LMask);
53225
53226 // Likewise, view RHS in the form
53227 // RHS = VECTOR_SHUFFLE C, D, RMask
53228 SDValue C, D;
53229 SmallVector<int, 16> RMask;
53230 GetShuffle(RHS, C, D, RMask);
53231
53232 // At least one of the operands should be a vector shuffle.
53233 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53234 if (NumShuffles == 0)
53235 return false;
53236
53237 if (LMask.empty()) {
53238 A = LHS;
53239 for (unsigned i = 0; i != NumElts; ++i)
53240 LMask.push_back(i);
53241 }
53242
53243 if (RMask.empty()) {
53244 C = RHS;
53245 for (unsigned i = 0; i != NumElts; ++i)
53246 RMask.push_back(i);
53247 }
53248
53249 // If we have a unary mask, ensure the other op is set to null.
53250 if (isUndefOrInRange(LMask, 0, NumElts))
53251 B = SDValue();
53252 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53253 A = SDValue();
53254
53255 if (isUndefOrInRange(RMask, 0, NumElts))
53256 D = SDValue();
53257 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53258 C = SDValue();
53259
53260 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53261 // RHS operands and shuffle mask.
53262 if (A != C) {
53263 std::swap(C, D);
53264 ShuffleVectorSDNode::commuteShuffleMask(RMask, NumElts);
53265 }
53266 // Check that the shuffles are both shuffling the same vectors.
53267 if (!(A == C && B == D))
53268 return false;
53269
53270 PostShuffleMask.clear();
53271 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53272
53273 // LHS and RHS are now:
53274 // LHS = shuffle A, B, LMask
53275 // RHS = shuffle A, B, RMask
53276 // Check that the masks correspond to performing a horizontal operation.
53277 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53278 // so we just repeat the inner loop if this is a 256-bit op.
53279 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53280 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53281 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53282 assert((NumEltsPer128BitChunk % 2 == 0) &&
53283 "Vector type should have an even number of elements in each lane");
53284 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53285 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53286 // Ignore undefined components.
53287 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53288 if (LIdx < 0 || RIdx < 0 ||
53289 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53290 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53291 continue;
53292
53293 // Check that successive odd/even elements are being operated on. If not,
53294 // this is not a horizontal operation.
53295 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53296 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53297 return false;
53298
53299 // Compute the post-shuffle mask index based on where the element
53300 // is stored in the HOP result, and where it needs to be moved to.
53301 int Base = LIdx & ~1u;
53302 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53303 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53304
53305 // The low half of the 128-bit result must choose from A.
53306 // The high half of the 128-bit result must choose from B,
53307 // unless B is undef. In that case, we are always choosing from A.
53308 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53309 Index += NumEltsPer64BitChunk;
53310 PostShuffleMask[i + j] = Index;
53311 }
53312 }
53313
53314 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53315 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53316
53317 bool IsIdentityPostShuffle =
53318 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53319 if (IsIdentityPostShuffle)
53320 PostShuffleMask.clear();
53321
53322 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53323 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53324 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53325 return false;
53326
53327 // If the source nodes are already used in HorizOps then always accept this.
53328 // Shuffle folding should merge these back together.
53329 auto FoundHorizUser = [&](SDNode *User) {
53330 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53331 };
53332 ForceHorizOp =
53333 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53334 llvm::any_of(NewRHS->users(), FoundHorizUser));
53335
53336 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53337 // shuffle the result.
53338 if (!ForceHorizOp &&
53339 !shouldUseHorizontalOp(NewLHS == NewRHS &&
53340 (NumShuffles < 2 || !IsIdentityPostShuffle),
53341 DAG, Subtarget))
53342 return false;
53343
53344 LHS = DAG.getBitcast(VT, NewLHS);
53345 RHS = DAG.getBitcast(VT, NewRHS);
53346 return true;
53347}
53348
53349// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
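// e.g. (v4f32 (fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)))
// becomes (v4f32 (X86ISD::FHADD A, B)).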
53350static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
53351 const X86Subtarget &Subtarget) {
53352 EVT VT = N->getValueType(0);
53353 unsigned Opcode = N->getOpcode();
53354 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
53355 SmallVector<int, 8> PostShuffleMask;
53356
53357 auto MergableHorizOp = [N](unsigned HorizOpcode) {
53358 return N->hasOneUse() &&
53359 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
53360 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
53361 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
53362 };
53363
53364 switch (Opcode) {
53365 case ISD::FADD:
53366 case ISD::FSUB:
53367 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
53368 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
53369 SDValue LHS = N->getOperand(0);
53370 SDValue RHS = N->getOperand(1);
53371 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
53372 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53373 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53374 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
53375 if (!PostShuffleMask.empty())
53376 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53377 DAG.getUNDEF(VT), PostShuffleMask);
53378 return HorizBinOp;
53379 }
53380 }
53381 break;
53382 case ISD::ADD:
53383 case ISD::SUB:
53384 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
53385 VT == MVT::v16i16 || VT == MVT::v8i32)) {
53386 SDValue LHS = N->getOperand(0);
53387 SDValue RHS = N->getOperand(1);
53388 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
53389 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53390 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53391 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
53392 ArrayRef<SDValue> Ops) {
53393 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
53394 };
53395 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
53396 {LHS, RHS}, HOpBuilder);
53397 if (!PostShuffleMask.empty())
53398 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53399 DAG.getUNDEF(VT), PostShuffleMask);
53400 return HorizBinOp;
53401 }
53402 }
53403 break;
53404 }
53405
53406 return SDValue();
53407}
53408
53409// Try to combine the following nodes
53410// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
53411// <i32 -2147483648[float -0.000000e+00]> 0
53412// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
53413// <(load 4 from constant-pool)> t0, t29
53414// [t30: v16i32 = bitcast t27]
53415// t6: v16i32 = xor t7, t27[t30]
53416// t11: v16f32 = bitcast t6
53417// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
53418// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
53419// t22: v16f32 = bitcast t7
53420// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
53421// t24: v32f16 = bitcast t23
53422static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
53423 const X86Subtarget &Subtarget) {
53424 EVT VT = N->getValueType(0);
53425 SDValue LHS = N->getOperand(0);
53426 SDValue RHS = N->getOperand(1);
53427 int CombineOpcode =
53428 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
53429 auto combineConjugation = [&](SDValue &r) {
53430 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
53431 SDValue XOR = LHS.getOperand(0);
53432 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
53433 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
53434 if (XORRHS.isConstant()) {
53435 APInt ConjugationInt32 = APInt(32, 0x80000000);
53436 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
53437 if ((XORRHS.getBitWidth() == 32 &&
53438 XORRHS.getConstant() == ConjugationInt32) ||
53439 (XORRHS.getBitWidth() == 64 &&
53440 XORRHS.getConstant() == ConjugationInt64)) {
53441 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
53442 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
53443 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
53444 r = DAG.getBitcast(VT, FCMulC);
53445 return true;
53446 }
53447 }
53448 }
53449 }
53450 return false;
53451 };
53452 SDValue Res;
53453 if (combineConjugation(Res))
53454 return Res;
53455 std::swap(LHS, RHS);
53456 if (combineConjugation(Res))
53457 return Res;
53458 return Res;
53459}
53460
53461// Try to combine the following nodes:
53462// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
53463static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
53464 const X86Subtarget &Subtarget) {
53465 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
53466 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
53467 Flags.hasAllowContract();
53468 };
53469
53470 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
53471 return DAG.getTarget().Options.NoSignedZerosFPMath ||
53472 Flags.hasNoSignedZeros();
53473 };
53474 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
53475 APInt AI = APInt(32, 0x80008000);
53476 KnownBits Bits = DAG.computeKnownBits(Op);
53477 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
53478 Bits.getConstant() == AI;
53479 };
53480
53481 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
53482 !AllowContract(N->getFlags()))
53483 return SDValue();
53484
53485 EVT VT = N->getValueType(0);
53486 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
53487 return SDValue();
53488
53489 SDValue LHS = N->getOperand(0);
53490 SDValue RHS = N->getOperand(1);
53491 bool IsConj;
53492 SDValue FAddOp1, MulOp0, MulOp1;
53493 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
53494 &IsVectorAllNegativeZero,
53495 &HasNoSignedZero](SDValue N) -> bool {
53496 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
53497 return false;
53498 SDValue Op0 = N.getOperand(0);
53499 unsigned Opcode = Op0.getOpcode();
53500 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
53501 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
53502 MulOp0 = Op0.getOperand(0);
53503 MulOp1 = Op0.getOperand(1);
53504 IsConj = Opcode == X86ISD::VFCMULC;
53505 return true;
53506 }
53507 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
53508 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
53509 HasNoSignedZero(Op0->getFlags())) ||
53510 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
53511 MulOp0 = Op0.getOperand(0);
53512 MulOp1 = Op0.getOperand(1);
53513 IsConj = Opcode == X86ISD::VFCMADDC;
53514 return true;
53515 }
53516 }
53517 return false;
53518 };
53519
53520 if (GetCFmulFrom(LHS))
53521 FAddOp1 = RHS;
53522 else if (GetCFmulFrom(RHS))
53523 FAddOp1 = LHS;
53524 else
53525 return SDValue();
53526
53527 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
53528 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
53529 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
53530 // FIXME: How do we handle when fast math flags of FADD are different from
53531 // CFMUL's?
53532 SDValue CFmul =
53533 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
53534 return DAG.getBitcast(VT, CFmul);
53535}
53536
53537/// Do target-specific dag combines on floating-point adds/subs.
53538static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
53539 const X86Subtarget &Subtarget) {
53540 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
53541 return HOp;
53542
53543 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
53544 return COp;
53545
53546 return SDValue();
53547}
53548
53549static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
53550 const X86Subtarget &Subtarget) {
53551 EVT VT = N->getValueType(0);
53552 SDValue Src = N->getOperand(0);
53553 EVT SrcVT = Src.getValueType();
53554 SDLoc DL(N);
53555
53556 if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
53557 SrcVT != MVT::v2f32)
53558 return SDValue();
53559
53560 return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
53561 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
53562 DAG.getUNDEF(SrcVT)));
53563}
53564
53565/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
53566/// the codegen.
53567/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
53568/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
53569/// anything that is guaranteed to be transformed by DAGCombiner.
53570static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
53571 const X86Subtarget &Subtarget,
53572 const SDLoc &DL) {
53573 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
53574 SDValue Src = N->getOperand(0);
53575 unsigned SrcOpcode = Src.getOpcode();
53576 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53577
53578 EVT VT = N->getValueType(0);
53579 EVT SrcVT = Src.getValueType();
53580
53581 auto IsFreeTruncation = [VT](SDValue Op) {
53582 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
53583
53584 // See if this has been extended from a smaller/equal size to
53585 // the truncation size, allowing a truncation to combine with the extend.
53586 unsigned Opcode = Op.getOpcode();
53587 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
53588 Opcode == ISD::ZERO_EXTEND) &&
53589 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
53590 return true;
53591
53592 // See if this is a single use constant which can be constant folded.
52593 // NOTE: We don't peek through bitcasts here because there is currently
53594 // no support for constant folding truncate+bitcast+vector_of_constants. So
52595 // we'd just end up with a truncate on both operands which will
53596 // get turned back into (truncate (binop)) causing an infinite loop.
53597 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
53598 };
53599
53600 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
53601 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
53602 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
53603 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
53604 };
53605
53606 // Don't combine if the operation has other uses.
53607 if (!Src.hasOneUse())
53608 return SDValue();
53609
53610 // Only support vector truncation for now.
53611 // TODO: i64 scalar math would benefit as well.
53612 if (!VT.isVector())
53613 return SDValue();
53614
52615 // In most cases it's only worth pre-truncating if we're only facing the cost
53616 // of one truncation.
53617 // i.e. if one of the inputs will constant fold or the input is repeated.
53618 switch (SrcOpcode) {
53619 case ISD::MUL:
52620 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
53621 // better to truncate if we have the chance.
53622 if (SrcVT.getScalarType() == MVT::i64 &&
53623 TLI.isOperationLegal(SrcOpcode, VT) &&
53624 !TLI.isOperationLegal(SrcOpcode, SrcVT))
53625 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
53626 [[fallthrough]];
53627 case ISD::AND:
53628 case ISD::XOR:
53629 case ISD::OR:
53630 case ISD::ADD:
53631 case ISD::SUB: {
53632 SDValue Op0 = Src.getOperand(0);
53633 SDValue Op1 = Src.getOperand(1);
53634 if (TLI.isOperationLegal(SrcOpcode, VT) &&
53635 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
53636 return TruncateArithmetic(Op0, Op1);
53637 break;
53638 }
53639 }
53640
53641 return SDValue();
53642}
53643
53644// Try to form a MULHU or MULHS node by looking for
53645// (trunc (srl (mul ext, ext), 16))
53646// TODO: This is X86 specific because we want to be able to handle wide types
53647// before type legalization. But we can only do it if the vector will be
53648// legalized via widening/splitting. Type legalization can't handle promotion
53649// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
53650// combiner.
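// e.g. (v8i16 (trunc (srl (mul (zext v8i16 X), (zext v8i16 Y)), 16)))
// becomes (v8i16 (mulhu X, Y)).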
53651static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
53652 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
53653 using namespace llvm::SDPatternMatch;
53654
53655 if (!Subtarget.hasSSE2())
53656 return SDValue();
53657
53658 // Only handle vXi16 types that are at least 128-bits unless they will be
53659 // widened.
53660 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
53661 return SDValue();
53662
53663 // Input type should be at least vXi32.
53664 EVT InVT = Src.getValueType();
53665 if (InVT.getVectorElementType().getSizeInBits() < 32)
53666 return SDValue();
53667
53668 // First instruction should be a right shift by 16 of a multiply.
53669 SDValue LHS, RHS;
53670 if (!sd_match(Src,
53671 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_SpecificInt(16))))
53672 return SDValue();
53673
53674 // Count leading sign/zero bits on both inputs - if there are enough then
53675 // truncation back to vXi16 will be cheap - either as a pack/shuffle
53676 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
53677 // truncations may actually be free by peeking through to the ext source.
53678 auto IsSext = [&DAG](SDValue V) {
53679 return DAG.ComputeMaxSignificantBits(V) <= 16;
53680 };
53681 auto IsZext = [&DAG](SDValue V) {
53682 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
53683 };
53684
53685 bool IsSigned = IsSext(LHS) && IsSext(RHS);
53686 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
53687 if (!IsSigned && !IsUnsigned)
53688 return SDValue();
53689
53690 // Check if both inputs are extensions, which will be removed by truncation.
53691 auto isOpTruncateFree = [](SDValue Op) {
53692 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
53693 Op.getOpcode() == ISD::ZERO_EXTEND)
53694 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
53695 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
53696 };
53697 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
53698
53699 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
53700 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
53701 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
53702 // will have to split anyway.
53703 unsigned InSizeInBits = InVT.getSizeInBits();
53704 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
53705 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
53706 (InSizeInBits % 16) == 0) {
53707 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53708 InVT.getSizeInBits() / 16);
53709 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
53710 DAG.getBitcast(BCVT, RHS));
53711 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
53712 }
53713
53714 // Truncate back to source type.
53715 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
53716 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
53717
53718 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
53719 return DAG.getNode(Opc, DL, VT, LHS, RHS);
53720}
53721
53722// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
53723// from one vector with signed bytes from another vector, adds together
53724// adjacent pairs of 16-bit products, and saturates the result before
53725// truncating to 16-bits.
53726//
53727// Which looks something like this:
53728// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
53729// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
53730static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
53731 const X86Subtarget &Subtarget,
53732 const SDLoc &DL) {
53733 if (!VT.isVector() || !Subtarget.hasSSSE3())
53734 return SDValue();
53735
53736 unsigned NumElems = VT.getVectorNumElements();
53737 EVT ScalarVT = VT.getVectorElementType();
53738 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
53739 return SDValue();
53740
53741 SDValue SSatVal = detectSSatPattern(In, VT);
53742 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
53743 return SDValue();
53744
53745 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
53746 // of multiplies from even/odd elements.
53747 SDValue N0 = SSatVal.getOperand(0);
53748 SDValue N1 = SSatVal.getOperand(1);
53749
53750 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
53751 return SDValue();
53752
53753 SDValue N00 = N0.getOperand(0);
53754 SDValue N01 = N0.getOperand(1);
53755 SDValue N10 = N1.getOperand(0);
53756 SDValue N11 = N1.getOperand(1);
53757
53758 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
53759 // Canonicalize zero_extend to LHS.
53760 if (N01.getOpcode() == ISD::ZERO_EXTEND)
53761 std::swap(N00, N01);
53762 if (N11.getOpcode() == ISD::ZERO_EXTEND)
53763 std::swap(N10, N11);
53764
53765 // Ensure we have a zero_extend and a sign_extend.
53766 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
53767 N01.getOpcode() != ISD::SIGN_EXTEND ||
53768 N10.getOpcode() != ISD::ZERO_EXTEND ||
53769 N11.getOpcode() != ISD::SIGN_EXTEND)
53770 return SDValue();
53771
53772 // Peek through the extends.
53773 N00 = N00.getOperand(0);
53774 N01 = N01.getOperand(0);
53775 N10 = N10.getOperand(0);
53776 N11 = N11.getOperand(0);
53777
53778 // Ensure the extend is from vXi8.
53779 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
53780 N01.getValueType().getVectorElementType() != MVT::i8 ||
53781 N10.getValueType().getVectorElementType() != MVT::i8 ||
53782 N11.getValueType().getVectorElementType() != MVT::i8)
53783 return SDValue();
53784
53785 // All inputs should be build_vectors.
53786 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
53787 N01.getOpcode() != ISD::BUILD_VECTOR ||
53788 N10.getOpcode() != ISD::BUILD_VECTOR ||
53789 N11.getOpcode() != ISD::BUILD_VECTOR)
53790 return SDValue();
53791
53792 // N00/N10 are zero extended. N01/N11 are sign extended.
53793
53794 // For each element, we need to ensure we have an odd element from one vector
53795 // multiplied by the odd element of another vector and the even element from
53796 // one of the same vectors being multiplied by the even element from the
53797 // other vector. So we need to make sure for each element i, this operator
53798 // is being performed:
53799 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
53800 SDValue ZExtIn, SExtIn;
53801 for (unsigned i = 0; i != NumElems; ++i) {
53802 SDValue N00Elt = N00.getOperand(i);
53803 SDValue N01Elt = N01.getOperand(i);
53804 SDValue N10Elt = N10.getOperand(i);
53805 SDValue N11Elt = N11.getOperand(i);
53806 // TODO: Be more tolerant to undefs.
53807 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53808 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53809 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53810 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53811 return SDValue();
53812 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
53813 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
53814 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
53815 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
53816 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
53817 return SDValue();
53818 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53819 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53820 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53821 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53822 // Add is commutative so indices can be reordered.
53823 if (IdxN00 > IdxN10) {
53824 std::swap(IdxN00, IdxN10);
53825 std::swap(IdxN01, IdxN11);
53826 }
53827 // N0 indices must be the even element. N1 indices must be the next odd element.
53828 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
53829 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
53830 return SDValue();
53831 SDValue N00In = N00Elt.getOperand(0);
53832 SDValue N01In = N01Elt.getOperand(0);
53833 SDValue N10In = N10Elt.getOperand(0);
53834 SDValue N11In = N11Elt.getOperand(0);
53835 // First time we find an input capture it.
53836 if (!ZExtIn) {
53837 ZExtIn = N00In;
53838 SExtIn = N01In;
53839 }
53840 if (ZExtIn != N00In || SExtIn != N01In ||
53841 ZExtIn != N10In || SExtIn != N11In)
53842 return SDValue();
53843 }
53844
53845 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
53846 EVT ExtVT = Ext.getValueType();
53847 if (ExtVT.getVectorNumElements() != NumElems * 2) {
53848 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
53849 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
53850 DAG.getVectorIdxConstant(0, DL));
53851 }
53852 };
53853 ExtractVec(ZExtIn);
53854 ExtractVec(SExtIn);
53855
53856 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
53857 ArrayRef<SDValue> Ops) {
53858 // Shrink by adding truncate nodes and let DAGCombine fold with the
53859 // sources.
53860 EVT InVT = Ops[0].getValueType();
53861 assert(InVT.getScalarType() == MVT::i8 &&
53862 "Unexpected scalar element type");
53863 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
53864 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53865 InVT.getVectorNumElements() / 2);
53866 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
53867 };
53868 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
53869 PMADDBuilder);
53870}
53871
53872static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
53873 const X86Subtarget &Subtarget) {
53874 EVT VT = N->getValueType(0);
53875 SDValue Src = N->getOperand(0);
53876 SDLoc DL(N);
53877
53878 // Attempt to pre-truncate inputs to arithmetic ops instead.
53879 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
53880 return V;
53881
53882 // Try to detect PMADD
53883 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
53884 return PMAdd;
53885
53886 // Try to combine truncation with signed/unsigned saturation.
53887 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
53888 return Val;
53889
53890 // Try to combine PMULHUW/PMULHW for vXi16.
53891 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
53892 return V;
53893
53894 // The bitcast source is a direct mmx result.
53895 // Detect a truncated bitcast from x86mmx to i32.
53896 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
53897 SDValue BCSrc = Src.getOperand(0);
53898 if (BCSrc.getValueType() == MVT::x86mmx)
53899 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
53900 }
53901
53902 // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
53903 if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
53904 Src.hasOneUse())
53905 return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
53906
53907 return SDValue();
53908}
53909
53910static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
53911 TargetLowering::DAGCombinerInfo &DCI) {
53912 EVT VT = N->getValueType(0);
53913 SDValue In = N->getOperand(0);
53914 SDLoc DL(N);
53915
53916 if (SDValue SSatVal = detectSSatPattern(In, VT))
53917 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
53918 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
53919 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
53920
53921 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53922 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
53923 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53924 return SDValue(N, 0);
53925
53926 return SDValue();
53927}
53928
53929/// Returns the negated value if the node \p N flips sign of FP value.
53930///
53931/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53932/// or FSUB(0, x)
53933/// AVX512F does not have FXOR, so FNEG is lowered as
53934/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
53935/// In this case we go through all bitcasts.
53936/// This also recognizes splat of a negated value and returns the splat of that
53937/// value.
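/// For example, for v4f32 this matches
/// (bitcast (xor (bitcast X), splat 0x80000000)) and returns X.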
53938static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53939 if (N->getOpcode() == ISD::FNEG)
53940 return N->getOperand(0);
53941
53942 // Don't recurse exponentially.
53943  if (Depth > SelectionDAG::MaxRecursionDepth)
53944    return SDValue();
53945
53946 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53947
53948  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
53949  EVT VT = Op->getValueType(0);
53950
53951 // Make sure the element size doesn't change.
53952 if (VT.getScalarSizeInBits() != ScalarSize)
53953 return SDValue();
53954
53955 unsigned Opc = Op.getOpcode();
53956 switch (Opc) {
53957 case ISD::VECTOR_SHUFFLE: {
53958 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
53959 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
53960 if (!Op.getOperand(1).isUndef())
53961 return SDValue();
53962 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53963 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
53964 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
53965 cast<ShuffleVectorSDNode>(Op)->getMask());
53966 break;
53967 }
53968  case ISD::INSERT_VECTOR_ELT: {
53969    // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
53970 // -V, INDEX).
53971 SDValue InsVector = Op.getOperand(0);
53972 SDValue InsVal = Op.getOperand(1);
53973 if (!InsVector.isUndef())
53974 return SDValue();
53975 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
53976 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53977 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53978 NegInsVal, Op.getOperand(2));
53979 break;
53980 }
53981 case ISD::FSUB:
53982 case ISD::XOR:
53983 case X86ISD::FXOR: {
53984 SDValue Op1 = Op.getOperand(1);
53985 SDValue Op0 = Op.getOperand(0);
53986
53987 // For XOR and FXOR, we want to check if constant
53988 // bits of Op1 are sign bit masks. For FSUB, we
53989 // have to check if constant bits of Op0 are sign
53990 // bit masks and hence we swap the operands.
53991 if (Opc == ISD::FSUB)
53992 std::swap(Op0, Op1);
53993
53994 APInt UndefElts;
53995 SmallVector<APInt, 16> EltBits;
53996 // Extract constant bits and see if they are all
53997 // sign bit masks. Ignore the undef elements.
53998 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
53999 /* AllowWholeUndefs */ true,
54000 /* AllowPartialUndefs */ false)) {
54001 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54002 if (!UndefElts[I] && !EltBits[I].isSignMask())
54003 return SDValue();
54004
54005 // Only allow bitcast from correctly-sized constant.
54006 Op0 = peekThroughBitcasts(Op0);
54007 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54008 return Op0;
54009 }
54010 break;
54011 } // case
54012 } // switch
54013
54014 return SDValue();
54015}
54016
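// The rewrites below are plain algebra on the FMA forms, e.g.:
//   negate the product:     fma(a, b, c)  ->  fnmadd(a, b, c)  == -(a*b) + c
//   negate the accumulator: fma(a, b, c)  ->  fmsub(a, b, c)   ==  (a*b) - c
//   negate the result:      fma(a, b, c)  ->  fnmsub(a, b, c)  == -(a*b) - c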
54017static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54018 bool NegRes) {
54019 if (NegMul) {
54020 switch (Opcode) {
54021 // clang-format off
54022 default: llvm_unreachable("Unexpected opcode");
54023 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54024 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54025 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54026 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54027 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54028 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54029 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54030 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54031 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54032 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54033 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54034 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54035 // clang-format on
54036 }
54037 }
54038
54039 if (NegAcc) {
54040 switch (Opcode) {
54041 // clang-format off
54042 default: llvm_unreachable("Unexpected opcode");
54043 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54044 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54045 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54046 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54047 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54048 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54049 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54050 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54051 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54052 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54053 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54054 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54055 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54056 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54057 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54058 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54059 // clang-format on
54060 }
54061 }
54062
54063 if (NegRes) {
54064 switch (Opcode) {
54065    // For accuracy reasons, we never combine fneg and fma under strict FP.
54066 // clang-format off
54067 default: llvm_unreachable("Unexpected opcode");
54068 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54069 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54070 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54071 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54072 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54073 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54074 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54075 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54076 // clang-format on
54077 }
54078 }
54079
54080 return Opcode;
54081}
54082
54083/// Do target-specific dag combines on floating point negations.
54084static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54085                           TargetLowering::DAGCombinerInfo &DCI,
54086                           const X86Subtarget &Subtarget) {
54087 EVT OrigVT = N->getValueType(0);
54088 SDValue Arg = isFNEG(DAG, N);
54089 if (!Arg)
54090 return SDValue();
54091
54092 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54093 EVT VT = Arg.getValueType();
54094 EVT SVT = VT.getScalarType();
54095 SDLoc DL(N);
54096
54097 // Let legalize expand this if it isn't a legal type yet.
54098 if (!TLI.isTypeLegal(VT))
54099 return SDValue();
54100
54101 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54102 // use of a constant by performing (-0 - A*B) instead.
54103 // FIXME: Check rounding control flags as well once it becomes available.
54104 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54105 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54106 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54107 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54108 Arg.getOperand(1), Zero);
54109 return DAG.getBitcast(OrigVT, NewNode);
54110 }
54111
54112 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54113 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54114 if (SDValue NegArg =
54115 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54116 return DAG.getBitcast(OrigVT, NegArg);
54117
54118 return SDValue();
54119}
54120
54121SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54122                                                bool LegalOperations,
54123                                                bool ForCodeSize,
54124                                                NegatibleCost &Cost,
54125                                                unsigned Depth) const {
54126 // fneg patterns are removable even if they have multiple uses.
54127 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54128    Cost = NegatibleCost::Cheaper;
54129    return DAG.getBitcast(Op.getValueType(), Arg);
54130 }
54131
54132 EVT VT = Op.getValueType();
54133 EVT SVT = VT.getScalarType();
54134 unsigned Opc = Op.getOpcode();
54135 SDNodeFlags Flags = Op.getNode()->getFlags();
54136 switch (Opc) {
54137 case ISD::FMA:
54138 case X86ISD::FMSUB:
54139 case X86ISD::FNMADD:
54140 case X86ISD::FNMSUB:
54141 case X86ISD::FMADD_RND:
54142 case X86ISD::FMSUB_RND:
54143 case X86ISD::FNMADD_RND:
54144 case X86ISD::FNMSUB_RND: {
54145 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54146 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54147        !isOperationLegal(ISD::FMA, VT))
54148      break;
54149
54150 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54151 // if it may have signed zeros.
54152 if (!Flags.hasNoSignedZeros())
54153 break;
54154
54155 // This is always negatible for free but we might be able to remove some
54156 // extra operand negations as well.
54157    SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54158    for (int i = 0; i != 3; ++i)
54159 NewOps[i] = getCheaperNegatedExpression(
54160 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54161
54162 bool NegA = !!NewOps[0];
54163 bool NegB = !!NewOps[1];
54164 bool NegC = !!NewOps[2];
54165 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54166
54167 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54168                                  : NegatibleCost::Neutral;
54169
54170 // Fill in the non-negated ops with the original values.
54171 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54172 if (!NewOps[i])
54173 NewOps[i] = Op.getOperand(i);
54174 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54175 }
54176 case X86ISD::FRCP:
54177 if (SDValue NegOp0 =
54178 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54179 ForCodeSize, Cost, Depth + 1))
54180 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54181 break;
54182 }
54183
54184 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54185 ForCodeSize, Cost, Depth);
54186}
54187
54188static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54189                                 const X86Subtarget &Subtarget) {
54190 MVT VT = N->getSimpleValueType(0);
54191 // If we have integer vector types available, use the integer opcodes.
54192 if (!VT.isVector() || !Subtarget.hasSSE2())
54193 return SDValue();
54194
54195 SDLoc dl(N);
54196
54197 unsigned IntBits = VT.getScalarSizeInBits();
54198 MVT IntSVT = MVT::getIntegerVT(IntBits);
54199 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
54200
54201 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54202 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54203 unsigned IntOpcode;
54204 switch (N->getOpcode()) {
54205 // clang-format off
54206 default: llvm_unreachable("Unexpected FP logic op");
54207 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54208 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54209 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54210 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54211 // clang-format on
54212 }
54213 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54214 return DAG.getBitcast(VT, IntOp);
54215}
54216
54217
54218/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
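/// e.g. (xor (X86ISD::SETCC COND_E, EFLAGS), 1) --> (X86ISD::SETCC COND_NE, EFLAGS)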
54219static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
54220  if (N->getOpcode() != ISD::XOR)
54221 return SDValue();
54222
54223 SDValue LHS = N->getOperand(0);
54224 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54225 return SDValue();
54226
54227  X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54228      X86::CondCode(LHS->getConstantOperandVal(0)));
54229 SDLoc DL(N);
54230 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54231}
54232
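// Fold (xor/sub (ctlz_zero_undef X), BitWidth - 1) into X86ISD::BSR: for
// non-zero X, bsr(X) == (BitWidth - 1) - ctlz(X), and because ctlz(X) never
// exceeds BitWidth - 1, subtracting it from the all-ones value BitWidth - 1
// is the same as XORing it with BitWidth - 1.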
54233static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54234                                 const X86Subtarget &Subtarget) {
54235 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54236         "Invalid opcode for combining with CTLZ");
54237 if (Subtarget.hasFastLZCNT())
54238 return SDValue();
54239
54240 EVT VT = N->getValueType(0);
54241 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54242 (VT != MVT::i64 || !Subtarget.is64Bit()))
54243 return SDValue();
54244
54245 SDValue N0 = N->getOperand(0);
54246 SDValue N1 = N->getOperand(1);
54247
54248 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54249      N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54250    return SDValue();
54251
54252 SDValue OpCTLZ;
54253 SDValue OpSizeTM1;
54254
54255 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54256 OpCTLZ = N1;
54257 OpSizeTM1 = N0;
54258 } else if (N->getOpcode() == ISD::SUB) {
54259 return SDValue();
54260 } else {
54261 OpCTLZ = N0;
54262 OpSizeTM1 = N1;
54263 }
54264
54265 if (!OpCTLZ.hasOneUse())
54266 return SDValue();
54267 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54268 if (!C)
54269 return SDValue();
54270
54271 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54272 return SDValue();
54273 EVT OpVT = VT;
54274 SDValue Op = OpCTLZ.getOperand(0);
54275 if (VT == MVT::i8) {
54276 // Zero extend to i32 since there is not an i8 bsr.
54277 OpVT = MVT::i32;
54278 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
54279 }
54280
54281 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
54282 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
54283 if (VT == MVT::i8)
54284 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
54285
54286 return Op;
54287}
54288
54289static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
54290                          TargetLowering::DAGCombinerInfo &DCI,
54291                          const X86Subtarget &Subtarget) {
54292 SDValue N0 = N->getOperand(0);
54293 SDValue N1 = N->getOperand(1);
54294 EVT VT = N->getValueType(0);
54295 SDLoc DL(N);
54296
54297 // If this is SSE1 only convert to FXOR to avoid scalarization.
54298 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
54299 return DAG.getBitcast(MVT::v4i32,
54300 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
54301 DAG.getBitcast(MVT::v4f32, N0),
54302 DAG.getBitcast(MVT::v4f32, N1)));
54303 }
54304
54305 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
54306 return Cmp;
54307
54308 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
54309 return R;
54310
54311 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
54312 return R;
54313
54314 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
54315 return R;
54316
54317 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
54318 DAG, DCI, Subtarget))
54319 return FPLogic;
54320
54321 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
54322 return R;
54323
54324 if (DCI.isBeforeLegalizeOps())
54325 return SDValue();
54326
54327 if (SDValue SetCC = foldXor1SetCC(N, DAG))
54328 return SetCC;
54329
54330 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
54331 return R;
54332
54333 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
54334 return RV;
54335
54336 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
54337 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54338 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
54339 N0.getOperand(0).getValueType().isVector() &&
54340 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54341 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
54342 return DAG.getBitcast(
54343 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
54344 }
54345
54346 // Handle AVX512 mask widening.
54347 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
54348 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
54349 VT.getVectorElementType() == MVT::i1 &&
54350      N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
54351      TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
54352    return DAG.getNode(
54353        ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
54354        DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
54355 N0.getOperand(2));
54356 }
54357
54358 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
54359 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
54360 // TODO: Under what circumstances could this be performed in DAGCombine?
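  // Both zext and trunc distribute over xor, so the inner xor can be pulled
  // through the extend/truncate and its constant folded together with c2.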
54361 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
54362 N0.getOperand(0).getOpcode() == N->getOpcode()) {
54363 SDValue TruncExtSrc = N0.getOperand(0);
54364 auto *N1C = dyn_cast<ConstantSDNode>(N1);
54365 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
54366 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
54367 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
54368 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
54369 return DAG.getNode(ISD::XOR, DL, VT, LHS,
54370 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
54371 }
54372 }
54373
54374 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
54375 return R;
54376
54377 return combineFneg(N, DAG, DCI, Subtarget);
54378}
54379
54380static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
54381                                 TargetLowering::DAGCombinerInfo &DCI,
54382                                 const X86Subtarget &Subtarget) {
54383 SDValue N0 = N->getOperand(0);
54384 EVT VT = N->getValueType(0);
54385
54386 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
54387 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
54388 SDValue Src = N0.getOperand(0);
54389 EVT SrcVT = Src.getValueType();
54390 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
54391 (DCI.isBeforeLegalize() ||
54392 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
54393 Subtarget.hasSSSE3()) {
54394 unsigned NumElts = SrcVT.getVectorNumElements();
54395 SmallVector<int, 32> ReverseMask(NumElts);
54396 for (unsigned I = 0; I != NumElts; ++I)
54397 ReverseMask[I] = (NumElts - 1) - I;
54398 SDValue Rev =
54399 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
54400 return DAG.getBitcast(VT, Rev);
54401 }
54402 }
54403
54404 return SDValue();
54405}
54406
54407// Various combines to try to convert to avgceilu.
54408static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
54409                          TargetLowering::DAGCombinerInfo &DCI,
54410                          const X86Subtarget &Subtarget) {
54411 unsigned Opcode = N->getOpcode();
54412 SDValue N0 = N->getOperand(0);
54413 SDValue N1 = N->getOperand(1);
54414 EVT VT = N->getValueType(0);
54415 EVT SVT = VT.getScalarType();
54416 SDLoc DL(N);
54417
54418 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
54419 // Only useful on vXi8 which doesn't have good SRA handling.
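  // XORing each lane with the 0x80 sign mask maps signed order onto unsigned
  // order, so the unsigned rounding-up average of the biased values, biased
  // back again, equals the signed rounding-up average.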
54420 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
54421    APInt SignBit = APInt::getSignMask(8);
54422    SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
54423 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
54424 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
54425 return DAG.getNode(ISD::XOR, DL, VT,
54426 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
54427 }
54428
54429 return SDValue();
54430}
54431
54434 const X86Subtarget &Subtarget) {
54435 EVT VT = N->getValueType(0);
54436 unsigned NumBits = VT.getSizeInBits();
54437
54438 // TODO - Constant Folding.
54439
54440 // Simplify the inputs.
54441 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54442 APInt DemandedMask(APInt::getAllOnes(NumBits));
54443 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54444 return SDValue(N, 0);
54445
54446 return SDValue();
54447}
54448
54449static bool isNullFPScalarOrVectorConst(SDValue V) {
54450  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
54451}
54452
54453/// If a value is a scalar FP zero or a vector FP zero (potentially including
54454/// undefined elements), return a zero constant that may be used to fold away
54455/// that value. In the case of a vector, the returned constant will not contain
54456/// undefined elements even if the input parameter does. This makes it suitable
54457/// to be used as a replacement operand with operations (eg, bitwise-and) where
54458/// an undef should not propagate.
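/// For example, a v4f32 build_vector of <0.0, undef, 0.0, 0.0> is replaced by
/// a fully defined all-zeros v4f32 constant.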
54459static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
54460                                        const X86Subtarget &Subtarget) {
54461  if (!isNullFPScalarOrVectorConst(V))
54462    return SDValue();
54463
54464 if (V.getValueType().isVector())
54465 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
54466
54467 return V;
54468}
54469
54470static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
54471                                      const X86Subtarget &Subtarget) {
54472 SDValue N0 = N->getOperand(0);
54473 SDValue N1 = N->getOperand(1);
54474 EVT VT = N->getValueType(0);
54475 SDLoc DL(N);
54476
54477 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
54478 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
54479 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
54480 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
54481 return SDValue();
54482
54483 auto isAllOnesConstantFP = [](SDValue V) {
54484 if (V.getSimpleValueType().isVector())
54485 return ISD::isBuildVectorAllOnes(V.getNode());
54486 auto *C = dyn_cast<ConstantFPSDNode>(V);
54487 return C && C->getConstantFPValue()->isAllOnesValue();
54488 };
54489
54490 // fand (fxor X, -1), Y --> fandn X, Y
54491 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
54492 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
54493
54494 // fand X, (fxor Y, -1) --> fandn Y, X
54495 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
54496 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
54497
54498 return SDValue();
54499}
54500
54501/// Do target-specific dag combines on X86ISD::FAND nodes.
54502static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
54503                           const X86Subtarget &Subtarget) {
54504 // FAND(0.0, x) -> 0.0
54505 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
54506 return V;
54507
54508 // FAND(x, 0.0) -> 0.0
54509 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54510 return V;
54511
54512 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
54513 return V;
54514
54515 return lowerX86FPLogicOp(N, DAG, Subtarget);
54516}
54517
54518/// Do target-specific dag combines on X86ISD::FANDN nodes.
54519static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
54520                            const X86Subtarget &Subtarget) {
54521 // FANDN(0.0, x) -> x
54522 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54523 return N->getOperand(1);
54524
54525 // FANDN(x, 0.0) -> 0.0
54526 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54527 return V;
54528
54529 return lowerX86FPLogicOp(N, DAG, Subtarget);
54530}
54531
54532/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
54533static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
54534                          TargetLowering::DAGCombinerInfo &DCI,
54535                          const X86Subtarget &Subtarget) {
54536 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
54537
54538 // F[X]OR(0.0, x) -> x
54539 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54540 return N->getOperand(1);
54541
54542 // F[X]OR(x, 0.0) -> x
54543 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
54544 return N->getOperand(0);
54545
54546 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
54547 return NewVal;
54548
54549 return lowerX86FPLogicOp(N, DAG, Subtarget);
54550}
54551
54552/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
54553static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
54554  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
54555
54556 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
54557 if (!DAG.getTarget().Options.NoNaNsFPMath ||
54558      !DAG.getTarget().Options.NoSignedZerosFPMath)
54559    return SDValue();
54560
54561 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
54562 // into FMINC and FMAXC, which are Commutative operations.
54563 unsigned NewOp = 0;
54564 switch (N->getOpcode()) {
54565 default: llvm_unreachable("unknown opcode");
54566 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
54567 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
54568 }
54569
54570 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
54571 N->getOperand(0), N->getOperand(1));
54572}
54573
54574static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
54575                                     const X86Subtarget &Subtarget) {
54576 EVT VT = N->getValueType(0);
54577 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
54578 return SDValue();
54579
54580 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54581
54582 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
54583 (Subtarget.hasSSE2() && VT == MVT::f64) ||
54584 (Subtarget.hasFP16() && VT == MVT::f16) ||
54585 (VT.isVector() && TLI.isTypeLegal(VT))))
54586 return SDValue();
54587
54588 SDValue Op0 = N->getOperand(0);
54589 SDValue Op1 = N->getOperand(1);
54590 SDLoc DL(N);
54591 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
54592
54593 // If we don't have to respect NaN inputs, this is a direct translation to x86
54594 // min/max instructions.
54595 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
54596 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54597
54598 // If one of the operands is known non-NaN use the native min/max instructions
54599 // with the non-NaN input as second operand.
54600 if (DAG.isKnownNeverNaN(Op1))
54601 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54602 if (DAG.isKnownNeverNaN(Op0))
54603 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
54604
54605 // If we have to respect NaN inputs, this takes at least 3 instructions.
54606 // Favor a library call when operating on a scalar and minimizing code size.
54607 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
54608 return SDValue();
54609
54610 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
54611 VT);
54612
54613 // There are 4 possibilities involving NaN inputs, and these are the required
54614 // outputs:
54615 // Op1
54616 // Num NaN
54617 // ----------------
54618 // Num | Max | Op0 |
54619 // Op0 ----------------
54620 // NaN | Op1 | NaN |
54621 // ----------------
54622 //
54623 // The SSE FP max/min instructions were not designed for this case, but rather
54624 // to implement:
54625 // Min = Op1 < Op0 ? Op1 : Op0
54626 // Max = Op1 > Op0 ? Op1 : Op0
54627 //
54628 // So they always return Op0 if either input is a NaN. However, we can still
54629 // use those instructions for fmaxnum by selecting away a NaN input.
54630
54631 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
54632 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
54633 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
54634
54635 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
54636 // are NaN, the NaN value of Op1 is the result.
54637 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
54638}
54639
54642 EVT VT = N->getValueType(0);
54643 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54644
54645 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54646 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54647 return SDValue(N, 0);
54648
54649 // Convert a full vector load into vzload when not all bits are needed.
54650 SDValue In = N->getOperand(0);
54651 MVT InVT = In.getSimpleValueType();
54652 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
54653 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
54654 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54655 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
54656 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
54657 MVT MemVT = MVT::getIntegerVT(NumBits);
54658 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
54659 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
54660 SDLoc dl(N);
54661 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
54662 DAG.getBitcast(InVT, VZLoad));
54663 DCI.CombineTo(N, Convert);
54664 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54665      DCI.recursivelyDeleteUnusedNodes(LN);
54666      return SDValue(N, 0);
54667 }
54668 }
54669
54670 return SDValue();
54671}
54672
54676 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
54677 EVT VT = N->getValueType(0);
54678
54679 // Convert a full vector load into vzload when not all bits are needed.
54680 SDValue In = N->getOperand(IsStrict ? 1 : 0);
54681 MVT InVT = In.getSimpleValueType();
54682 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
54683 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
54684 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54685 LoadSDNode *LN = cast<LoadSDNode>(In);
54686 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
54687 MVT MemVT = MVT::getFloatingPointVT(NumBits);
54688 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
54689 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
54690 SDLoc dl(N);
54691 if (IsStrict) {
54692 SDValue Convert =
54693 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
54694 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
54695 DCI.CombineTo(N, Convert, Convert.getValue(1));
54696 } else {
54697 SDValue Convert =
54698 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
54699 DCI.CombineTo(N, Convert);
54700 }
54701 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54702      DCI.recursivelyDeleteUnusedNodes(LN);
54703      return SDValue(N, 0);
54704 }
54705 }
54706
54707 return SDValue();
54708}
54709
54710/// Do target-specific dag combines on X86ISD::ANDNP nodes.
54711static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
54712                            TargetLowering::DAGCombinerInfo &DCI,
54713                            const X86Subtarget &Subtarget) {
54714 SDValue N0 = N->getOperand(0);
54715 SDValue N1 = N->getOperand(1);
54716 MVT VT = N->getSimpleValueType(0);
54717 int NumElts = VT.getVectorNumElements();
54718 unsigned EltSizeInBits = VT.getScalarSizeInBits();
54719 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54720 SDLoc DL(N);
54721
54722 // ANDNP(undef, x) -> 0
54723 // ANDNP(x, undef) -> 0
54724 if (N0.isUndef() || N1.isUndef())
54725 return DAG.getConstant(0, DL, VT);
54726
54727 // ANDNP(0, x) -> x
54728  if (ISD::isBuildVectorAllZeros(N0.getNode()))
54729    return N1;
54730
54731 // ANDNP(x, 0) -> 0
54732  if (ISD::isBuildVectorAllZeros(N1.getNode()))
54733    return DAG.getConstant(0, DL, VT);
54734
54735 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
54736  if (ISD::isBuildVectorAllOnes(N1.getNode()))
54737    return DAG.getNOT(DL, N0, VT);
54738
54739 // Turn ANDNP back to AND if input is inverted.
54740 if (SDValue Not = IsNOT(N0, DAG))
54741 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
54742
54743 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
54744 // to make use of predicated selects.
54745 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
54746 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
54747 SDValue Src = N0.getOperand(0);
54748 EVT SrcVT = Src.getValueType();
54749 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
54750 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
54751 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
54752 getZeroVector(VT, Subtarget, DAG, DL));
54753 }
54754
54755 // Constant Folding
54756 APInt Undefs0, Undefs1;
54757 SmallVector<APInt> EltBits0, EltBits1;
54758 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
54759 /*AllowWholeUndefs*/ true,
54760 /*AllowPartialUndefs*/ true)) {
54761 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
54762 /*AllowWholeUndefs*/ true,
54763 /*AllowPartialUndefs*/ true)) {
54764 SmallVector<APInt> ResultBits;
54765 for (int I = 0; I != NumElts; ++I)
54766 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
54767 return getConstVector(ResultBits, VT, DAG, DL);
54768 }
54769
54770 // Constant fold NOT(N0) to allow us to use AND.
54771 // Ensure this is only performed if we can confirm that the bitcasted source
54772    // has one use, to prevent an infinite loop with canonicalizeBitSelect.
54773 if (N0->hasOneUse()) {
54774      SDValue BC0 = peekThroughOneUseBitcasts(N0);
54775      if (BC0.getOpcode() != ISD::BITCAST) {
54776 for (APInt &Elt : EltBits0)
54777 Elt = ~Elt;
54778 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
54779 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
54780 }
54781 }
54782 }
54783
54784 // Attempt to recursively combine a bitmask ANDNP with shuffles.
54785 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
54786 SDValue Op(N, 0);
54787 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54788 return Res;
54789
54790 // If either operand is a constant mask, then only the elements that aren't
54791 // zero are actually demanded by the other operand.
54792 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
54793 APInt UndefElts;
54794 SmallVector<APInt> EltBits;
54795 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
54796 APInt DemandedElts = APInt::getAllOnes(NumElts);
54797 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
54798 EltBits)) {
54799 DemandedBits.clearAllBits();
54800 DemandedElts.clearAllBits();
54801 for (int I = 0; I != NumElts; ++I) {
54802 if (UndefElts[I]) {
54803 // We can't assume an undef src element gives an undef dst - the
54804 // other src might be zero.
54805 DemandedBits.setAllBits();
54806 DemandedElts.setBit(I);
54807 } else if ((Invert && !EltBits[I].isAllOnes()) ||
54808 (!Invert && !EltBits[I].isZero())) {
54809 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
54810 DemandedElts.setBit(I);
54811 }
54812 }
54813 }
54814 return std::make_pair(DemandedBits, DemandedElts);
54815 };
54816 APInt Bits0, Elts0;
54817 APInt Bits1, Elts1;
54818 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
54819 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
54820
54821 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
54822 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
54823 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
54824 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
54825 if (N->getOpcode() != ISD::DELETED_NODE)
54826 DCI.AddToWorklist(N);
54827 return SDValue(N, 0);
54828 }
54829 }
54830
54831 // Folds for better commutativity:
54832 if (N1->hasOneUse()) {
54833 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
54834 if (SDValue Not = IsNOT(N1, DAG))
54835 return DAG.getNOT(
54836 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
54837
54838 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
54839 // Zero out elements by setting the PSHUFB mask value to 0xFF.
54840 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
54841      SDValue BC1 = peekThroughOneUseBitcasts(N1);
54842      if (BC1.getOpcode() == X86ISD::PSHUFB) {
54843 EVT ShufVT = BC1.getValueType();
54844 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
54845 DAG.getBitcast(ShufVT, N0));
54846 SDValue NewShuf =
54847 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
54848 return DAG.getBitcast(VT, NewShuf);
54849 }
54850 }
54851 }
54852
54853 return SDValue();
54854}
54855
54856static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
54857                         TargetLowering::DAGCombinerInfo &DCI) {
54858  SDValue N1 = N->getOperand(1);
54859
54860 // BT ignores high bits in the bit index operand.
54861 unsigned BitWidth = N1.getValueSizeInBits();
54862  APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
54863  if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
54864 if (N->getOpcode() != ISD::DELETED_NODE)
54865 DCI.AddToWorklist(N);
54866 return SDValue(N, 0);
54867 }
54868
54869 return SDValue();
54870}
54871
54872static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
54873                               TargetLowering::DAGCombinerInfo &DCI) {
54874  bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
54875 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54876
54877 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
54878 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54879 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
54880 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
54881 if (N->getOpcode() != ISD::DELETED_NODE)
54882 DCI.AddToWorklist(N);
54883 return SDValue(N, 0);
54884 }
54885
54886 // Convert a full vector load into vzload when not all bits are needed.
54887 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
54888 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
54889 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
54890 SDLoc dl(N);
54891 if (IsStrict) {
54892 SDValue Convert = DAG.getNode(
54893 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
54894 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
54895 DCI.CombineTo(N, Convert, Convert.getValue(1));
54896 } else {
54897 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
54898 DAG.getBitcast(MVT::v8i16, VZLoad));
54899 DCI.CombineTo(N, Convert);
54900 }
54901
54902 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54903        DCI.recursivelyDeleteUnusedNodes(LN);
54904        return SDValue(N, 0);
54905 }
54906 }
54907 }
54908
54909 return SDValue();
54910}
54911
54912// Try to combine sext_in_reg of a cmov of constants by extending the constants.
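// e.g. (sext_in_reg (cmov 255, 1, cond), i8) --> (cmov -1, 1, cond), because
// sign-extending the low i8 of 255 yields -1.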
54913static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
54914  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54915
54916 EVT DstVT = N->getValueType(0);
54917
54918 SDValue N0 = N->getOperand(0);
54919 SDValue N1 = N->getOperand(1);
54920 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54921
54922 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
54923 return SDValue();
54924
54925 // Look through single use any_extends / truncs.
54926 SDValue IntermediateBitwidthOp;
54927 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
54928 N0.hasOneUse()) {
54929 IntermediateBitwidthOp = N0;
54930 N0 = N0.getOperand(0);
54931 }
54932
54933 // See if we have a single use cmov.
54934 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
54935 return SDValue();
54936
54937 SDValue CMovOp0 = N0.getOperand(0);
54938 SDValue CMovOp1 = N0.getOperand(1);
54939
54940 // Make sure both operands are constants.
54941 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54942 !isa<ConstantSDNode>(CMovOp1.getNode()))
54943 return SDValue();
54944
54945 SDLoc DL(N);
54946
54947  // If we looked through an any_extend/trunc above, apply that op to the constants.
54948 if (IntermediateBitwidthOp) {
54949 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
54950 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
54951 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
54952 }
54953
54954 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
54955 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
54956
54957 EVT CMovVT = DstVT;
54958 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
54959 if (DstVT == MVT::i16) {
54960 CMovVT = MVT::i32;
54961 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
54962 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
54963 }
54964
54965 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
54966 N0.getOperand(2), N0.getOperand(3));
54967
54968 if (CMovVT != DstVT)
54969 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
54970
54971 return CMov;
54972}
54973
54974static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
54975                                      const X86Subtarget &Subtarget) {
54976 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54977
54978 if (SDValue V = combineSextInRegCmov(N, DAG))
54979 return V;
54980
54981 EVT VT = N->getValueType(0);
54982 SDValue N0 = N->getOperand(0);
54983 SDValue N1 = N->getOperand(1);
54984 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54985 SDLoc dl(N);
54986
54987  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
54988  // SSE and AVX2 since there is no sign-extended shift right
54989 // operation on a vector with 64-bit elements.
54990 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
54991 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
54992 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
54993 N0.getOpcode() == ISD::SIGN_EXTEND)) {
54994 SDValue N00 = N0.getOperand(0);
54995
54996    // EXTLOAD has a better solution on AVX2: it may be replaced with an
54997    // X86ISD::VSEXT node.
54998 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
54999 if (!ISD::isNormalLoad(N00.getNode()))
55000 return SDValue();
55001
55002 // Attempt to promote any comparison mask ops before moving the
55003 // SIGN_EXTEND_INREG in the way.
55004 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55005 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55006
55007 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55008 SDValue Tmp =
55009 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55010 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55011 }
55012 }
55013 return SDValue();
55014}
55015
55016/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55017/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55018/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55019/// opportunities to combine math ops, use an LEA, or use a complex addressing
55020/// mode. This can eliminate extend, add, and shift instructions.
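/// For example, (i64 sext (i32 add nsw X, 5)) becomes
/// (i64 add nsw (i64 sext X), 5), and the wider add plus any following shift
/// can then fold into a single LEA.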
55021static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55022                                   const X86Subtarget &Subtarget) {
55023 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55024 Ext->getOpcode() != ISD::ZERO_EXTEND)
55025 return SDValue();
55026
55027 // TODO: This should be valid for other integer types.
55028 EVT VT = Ext->getValueType(0);
55029 if (VT != MVT::i64)
55030 return SDValue();
55031
55032 SDValue Add = Ext->getOperand(0);
55033 if (Add.getOpcode() != ISD::ADD)
55034 return SDValue();
55035
55036 SDValue AddOp0 = Add.getOperand(0);
55037 SDValue AddOp1 = Add.getOperand(1);
55038 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55039 bool NSW = Add->getFlags().hasNoSignedWrap();
55040 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55041 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55042 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55043
55044 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55045 // into the 'zext'
55046 if ((Sext && !NSW) || (!Sext && !NUW))
55047 return SDValue();
55048
55049 // Having a constant operand to the 'add' ensures that we are not increasing
55050 // the instruction count because the constant is extended for free below.
55051 // A constant operand can also become the displacement field of an LEA.
55052 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55053 if (!AddOp1C)
55054 return SDValue();
55055
55056 // Don't make the 'add' bigger if there's no hope of combining it with some
55057 // other 'add' or 'shl' instruction.
55058 // TODO: It may be profitable to generate simpler LEA instructions in place
55059 // of single 'add' instructions, but the cost model for selecting an LEA
55060 // currently has a high threshold.
55061 bool HasLEAPotential = false;
55062 for (auto *User : Ext->users()) {
55063 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55064 HasLEAPotential = true;
55065 break;
55066 }
55067 }
55068 if (!HasLEAPotential)
55069 return SDValue();
55070
55071 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55072 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55073 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55074 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55075
55076 // The wider add is guaranteed to not wrap because both operands are
55077 // sign-extended.
55078 SDNodeFlags Flags;
55079 Flags.setNoSignedWrap(NSW);
55080 Flags.setNoUnsignedWrap(NUW);
55081 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55082}
55083
55084// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55085// operands and the result of CMOV is not used anywhere else - promote CMOV
55086// itself instead of promoting its result. This could be beneficial, because:
55087// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55088// (or more) pseudo-CMOVs only when they go one-after-another and
55089// getting rid of result extension code after CMOV will help that.
55090// 2) Promotion of constant CMOV arguments is free, hence the
55091// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55092// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
55093// promotion is also good in terms of code-size.
55094// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
55095// promotion).
55096static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55097  SDValue CMovN = Extend->getOperand(0);
55098 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55099 return SDValue();
55100
55101 EVT TargetVT = Extend->getValueType(0);
55102 unsigned ExtendOpcode = Extend->getOpcode();
55103 SDLoc DL(Extend);
55104
55105 EVT VT = CMovN.getValueType();
55106 SDValue CMovOp0 = CMovN.getOperand(0);
55107 SDValue CMovOp1 = CMovN.getOperand(1);
55108
55109 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55110 !isa<ConstantSDNode>(CMovOp1.getNode()))
55111 return SDValue();
55112
55113 // Only extend to i32 or i64.
55114 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55115 return SDValue();
55116
55117 // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32
55118 // are free.
55119 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55120 return SDValue();
55121
55122 // If this a zero extend to i64, we should only extend to i32 and use a free
55123 // zero extend to finish.
55124 EVT ExtendVT = TargetVT;
55125 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55126 ExtendVT = MVT::i32;
55127
55128 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55129 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55130
55131 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55132 CMovN.getOperand(2), CMovN.getOperand(3));
55133
55134 // Finish extending if needed.
55135 if (ExtendVT != TargetVT)
55136 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55137
55138 return Res;
55139}
55140
55141// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55142// result type.
55143static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55144                               const X86Subtarget &Subtarget) {
55145 SDValue N0 = N->getOperand(0);
55146 EVT VT = N->getValueType(0);
55147 SDLoc dl(N);
55148
55149 // Only do this combine with AVX512 for vector extends.
55150 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55151 return SDValue();
55152
55153 // Only combine legal element types.
55154 EVT SVT = VT.getVectorElementType();
55155 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55156 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55157 return SDValue();
55158
55159  // We don't have a CMPP instruction for vXf16.
55160 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55161 return SDValue();
55162  // We can only do this if the vector size is 256 bits or less.
55163 unsigned Size = VT.getSizeInBits();
55164 if (Size > 256 && Subtarget.useAVX512Regs())
55165 return SDValue();
55166
55167 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55168  // those are the only integer compares we have.
55169 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55170  if (CC != ISD::SETEQ && CC != ISD::SETGT)
55171    return SDValue();
55172
55173 // Only do this combine if the extension will be fully consumed by the setcc.
55174 EVT N00VT = N0.getOperand(0).getValueType();
55175 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55176 if (Size != MatchingVecType.getSizeInBits())
55177 return SDValue();
55178
55179 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55180
55181 if (N->getOpcode() == ISD::ZERO_EXTEND)
55182 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55183
55184 return Res;
55185}
55186
55187static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55188                           TargetLowering::DAGCombinerInfo &DCI,
55189                           const X86Subtarget &Subtarget) {
55190 SDValue N0 = N->getOperand(0);
55191 EVT VT = N->getValueType(0);
55192 SDLoc DL(N);
55193
55194 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55195 if (!DCI.isBeforeLegalizeOps() &&
55196      N0.getOpcode() == X86ISD::SETCC_CARRY) {
55197    SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55198 N0->getOperand(1));
55199 bool ReplaceOtherUses = !N0.hasOneUse();
55200 DCI.CombineTo(N, Setcc);
55201 // Replace other uses with a truncate of the widened setcc_carry.
55202 if (ReplaceOtherUses) {
55203 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55204 N0.getValueType(), Setcc);
55205 DCI.CombineTo(N0.getNode(), Trunc);
55206 }
55207
55208 return SDValue(N, 0);
55209 }
55210
55211 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55212 return NewCMov;
55213
55214 if (!DCI.isBeforeLegalizeOps())
55215 return SDValue();
55216
55217 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55218 return V;
55219
55220 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55221 DAG, DCI, Subtarget))
55222 return V;
55223
55224 if (VT.isVector()) {
55225 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55226 return R;
55227
55229 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55230 }
55231
55232 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55233 return NewAdd;
55234
55235 return SDValue();
55236}
55237
55238// Inverting a constant vector is profitable if it can be eliminated and the
55239// inverted vector is already present in DAG. Otherwise, it will be loaded
55240// anyway.
55241//
55242// We determine which of the values can be completely eliminated and invert it.
55243// If both are eliminable, select a vector with the first negative element.
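// e.g. if both <1.0, 2.0> and <-1.0, -2.0> are used only by FMAs, the form
// whose leading (non-undef) element is negative is the one that is kept, so
// repeated queries make a consistent choice.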
55244static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
55245  assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
55246         "ConstantFP build vector expected");
55247 // Check if we can eliminate V. We assume if a value is only used in FMAs, we
55248  // can eliminate it, since this function is invoked for each FMA with this
55249  // vector.
55250 auto IsNotFMA = [](SDNode *User) {
55251 return User->getOpcode() != ISD::FMA &&
55252 User->getOpcode() != ISD::STRICT_FMA;
55253 };
55254 if (llvm::any_of(V->users(), IsNotFMA))
55255 return SDValue();
55256
55257  SmallVector<SDValue, 8> Ops;
55258  EVT VT = V.getValueType();
55259 EVT EltVT = VT.getVectorElementType();
55260 for (const SDValue &Op : V->op_values()) {
55261 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55262 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55263 } else {
55264 assert(Op.isUndef());
55265 Ops.push_back(DAG.getUNDEF(EltVT));
55266 }
55267 }
55268
55269 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
55270 if (!NV)
55271 return SDValue();
55272
55273 // If an inverted version cannot be eliminated, choose it instead of the
55274 // original version.
55275 if (llvm::any_of(NV->users(), IsNotFMA))
55276 return SDValue(NV, 0);
55277
55278 // If the inverted version also can be eliminated, we have to consistently
55279  // prefer one of the values. We prefer the constant whose first element is
55280  // negative.
55281 // N.B. We need to skip undefs that may precede a value.
55282 for (const SDValue &Op : V->op_values()) {
55283 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55284 if (Cst->isNegative())
55285 return SDValue();
55286 break;
55287 }
55288 }
55289 return SDValue(NV, 0);
55290}
55291
55292static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
55293                          TargetLowering::DAGCombinerInfo &DCI,
55294                          const X86Subtarget &Subtarget) {
55295 SDLoc dl(N);
55296 EVT VT = N->getValueType(0);
55298 bool IsStrict = N->isTargetOpcode()
55299 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
55300 : N->isStrictFPOpcode();
55301
55302 // Let legalize expand this if it isn't a legal type yet.
55303 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55304 if (!TLI.isTypeLegal(VT))
55305 return SDValue();
55306
55307 SDValue A = N->getOperand(IsStrict ? 1 : 0);
55308 SDValue B = N->getOperand(IsStrict ? 2 : 1);
55309 SDValue C = N->getOperand(IsStrict ? 3 : 2);
55310
55311 // If the operation allows fast-math and the target does not support FMA,
55312 // split this into mul+add to avoid libcall(s).
55313 SDNodeFlags Flags = N->getFlags();
55314 if (!IsStrict && Flags.hasAllowReassociation() &&
55315 TLI.isOperationExpand(ISD::FMA, VT)) {
55316 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
55317 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
55318 }
55319
55320 EVT ScalarVT = VT.getScalarType();
55321 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
55322 !Subtarget.hasAnyFMA()) &&
55323 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
55324 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
55325 return SDValue();
55326
55327 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
55328 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
55329 bool LegalOperations = !DCI.isBeforeLegalizeOps();
55330 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
55331 CodeSize)) {
55332 V = NegV;
55333 return true;
55334 }
55335 // Look through extract_vector_elts. If it comes from an FNEG, create a
55336 // new extract from the FNEG input.
55337 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
55338 isNullConstant(V.getOperand(1))) {
55339 SDValue Vec = V.getOperand(0);
55340 if (SDValue NegV = TLI.getCheaperNegatedExpression(
55341 Vec, DAG, LegalOperations, CodeSize)) {
55342 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
55343 NegV, V.getOperand(1));
55344 return true;
55345 }
55346 }
55347 // Lookup if there is an inverted version of constant vector V in DAG.
55348 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
55349 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
55350 V = NegV;
55351 return true;
55352 }
55353 }
55354 return false;
55355 };
55356
55357 // Do not convert the passthru input of scalar intrinsics.
55358 // FIXME: We could allow negations of the lower element only.
55359 bool NegA = invertIfNegative(A);
55360 bool NegB = invertIfNegative(B);
55361 bool NegC = invertIfNegative(C);
55362
55363 if (!NegA && !NegB && !NegC)
55364 return SDValue();
55365
55366 unsigned NewOpcode =
55367 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
55368
55369 // Propagate fast-math-flags to new FMA node.
55370 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
55371 if (IsStrict) {
55372 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
55373 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
55374 {N->getOperand(0), A, B, C});
55375 } else {
55376 if (N->getNumOperands() == 4)
55377 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
55378 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
55379 }
55380}
55381
55382// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
55383// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
55384static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
55385                               TargetLowering::DAGCombinerInfo &DCI) {
55386  SDLoc dl(N);
55387 EVT VT = N->getValueType(0);
55388 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55389 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
55390 bool LegalOperations = !DCI.isBeforeLegalizeOps();
55391
55392 SDValue N2 = N->getOperand(2);
55393
55394 SDValue NegN2 =
55395 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
55396 if (!NegN2)
55397 return SDValue();
55398 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
55399
55400 if (N->getNumOperands() == 4)
55401 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
55402 NegN2, N->getOperand(3));
55403 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
55404 NegN2);
55405}
55406
55407static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
55408                           TargetLowering::DAGCombinerInfo &DCI,
55409                           const X86Subtarget &Subtarget) {
55410 SDLoc dl(N);
55411 SDValue N0 = N->getOperand(0);
55412 EVT VT = N->getValueType(0);
55413
55414 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55415 // FIXME: Is this needed? We don't seem to have any tests for it.
55416 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
55417      N0.getOpcode() == X86ISD::SETCC_CARRY) {
55418    SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
55419 N0->getOperand(1));
55420 bool ReplaceOtherUses = !N0.hasOneUse();
55421 DCI.CombineTo(N, Setcc);
55422 // Replace other uses with a truncate of the widened setcc_carry.
55423 if (ReplaceOtherUses) {
55424 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55425 N0.getValueType(), Setcc);
55426 DCI.CombineTo(N0.getNode(), Trunc);
55427 }
55428
55429 return SDValue(N, 0);
55430 }
55431
55432 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55433 return NewCMov;
55434
55435 if (DCI.isBeforeLegalizeOps())
55436 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55437 return V;
55438
55439 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
55440 DAG, DCI, Subtarget))
55441 return V;
55442
55443 if (VT.isVector())
55444 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
55445 return R;
55446
55447 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55448 return NewAdd;
55449
55450 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
55451 return R;
55452
55453 // TODO: Combine with any target/faux shuffle.
55454 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
55456 SDValue N00 = N0.getOperand(0);
55457 SDValue N01 = N0.getOperand(1);
55458 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
55459 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
55460 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
55461 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
55462 return concatSubVectors(N00, N01, DAG, dl);
55463 }
55464 }
55465
55466 return SDValue();
55467}
55468
55469/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
55470/// pre-promote its result type since vXi1 vectors don't get promoted
55471/// during type legalization.
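/// e.g. without BWI, (v16i1 setcc (v16i8 X), (v16i8 Y), cc) is emitted as
/// (v16i1 trunc (v16i8 setcc X, Y, cc)) so the compare itself stays on byte
/// elements.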
55472static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
55473                                        SDValue RHS, ISD::CondCode CC,
55474                                        const SDLoc &DL, SelectionDAG &DAG,
55475 const X86Subtarget &Subtarget) {
55476 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
55477 VT.getVectorElementType() == MVT::i1 &&
55478 (OpVT.getVectorElementType() == MVT::i8 ||
55479 OpVT.getVectorElementType() == MVT::i16)) {
55480 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
55481 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
55482 }
55483 return SDValue();
55484}
55485
55486static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
55487                            TargetLowering::DAGCombinerInfo &DCI,
55488                            const X86Subtarget &Subtarget) {
55489 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
55490 const SDValue LHS = N->getOperand(0);
55491 const SDValue RHS = N->getOperand(1);
55492 EVT VT = N->getValueType(0);
55493 EVT OpVT = LHS.getValueType();
55494 SDLoc DL(N);
55495
55496 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
55497 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
55498 Subtarget))
55499 return V;
55500
55501 if (VT == MVT::i1) {
55502 X86::CondCode X86CC;
55503 if (SDValue V =
55504 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
55505 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
55506 }
55507
55508 if (OpVT.isScalarInteger()) {
55509 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
55510 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
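      // or(X,Y) == X exactly when Y sets no bits outside X, i.e. when
      // and(~X,Y) == 0, so the compare against X becomes a compare against 0.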
55511 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
55512 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
55513 if (N0.getOperand(0) == N1)
55514 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
55515 N0.getOperand(1));
55516 if (N0.getOperand(1) == N1)
55517 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
55518 N0.getOperand(0));
55519 }
55520 return SDValue();
55521 };
55522 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
55523 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55524 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
55525 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55526
55527 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
55528 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
55529 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
55530 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
55531 if (N0.getOperand(0) == N1)
55532 return DAG.getNode(ISD::AND, DL, OpVT, N1,
55533 DAG.getNOT(DL, N0.getOperand(1), OpVT));
55534 if (N0.getOperand(1) == N1)
55535 return DAG.getNode(ISD::AND, DL, OpVT, N1,
55536 DAG.getNOT(DL, N0.getOperand(0), OpVT));
55537 }
55538 return SDValue();
55539 };
55540 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
55541 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55542 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
55543 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55544
55545 // cmpeq(trunc(x),C) --> cmpeq(x,C)
55546 // cmpne(trunc(x),C) --> cmpne(x,C)
55547 // iff x upper bits are zero.
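      // For example (illustrative), if the upper 24 bits of an i32 X are known
      // to be zero:
      //   cmpeq (trunc i32 X to i8), 7 --> cmpeq X, 7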
55548 if (LHS.getOpcode() == ISD::TRUNCATE &&
55549 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
55550 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
55551 EVT SrcVT = LHS.getOperand(0).getValueType();
55552        APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
55553                                                OpVT.getScalarSizeInBits());
55554 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55555 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
55556 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
55557 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
55558 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
55559 }
55560
55561      // With C as a power of 2 and C != 0 and C != INT_MIN:
55562      //    icmp eq Abs(X), C ->
55563      //        (icmp eq X, C) | (icmp eq X, -C)
55564      //    icmp ne Abs(X), C ->
55565      //        (icmp ne X, C) & (icmp ne X, -C)
55566 // Both of these patterns can be better optimized in
55567 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
55568 // integers which is checked above.
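      // For example (illustrative), with C == 4:
      //   icmp eq (abs X), 4 -> (icmp eq X, 4) | (icmp eq X, -4)
      //   icmp ne (abs X), 4 -> (icmp ne X, 4) & (icmp ne X, -4)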
55569 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
55570 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
55571 const APInt &CInt = C->getAPIntValue();
55572 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
55573 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
55574 SDValue BaseOp = LHS.getOperand(0);
55575 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
55576 SDValue SETCC1 = DAG.getSetCC(
55577 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
55578 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
55579 SETCC0, SETCC1);
55580 }
55581 }
55582 }
55583 }
55584 }
55585
55586 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
55588 // Using temporaries to avoid messing up operand ordering for later
55589 // transformations if this doesn't work.
55590 SDValue Op0 = LHS;
55591 SDValue Op1 = RHS;
55592 ISD::CondCode TmpCC = CC;
55593 // Put build_vector on the right.
55594 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
55595 std::swap(Op0, Op1);
55596 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
55597 }
55598
55599 bool IsSEXT0 =
55600 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
55601 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
55602 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
55603
55604 if (IsSEXT0 && IsVZero1) {
55605 assert(VT == Op0.getOperand(0).getValueType() &&
55606 "Unexpected operand type");
55607 if (TmpCC == ISD::SETGT)
55608 return DAG.getConstant(0, DL, VT);
55609 if (TmpCC == ISD::SETLE)
55610 return DAG.getConstant(1, DL, VT);
55611 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
55612 return DAG.getNOT(DL, Op0.getOperand(0), VT);
55613
55614 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
55615 "Unexpected condition code!");
55616 return Op0.getOperand(0);
55617 }
55618 }
55619
55620  // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
55621  // only signed comparisons (`PCMPGT`) are available, and on AVX512 it's often
55622  // better to use `PCMPGT` if the result is meant to stay in a vector (and if
55623  // it's going to a mask, there are signed AVX512 comparisons).
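  // For example (illustrative), when LHS and RHS are known to share sign bits
  // (e.g. both known non-negative):
  //   setuge X, 5  ->  setgt X, 4
  // which can then be lowered with PCMPGT.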
55624 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
55625 bool CanMakeSigned = false;
55626    if (ISD::isUnsignedIntSetCC(CC)) {
55627      KnownBits CmpKnown =
55628          DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
55629 // If we know LHS/RHS share the same sign bit at each element we can
55630 // make this signed.
55631 // NOTE: `computeKnownBits` on a vector type aggregates common bits
55632 // across all lanes. So a pattern where the sign varies from lane to
55633 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
55634 // missed. We could get around this by demanding each lane
55635 // independently, but this isn't the most important optimization and
55636 // that may eat into compile time.
55637 CanMakeSigned =
55638 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
55639 }
55640 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
55641 SDValue LHSOut = LHS;
55642 SDValue RHSOut = RHS;
55643 ISD::CondCode NewCC = CC;
55644 switch (CC) {
55645 case ISD::SETGE:
55646 case ISD::SETUGE:
55647 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
55648 /*NSW*/ true))
55649 LHSOut = NewLHS;
55650 else if (SDValue NewRHS = incDecVectorConstant(
55651 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
55652 RHSOut = NewRHS;
55653 else
55654 break;
55655
55656 [[fallthrough]];
55657 case ISD::SETUGT:
55658 NewCC = ISD::SETGT;
55659 break;
55660
55661 case ISD::SETLE:
55662 case ISD::SETULE:
55663 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
55664 /*NSW*/ true))
55665 LHSOut = NewLHS;
55666 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
55667 /*NSW*/ true))
55668 RHSOut = NewRHS;
55669 else
55670 break;
55671
55672 [[fallthrough]];
55673 case ISD::SETULT:
55674 // Will be swapped to SETGT in LowerVSETCC*.
55675 NewCC = ISD::SETLT;
55676 break;
55677 default:
55678 break;
55679 }
55680 if (NewCC != CC) {
55681 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
55682 NewCC, DL, DAG, Subtarget))
55683 return R;
55684 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
55685 }
55686 }
55687 }
55688
55689 if (SDValue R =
55690 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
55691 return R;
55692
55693 // In the middle end transforms:
55694 // `(or (icmp eq X, C), (icmp eq X, C+1))`
55695 // -> `(icmp ult (add x, -C), 2)`
55696 // Likewise inverted cases with `ugt`.
55697 //
55698  // Since x86, pre-AVX512, doesn't have unsigned vector compares, this results
55699 // in worse codegen. So, undo the middle-end transform and go back to `(or
55700 // (icmp eq), (icmp eq))` form.
55701 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
55702 // the xmm approach.
55703 //
55704  // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
55705  // ne))` as it doesn't end up saving any instructions.
55706 // TODO: We might want to do this for avx512 as well if we `sext` the result.
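  // For example (illustrative), the middle end turns
  //   (or (icmp eq X, 5), (icmp eq X, 6))
  // into
  //   (icmp ult (add X, -5), 2)
  // and this combine recreates the two equality compares so they can lower to
  // PCMPEQ.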
55707 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
55708 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
55709 !Subtarget.hasAVX512() &&
55710 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
55711 Subtarget.hasAVX2()) &&
55712 LHS.hasOneUse()) {
55713
55714 APInt CmpC;
55715 SDValue AddC = LHS.getOperand(1);
55716 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
55718 // See which form we have depending on the constant/condition.
55719 SDValue C0 = SDValue();
55720 SDValue C1 = SDValue();
55721
55722      // If we had `(add x, -1)` and can lower with `umin`, don't transform, as
55723      // we will end up generating an additional constant. Keeping it in the
55724      // current form has a slight latency cost, but it's probably worth saving a
55725      // constant.
55728 // Pass
55729 }
55730 // Normal Cases
55731 else if ((CC == ISD::SETULT && CmpC == 2) ||
55732 (CC == ISD::SETULE && CmpC == 1)) {
55733 // These will constant fold.
55734 C0 = DAG.getNegative(AddC, DL, OpVT);
55735 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
55736 DAG.getAllOnesConstant(DL, OpVT));
55737 }
55738 // Inverted Cases
55739 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
55740 (CC == ISD::SETUGE && (-CmpC) == 2)) {
55741 // These will constant fold.
55742 C0 = DAG.getNOT(DL, AddC, OpVT);
55743 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
55744 DAG.getAllOnesConstant(DL, OpVT));
55745 }
55746 if (C0 && C1) {
55747 SDValue NewLHS =
55748 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
55749 SDValue NewRHS =
55750 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
55751 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
55752 }
55753 }
55754 }
55755
55756 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
55757 // to avoid scalarization via legalization because v4i32 is not a legal type.
55758 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
55759 LHS.getValueType() == MVT::v4f32)
55760 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
55761
55762 // X pred 0.0 --> X pred -X
55763 // If the negation of X already exists, use it in the comparison. This removes
55764 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
55765 // instructions in patterns with a 'select' node.
55767 SDVTList FNegVT = DAG.getVTList(OpVT);
55768 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
55769 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
55770 }
55771
55772 return SDValue();
55773}
55774
55777 const X86Subtarget &Subtarget) {
55778 SDValue Src = N->getOperand(0);
55779 MVT SrcVT = Src.getSimpleValueType();
55780 MVT VT = N->getSimpleValueType(0);
55781 unsigned NumBits = VT.getScalarSizeInBits();
55782 unsigned NumElts = SrcVT.getVectorNumElements();
55783 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
55784 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
55785
55786 // Perform constant folding.
55787 APInt UndefElts;
55788 SmallVector<APInt, 32> EltBits;
55789 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
55790 /*AllowWholeUndefs*/ true,
55791 /*AllowPartialUndefs*/ true)) {
55792 APInt Imm(32, 0);
55793 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
55794 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
55795 Imm.setBit(Idx);
55796
55797 return DAG.getConstant(Imm, SDLoc(N), VT);
55798 }
55799
55800 // Look through int->fp bitcasts that don't change the element width.
55801 unsigned EltWidth = SrcVT.getScalarSizeInBits();
55802 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
55803 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
55804 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
55805
55806 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
55807 // with scalar comparisons.
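  // The "not" is applied by XORing the MOVMSK result with a mask of NumElts low
  // set bits, e.g. (illustrative) for v4i32: movmsk(not(x)) == movmsk(x) ^ 0xF.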
55808 if (SDValue NotSrc = IsNOT(Src, DAG)) {
55809 SDLoc DL(N);
55810 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
55811 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
55812 return DAG.getNode(ISD::XOR, DL, VT,
55813 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
55814 DAG.getConstant(NotMask, DL, VT));
55815 }
55816
55817 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
55818 // results with scalar comparisons.
55819 if (Src.getOpcode() == X86ISD::PCMPGT &&
55820 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
55821 SDLoc DL(N);
55822 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
55823 return DAG.getNode(ISD::XOR, DL, VT,
55824 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
55825 DAG.getConstant(NotMask, DL, VT));
55826 }
55827
55828 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
55829 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
55830 // iff pow2splat(c1).
55831 // Use KnownBits to determine if only a single bit is non-zero
55832 // in each element (pow2 or zero), and shift that bit to the msb.
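  // For example (illustrative), for v16i8 where c1 is a splat of 0x10 (only
  // bit 4 of each and(x,c1) element can be set): countMinLeadingZeros() == 3,
  // so shifting left by 3 moves bit 4 into bit 7 (the sign bit) where MOVMSK
  // reads it.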
55833 if (Src.getOpcode() == X86ISD::PCMPEQ) {
55834 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
55835 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
55836 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
55837 if (KnownLHS.countMaxPopulation() == 1 &&
55838 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
55839 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
55840 SDLoc DL(N);
55841 MVT ShiftVT = SrcVT;
55842 SDValue ShiftLHS = Src.getOperand(0);
55843 SDValue ShiftRHS = Src.getOperand(1);
55844 if (ShiftVT.getScalarType() == MVT::i8) {
55845 // vXi8 shifts - we only care about the signbit so can use PSLLW.
55846 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
55847 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
55848 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
55849 }
55850 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
55851 ShiftLHS, ShiftAmt, DAG);
55852 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
55853 ShiftRHS, ShiftAmt, DAG);
55854 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
55855 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
55856 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
55857 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
55858 }
55859 }
55860
55861 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
55862 if (N->isOnlyUserOf(Src.getNode())) {
55863    SDValue SrcBC = peekThroughBitcasts(Src);
55864    if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
55865 APInt UndefElts;
55866 SmallVector<APInt, 32> EltBits;
55867 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
55868 UndefElts, EltBits)) {
55869 APInt Mask = APInt::getZero(NumBits);
55870 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
55871 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
55872 Mask.setBit(Idx);
55873 }
55874 SDLoc DL(N);
55875 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
55876 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
55877 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
55878 DAG.getConstant(Mask, DL, VT));
55879 }
55880 }
55881 }
55882
55883 // Simplify the inputs.
55884 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55885 APInt DemandedMask(APInt::getAllOnes(NumBits));
55886 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55887 return SDValue(N, 0);
55888
55889 return SDValue();
55890}
55891
55894 const X86Subtarget &Subtarget) {
55895 MVT VT = N->getSimpleValueType(0);
55896 unsigned NumBits = VT.getScalarSizeInBits();
55897
55898 // Simplify the inputs.
55899 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55900 APInt DemandedMask(APInt::getAllOnes(NumBits));
55901 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55902 return SDValue(N, 0);
55903
55904 return SDValue();
55905}
55906
55909 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
55910 SDValue Mask = MemOp->getMask();
55911
55912 // With vector masks we only demand the upper bit of the mask.
55913 if (Mask.getScalarValueSizeInBits() != 1) {
55914 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55915 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55916 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55917 if (N->getOpcode() != ISD::DELETED_NODE)
55918 DCI.AddToWorklist(N);
55919 return SDValue(N, 0);
55920 }
55921 }
55922
55923 return SDValue();
55924}
55925
55927 SDValue Index, SDValue Base, SDValue Scale,
55928 SelectionDAG &DAG) {
55929 SDLoc DL(GorS);
55930
55931 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
55932 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
55933 Gather->getMask(), Base, Index, Scale } ;
55934 return DAG.getMaskedGather(Gather->getVTList(),
55935 Gather->getMemoryVT(), DL, Ops,
55936 Gather->getMemOperand(),
55937 Gather->getIndexType(),
55938 Gather->getExtensionType());
55939 }
55940 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
55941 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
55942 Scatter->getMask(), Base, Index, Scale };
55943 return DAG.getMaskedScatter(Scatter->getVTList(),
55944 Scatter->getMemoryVT(), DL,
55945 Ops, Scatter->getMemOperand(),
55946 Scatter->getIndexType(),
55947 Scatter->isTruncatingStore());
55948}
55949
55952 SDLoc DL(N);
55953 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
55954 SDValue Index = GorS->getIndex();
55955 SDValue Base = GorS->getBasePtr();
55956 SDValue Scale = GorS->getScale();
55957 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55958
55959 if (DCI.isBeforeLegalize()) {
55960 unsigned IndexWidth = Index.getScalarValueSizeInBits();
55961
55962 // Shrink constant indices if they are larger than 32-bits.
55963 // Only do this before legalize types since v2i64 could become v2i32.
55964 // FIXME: We could check that the type is legal if we're after legalize
55965 // types, but then we would need to construct test cases where that happens.
55966    // FIXME: We could support more than just constant vectors, but we need to be
55967    // careful with costing. A truncate that can be optimized out would be fine.
55968 // Otherwise we might only want to create a truncate if it avoids a split.
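    // For example (illustrative), a v2i64 constant index <0, 4> can be
    // truncated to a v2i32 index <0, 4> since the values fit in signed 32 bits.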
55969 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
55970 if (BV->isConstant() && IndexWidth > 32 &&
55971 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55972 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55973 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55974 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55975 }
55976 }
55977
55978    // Shrink any sign/zero extends where the source is 32 bits or smaller and the
55979    // result is larger than 32 bits, if there are sufficient sign bits. Only do
55980    // this before legalize types to avoid creating illegal types in truncate.
55981 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
55982 Index.getOpcode() == ISD::ZERO_EXTEND) &&
55983 IndexWidth > 32 &&
55984 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
55985 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55986 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55987 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55988 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55989 }
55990 }
55991
55992 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
55993  // Try to move splat constant adders from the index operand to the base
55994  // pointer operand, taking care to multiply by the scale. We can only do
55995  // this when the index element type is the same as the pointer type.
55996  // Otherwise we would need to be sure the math doesn't wrap before the scale.
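  // For example (illustrative), with a pointer-sized index:
  //   gather(Base, add(Index, splat(C)), Scale)
  //     -> gather(Base + C * Scale, Index, Scale)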
55997 if (Index.getOpcode() == ISD::ADD &&
55998 Index.getValueType().getVectorElementType() == PtrVT &&
55999 isa<ConstantSDNode>(Scale)) {
56000 uint64_t ScaleAmt = Scale->getAsZExtVal();
56001 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
56002 BitVector UndefElts;
56003 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
56004 // FIXME: Allow non-constant?
56005 if (UndefElts.none()) {
56006 // Apply the scale.
56007 APInt Adder = C->getAPIntValue() * ScaleAmt;
56008 // Add it to the existing base.
56009 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56010 DAG.getConstant(Adder, DL, PtrVT));
56011 Index = Index.getOperand(0);
56012 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56013 }
56014 }
56015
56016 // It's also possible base is just a constant. In that case, just
56017 // replace it with 0 and move the displacement into the index.
56018 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
56019 isOneConstant(Scale)) {
56020 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
56021 // Combine the constant build_vector and the constant base.
56022 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
56023 Index.getOperand(1), Splat);
56024 // Add to the LHS of the original Index add.
56025 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
56026 Index.getOperand(0), Splat);
56027 Base = DAG.getConstant(0, DL, Base.getValueType());
56028 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56029 }
56030 }
56031 }
56032
56033 if (DCI.isBeforeLegalizeOps()) {
56034 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56035
56036 // Make sure the index is either i32 or i64
56037 if (IndexWidth != 32 && IndexWidth != 64) {
56038 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56039 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
56040 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56041 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56042 }
56043 }
56044
56045 // With vector masks we only demand the upper bit of the mask.
56046 SDValue Mask = GorS->getMask();
56047 if (Mask.getScalarValueSizeInBits() != 1) {
56048 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56049 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56050 if (N->getOpcode() != ISD::DELETED_NODE)
56051 DCI.AddToWorklist(N);
56052 return SDValue(N, 0);
56053 }
56054 }
56055
56056 return SDValue();
56057}
56058
56059// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
56061 const X86Subtarget &Subtarget) {
56062 SDLoc DL(N);
56063 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
56064 SDValue EFLAGS = N->getOperand(1);
56065
56066 // Try to simplify the EFLAGS and condition code operands.
56067 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
56068 return getSETCC(CC, Flags, DL, DAG);
56069
56070 return SDValue();
56071}
56072
56073/// Optimize branch condition evaluation.
56075 const X86Subtarget &Subtarget) {
56076 SDLoc DL(N);
56077 SDValue EFLAGS = N->getOperand(3);
56078 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
56079
56080 // Try to simplify the EFLAGS and condition code operands.
56081 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
56082 // RAUW them under us.
56083 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
56084 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
56085 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
56086 N->getOperand(1), Cond, Flags);
56087 }
56088
56089 return SDValue();
56090}
56091
56092// TODO: Could we move this to DAGCombine?
56094 SelectionDAG &DAG) {
56095 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
56096 // to optimize away operation when it's from a constant.
56097 //
56098 // The general transformation is:
56099 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
56100 // AND(VECTOR_CMP(x,y), constant2)
56101 // constant2 = UNARYOP(constant)
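  // For example (illustrative), with an all-ones/all-zeros v4i32 mask M:
  //   sint_to_fp (and M, <i32 1, i32 2, i32 3, i32 4>)
  //     -> bitcast (and M, bitcast <float 1.0, float 2.0, float 3.0, float 4.0>)
  // since each lane of the result is either UNARYOP(constant) or 0.0.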
56102
56103 // Early exit if this isn't a vector operation, the operand of the
56104 // unary operation isn't a bitwise AND, or if the sizes of the operations
56105 // aren't the same.
56106 EVT VT = N->getValueType(0);
56107 bool IsStrict = N->isStrictFPOpcode();
56108 unsigned NumEltBits = VT.getScalarSizeInBits();
56109 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56110 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
56111 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
56112 VT.getSizeInBits() != Op0.getValueSizeInBits())
56113 return SDValue();
56114
56115 // Now check that the other operand of the AND is a constant. We could
56116 // make the transformation for non-constant splats as well, but it's unclear
56117 // that would be a benefit as it would not eliminate any operations, just
56118 // perform one more step in scalar code before moving to the vector unit.
56119 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
56120 // Bail out if the vector isn't a constant.
56121 if (!BV->isConstant())
56122 return SDValue();
56123
56124 // Everything checks out. Build up the new and improved node.
56125 SDLoc DL(N);
56126 EVT IntVT = BV->getValueType(0);
56127 // Create a new constant of the appropriate type for the transformed
56128 // DAG.
56129 SDValue SourceConst;
56130 if (IsStrict)
56131 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
56132 {N->getOperand(0), SDValue(BV, 0)});
56133 else
56134 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
56135 // The AND node needs bitcasts to/from an integer vector type around it.
56136 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
56137 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
56138 MaskConst);
56139 SDValue Res = DAG.getBitcast(VT, NewAnd);
56140 if (IsStrict)
56141 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
56142 return Res;
56143 }
56144
56145 return SDValue();
56146}
56147
56148/// If we are converting a value to floating-point, try to replace scalar
56149/// truncate of an extracted vector element with a bitcast. This tries to keep
56150/// the sequence on XMM registers rather than moving between vector and GPRs.
56152 // TODO: This is currently only used by combineSIntToFP, but it is generalized
56153 // to allow being called by any similar cast opcode.
56154 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
56155 SDValue Trunc = N->getOperand(0);
56156 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
56157 return SDValue();
56158
56159 SDValue ExtElt = Trunc.getOperand(0);
56160 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
56161 !isNullConstant(ExtElt.getOperand(1)))
56162 return SDValue();
56163
56164 EVT TruncVT = Trunc.getValueType();
56165 EVT SrcVT = ExtElt.getValueType();
56166 unsigned DestWidth = TruncVT.getSizeInBits();
56167 unsigned SrcWidth = SrcVT.getSizeInBits();
56168 if (SrcWidth % DestWidth != 0)
56169 return SDValue();
56170
56171 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
56172 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
56173 unsigned VecWidth = SrcVecVT.getSizeInBits();
56174 unsigned NumElts = VecWidth / DestWidth;
56175 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
56176 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
56177 SDLoc DL(N);
56178 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
56179 BitcastVec, ExtElt.getOperand(1));
56180 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
56181}
56182
56184 const X86Subtarget &Subtarget) {
56185 bool IsStrict = N->isStrictFPOpcode();
56186 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56187 EVT VT = N->getValueType(0);
56188 EVT InVT = Op0.getValueType();
56189
56190  // Using i16 as an intermediate type is a bad idea, unless we have HW support
56191  // for it. Therefore for type sizes equal to or smaller than 32 bits, just go with i32.
56192 // if hasFP16 support:
56193 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
56194 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
56195 // else
56196 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
56197 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
56198 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
56199 unsigned ScalarSize = InVT.getScalarSizeInBits();
56200 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
56201 ScalarSize >= 64)
56202 return SDValue();
56203 SDLoc dl(N);
56204 EVT DstVT =
56205        EVT::getVectorVT(*DAG.getContext(),
56206                         (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
56207 : ScalarSize < 32 ? MVT::i32
56208 : MVT::i64,
56209 InVT.getVectorNumElements());
56210 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
56211 if (IsStrict)
56212 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56213 {N->getOperand(0), P});
56214 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56215 }
56216
56217 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
56218 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
56219 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
56220 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
56221 VT.getScalarType() != MVT::f16) {
56222 SDLoc dl(N);
56223 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
56224 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
56225
56226 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
56227 if (IsStrict)
56228 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56229 {N->getOperand(0), P});
56230 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56231 }
56232
56233 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
56234 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
56235 // the optimization here.
56236 SDNodeFlags Flags = N->getFlags();
56237 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
56238 if (IsStrict)
56239 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
56240 {N->getOperand(0), Op0});
56241 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
56242 }
56243
56244 return SDValue();
56245}
56246
56249 const X86Subtarget &Subtarget) {
56250 // First try to optimize away the conversion entirely when it's
56251 // conditionally from a constant. Vectors only.
56252 bool IsStrict = N->isStrictFPOpcode();
56254 return Res;
56255
56256 // Now move on to more general possibilities.
56257 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56258 EVT VT = N->getValueType(0);
56259 EVT InVT = Op0.getValueType();
56260
56261  // Using i16 as an intermediate type is a bad idea, unless we have HW support
56262  // for it. Therefore for type sizes equal to or smaller than 32 bits, just go with i32.
56263 // if hasFP16 support:
56264 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
56265 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
56266 // else
56267  //   SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
56268 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
56269 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
56270 unsigned ScalarSize = InVT.getScalarSizeInBits();
56271 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
56272 ScalarSize >= 64)
56273 return SDValue();
56274 SDLoc dl(N);
56275 EVT DstVT =
56276        EVT::getVectorVT(*DAG.getContext(),
56277                         (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
56278 : ScalarSize < 32 ? MVT::i32
56279 : MVT::i64,
56280 InVT.getVectorNumElements());
56281 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
56282 if (IsStrict)
56283 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56284 {N->getOperand(0), P});
56285 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56286 }
56287
56288 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
56289 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
56290 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
56291 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
56292 VT.getScalarType() != MVT::f16) {
56293 SDLoc dl(N);
56294 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
56295 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
56296 if (IsStrict)
56297 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56298 {N->getOperand(0), P});
56299 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56300 }
56301
56302 // Without AVX512DQ we only support i64 to float scalar conversion. For both
56303 // vectors and scalars, see if we know that the upper bits are all the sign
56304 // bit, in which case we can truncate the input to i32 and convert from that.
56305 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
56306 unsigned BitWidth = InVT.getScalarSizeInBits();
56307 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
56308 if (NumSignBits >= (BitWidth - 31)) {
56309 EVT TruncVT = MVT::i32;
56310 if (InVT.isVector())
56311 TruncVT = InVT.changeVectorElementType(TruncVT);
56312 SDLoc dl(N);
56313 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
56314 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
56315 if (IsStrict)
56316 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56317 {N->getOperand(0), Trunc});
56318 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
56319 }
56320 // If we're after legalize and the type is v2i32 we need to shuffle and
56321 // use CVTSI2P.
56322 assert(InVT == MVT::v2i64 && "Unexpected VT!");
56323 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
56324 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
56325 { 0, 2, -1, -1 });
56326 if (IsStrict)
56327 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
56328 {N->getOperand(0), Shuf});
56329 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
56330 }
56331 }
56332
56333 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
56334 // a 32-bit target where SSE doesn't support i64->FP operations.
56335 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
56336 Op0.getOpcode() == ISD::LOAD) {
56337 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
56338
56339 // This transformation is not supported if the result type is f16 or f128.
56340 if (VT == MVT::f16 || VT == MVT::f128)
56341 return SDValue();
56342
56343 // If we have AVX512DQ we can use packed conversion instructions unless
56344 // the VT is f80.
56345 if (Subtarget.hasDQI() && VT != MVT::f80)
56346 return SDValue();
56347
56348 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
56349 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
56350 std::pair<SDValue, SDValue> Tmp =
56351 Subtarget.getTargetLowering()->BuildFILD(
56352 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
56353 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
56354 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
56355 return Tmp.first;
56356 }
56357 }
56358
56359 if (IsStrict)
56360 return SDValue();
56361
56362 if (SDValue V = combineToFPTruncExtElt(N, DAG))
56363 return V;
56364
56365 return SDValue();
56366}
56367
56368// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
56370 const X86Subtarget &Subtarget) {
56371 if (!Subtarget.hasAVX10_2())
56372 return SDValue();
56373
56374 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
56375 EVT SrcVT = N->getOperand(0).getValueType();
56376 EVT DstVT = N->getValueType(0);
56377 SDLoc dl(N);
56378
56379 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
56380 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
56381
56382 // Concatenate the original v2f32 input and V2F32Value to create v4f32
56383 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
56384 N->getOperand(0), V2F32Value);
56385
56386 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
56387 if (IsSigned)
56388 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
56389
56390 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
56391 }
56392 return SDValue();
56393}
56394
56396 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
56397
56398 for (const SDNode *User : Flags->users()) {
56399    X86::CondCode CC;
56400    switch (User->getOpcode()) {
56401 default:
56402 // Be conservative.
56403 return true;
56404 case X86ISD::SETCC:
56406 CC = (X86::CondCode)User->getConstantOperandVal(0);
56407 break;
56408 case X86ISD::BRCOND:
56409 case X86ISD::CMOV:
56410 CC = (X86::CondCode)User->getConstantOperandVal(2);
56411 break;
56412 }
56413
56414 switch (CC) {
56415 // clang-format off
56416 default: break;
56417 case X86::COND_A: case X86::COND_AE:
56418 case X86::COND_B: case X86::COND_BE:
56419 case X86::COND_O: case X86::COND_NO:
56420 case X86::COND_G: case X86::COND_GE:
56421 case X86::COND_L: case X86::COND_LE:
56422 return true;
56423 // clang-format on
56424 }
56425 }
56426
56427 return false;
56428}
56429
56430static bool onlyZeroFlagUsed(SDValue Flags) {
56431 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
56432
56433 for (const SDNode *User : Flags->users()) {
56434 unsigned CCOpNo;
56435 switch (User->getOpcode()) {
56436 default:
56437 // Be conservative.
56438 return false;
56439 case X86ISD::SETCC:
56441 CCOpNo = 0;
56442 break;
56443 case X86ISD::BRCOND:
56444 case X86ISD::CMOV:
56445 CCOpNo = 2;
56446 break;
56447 }
56448
56449 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
56450 if (CC != X86::COND_E && CC != X86::COND_NE)
56451 return false;
56452 }
56453
56454 return true;
56455}
56456
56459 const X86Subtarget &Subtarget) {
56460 // Only handle test patterns.
56461 if (!isNullConstant(N->getOperand(1)))
56462 return SDValue();
56463
56464 // If we have a CMP of a truncated binop, see if we can make a smaller binop
56465 // and use its flags directly.
56466 // TODO: Maybe we should try promoting compares that only use the zero flag
56467 // first if we can prove the upper bits with computeKnownBits?
56468 SDLoc dl(N);
56469 SDValue Op = N->getOperand(0);
56470 EVT VT = Op.getValueType();
56471 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56472
56473 if (SDValue CMP =
56474 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
56475 return CMP;
56476
56477 // If we have a constant logical shift that's only used in a comparison
56478 // against zero turn it into an equivalent AND. This allows turning it into
56479 // a TEST instruction later.
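  // For example (illustrative), for an i32 value that is only tested against
  // zero:
  //   cmp (srl X, 8), 0  ->  test (and X, 0xFFFFFF00), 0
  //   cmp (shl X, 8), 0  ->  test (and X, 0x00FFFFFF), 0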
56480 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
56481 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
56482 onlyZeroFlagUsed(SDValue(N, 0))) {
56483 unsigned BitWidth = VT.getSizeInBits();
56484 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
56485 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
56486 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
56487 APInt Mask = Op.getOpcode() == ISD::SRL
56488 ? APInt::getHighBitsSet(BitWidth, MaskBits)
56489 : APInt::getLowBitsSet(BitWidth, MaskBits);
56490 if (Mask.isSignedIntN(32)) {
56491 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
56492 DAG.getConstant(Mask, dl, VT));
56493 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56494 DAG.getConstant(0, dl, VT));
56495 }
56496 }
56497 }
56498
56499 // If we're extracting from a avx512 bool vector and comparing against zero,
56500 // then try to just bitcast the vector to an integer to use TEST/BT directly.
56501 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
56502 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
56503 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
56504 SDValue Src = Op.getOperand(0);
56505 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56506 isNullConstant(Src.getOperand(1)) &&
56507 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
56508 SDValue BoolVec = Src.getOperand(0);
56509 unsigned ShAmt = 0;
56510 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
56511 ShAmt = BoolVec.getConstantOperandVal(1);
56512 BoolVec = BoolVec.getOperand(0);
56513 }
56514 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
56515 EVT VecVT = BoolVec.getValueType();
56516 unsigned BitWidth = VecVT.getVectorNumElements();
56517 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
56518 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
56519 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
56520 Op = DAG.getBitcast(BCVT, BoolVec);
56521 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
56522 DAG.getConstant(Mask, dl, BCVT));
56523 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56524 DAG.getConstant(0, dl, BCVT));
56525 }
56526 }
56527 }
56528
56529 // Peek through any zero-extend if we're only testing for a zero result.
56530 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
56531 SDValue Src = Op.getOperand(0);
56532 EVT SrcVT = Src.getValueType();
56533 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
56534 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
56535 DAG.getConstant(0, dl, SrcVT));
56536 }
56537
56538 // Look for a truncate.
56539 if (Op.getOpcode() != ISD::TRUNCATE)
56540 return SDValue();
56541
56542 SDValue Trunc = Op;
56543 Op = Op.getOperand(0);
56544
56545 // See if we can compare with zero against the truncation source,
56546 // which should help using the Z flag from many ops. Only do this for
56547 // i32 truncated op to prevent partial-reg compares of promoted ops.
56548 EVT OpVT = Op.getValueType();
56549 APInt UpperBits =
56550      APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
56551  if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
56552 onlyZeroFlagUsed(SDValue(N, 0))) {
56553 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56554 DAG.getConstant(0, dl, OpVT));
56555 }
56556
56557 // After this the truncate and arithmetic op must have a single use.
56558 if (!Trunc.hasOneUse() || !Op.hasOneUse())
56559 return SDValue();
56560
56561 unsigned NewOpc;
56562 switch (Op.getOpcode()) {
56563 default: return SDValue();
56564 case ISD::AND:
56565 // Skip and with constant. We have special handling for and with immediate
56566 // during isel to generate test instructions.
56567 if (isa<ConstantSDNode>(Op.getOperand(1)))
56568 return SDValue();
56569 NewOpc = X86ISD::AND;
56570 break;
56571 case ISD::OR: NewOpc = X86ISD::OR; break;
56572 case ISD::XOR: NewOpc = X86ISD::XOR; break;
56573 case ISD::ADD:
56574 // If the carry or overflow flag is used, we can't truncate.
56576 return SDValue();
56577 NewOpc = X86ISD::ADD;
56578 break;
56579 case ISD::SUB:
56580 // If the carry or overflow flag is used, we can't truncate.
56582 return SDValue();
56583 NewOpc = X86ISD::SUB;
56584 break;
56585 }
56586
56587 // We found an op we can narrow. Truncate its inputs.
56588 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
56589 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
56590
56591 // Use a X86 specific opcode to avoid DAG combine messing with it.
56592 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56593 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
56594
56595 // For AND, keep a CMP so that we can match the test pattern.
56596 if (NewOpc == X86ISD::AND)
56597 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56598 DAG.getConstant(0, dl, VT));
56599
56600 // Return the flags.
56601 return Op.getValue(1);
56602}
56603
56606 const X86Subtarget &ST) {
56607 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
56608 "Expected X86ISD::ADD or X86ISD::SUB");
56609
56610 SDLoc DL(N);
56611 SDValue LHS = N->getOperand(0);
56612 SDValue RHS = N->getOperand(1);
56613 MVT VT = LHS.getSimpleValueType();
56614 bool IsSub = X86ISD::SUB == N->getOpcode();
56615 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
56616
56617 if (IsSub && isOneConstant(N->getOperand(1)) && !N->hasAnyUseOfValue(0))
56618 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
56619 return CMP;
56620
56621 // If we don't use the flag result, simplify back to a generic ADD/SUB.
56622 if (!N->hasAnyUseOfValue(1)) {
56623 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
56624 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
56625 }
56626
56627 // Fold any similar generic ADD/SUB opcodes to reuse this node.
56628 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
56629 SDValue Ops[] = {N0, N1};
56630 SDVTList VTs = DAG.getVTList(N->getValueType(0));
56631 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
56632 SDValue Op(N, 0);
56633 if (Negate) {
56634 // Bail if this is only used by a user of the x86 add/sub.
56635 if (GenericAddSub->hasOneUse() &&
56636 GenericAddSub->user_begin()->isOnlyUserOf(N))
56637 return;
56638 Op = DAG.getNegative(Op, DL, VT);
56639 }
56640 DCI.CombineTo(GenericAddSub, Op);
56641 }
56642 };
56643 MatchGeneric(LHS, RHS, false);
56644 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
56645
56646 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
56647 // EFLAGS result doesn't change.
56648 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
56649 /*ZeroSecondOpOnly*/ true);
56650}
56651
56653 SDValue LHS = N->getOperand(0);
56654 SDValue RHS = N->getOperand(1);
56655 SDValue BorrowIn = N->getOperand(2);
56656
56657 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
56658 MVT VT = N->getSimpleValueType(0);
56659 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56660 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
56661 }
56662
56663 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
56664 // iff the flag result is dead.
56665 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
56666 !N->hasAnyUseOfValue(1))
56667 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56668 LHS.getOperand(1), BorrowIn);
56669
56670 return SDValue();
56671}
56672
56673// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
56676 SDValue LHS = N->getOperand(0);
56677 SDValue RHS = N->getOperand(1);
56678 SDValue CarryIn = N->getOperand(2);
56679 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
56680 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
56681
56682 // Canonicalize constant to RHS.
56683 if (LHSC && !RHSC)
56684 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
56685 CarryIn);
56686
56687 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
56688 // the result is either zero or one (depending on the input carry bit).
56689 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
56690 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
56691 // We don't have a good way to replace an EFLAGS use, so only do this when
56692 // dead right now.
56693 SDValue(N, 1).use_empty()) {
56694 SDLoc DL(N);
56695 EVT VT = N->getValueType(0);
56696 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
56697 SDValue Res1 = DAG.getNode(
56698 ISD::AND, DL, VT,
56700 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
56701 DAG.getConstant(1, DL, VT));
56702 return DCI.CombineTo(N, Res1, CarryOut);
56703 }
56704
56705 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
56706 // iff the flag result is dead.
56707 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
56708 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
56709 SDLoc DL(N);
56710 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
56711 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
56712 DAG.getConstant(0, DL, LHS.getValueType()),
56713 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
56714 }
56715
56716 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
56717 MVT VT = N->getSimpleValueType(0);
56718 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56719 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
56720 }
56721
56722 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
56723 // iff the flag result is dead.
56724 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
56725 !N->hasAnyUseOfValue(1))
56726 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56727 LHS.getOperand(1), CarryIn);
56728
56729 return SDValue();
56730}
56731
56733 const SDLoc &DL, EVT VT,
56734 const X86Subtarget &Subtarget) {
56735 using namespace SDPatternMatch;
56736
56737 // Example of pattern we try to detect:
56738 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
56739 //(add (build_vector (extract_elt t, 0),
56740 // (extract_elt t, 2),
56741 // (extract_elt t, 4),
56742 // (extract_elt t, 6)),
56743 // (build_vector (extract_elt t, 1),
56744 // (extract_elt t, 3),
56745 // (extract_elt t, 5),
56746 // (extract_elt t, 7)))
56747
56748 if (!Subtarget.hasSSE2())
56749 return SDValue();
56750
56751 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
56752 VT.getVectorNumElements() < 4 ||
56754 return SDValue();
56755
56756 SDValue Op0, Op1, Accum;
56757 if (!sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
56758 m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))) &&
56759 !sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
56760 m_Add(m_Value(Accum), m_AllOf(m_Opc(ISD::BUILD_VECTOR),
56761 m_Value(Op1))))))
56762 return SDValue();
56763
56764 // Check if one of Op0,Op1 is of the form:
56765 // (build_vector (extract_elt Mul, 0),
56766 // (extract_elt Mul, 2),
56767 // (extract_elt Mul, 4),
56768 // ...
56769 // the other is of the form:
56770 // (build_vector (extract_elt Mul, 1),
56771 // (extract_elt Mul, 3),
56772 // (extract_elt Mul, 5),
56773 // ...
56774 // and identify Mul.
56775 SDValue Mul;
56776 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
56777 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
56778 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
56779 // TODO: Be more tolerant to undefs.
56780 APInt Idx0L, Idx0H, Idx1L, Idx1H;
56781 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
56782 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
56783 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
56784 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
56785 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
56786 return SDValue();
56787 // Commutativity of mul allows factors of a product to reorder.
56788 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
56789 std::swap(Idx0L, Idx1L);
56790 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
56791 std::swap(Idx0H, Idx1H);
56792 // Commutativity of add allows pairs of factors to reorder.
56793 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
56794 std::swap(Idx0L, Idx0H);
56795 std::swap(Idx1L, Idx1H);
56796 }
56797 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
56798 Idx1H != 2 * i + 3)
56799 return SDValue();
56800 if (!Mul) {
56801 // First time an extract_elt's source vector is visited. Must be a MUL
56802      // with twice as many vector elements as the BUILD_VECTOR.
56803 // Both extracts must be from same MUL.
56804 Mul = Vec0L;
56805 if (Mul.getOpcode() != ISD::MUL ||
56806 Mul.getValueType().getVectorNumElements() != 2 * e)
56807 return SDValue();
56808 }
56809 // Check that the extract is from the same MUL previously seen.
56810 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
56811 return SDValue();
56812 }
56813
56814 // Check if the Mul source can be safely shrunk.
56815 ShrinkMode Mode;
56816 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
56817 Mode == ShrinkMode::MULU16)
56818 return SDValue();
56819
56820 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56821 VT.getVectorNumElements() * 2);
56822 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
56823 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
56824
56825 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
56826 ArrayRef<SDValue> Ops) {
56827 EVT InVT = Ops[0].getValueType();
56828 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
56829 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
56830 InVT.getVectorNumElements() / 2);
56831 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
56832 };
56833 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
56834 if (Accum)
56835 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
56836 return R;
56837}
56838
56839// Attempt to turn this pattern into PMADDWD.
56840// (add (mul (sext (build_vector)), (sext (build_vector))),
56841// (mul (sext (build_vector)), (sext (build_vector)))
56843 const SDLoc &DL, EVT VT,
56844 const X86Subtarget &Subtarget) {
56845 using namespace SDPatternMatch;
56846
56847 if (!Subtarget.hasSSE2())
56848 return SDValue();
56849
56850 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
56851 VT.getVectorNumElements() < 4 ||
56853 return SDValue();
56854
56855 // All inputs need to be sign extends.
56856 // TODO: Support ZERO_EXTEND from known positive?
56857 SDValue N00, N01, N10, N11;
56858 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
56859 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
56860 return SDValue();
56861
56862 // Must be extending from vXi16.
56863 EVT InVT = N00.getValueType();
56864 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
56865 N10.getValueType() != InVT || N11.getValueType() != InVT)
56866 return SDValue();
56867
56868 // All inputs should be build_vectors.
56869 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
56870 N01.getOpcode() != ISD::BUILD_VECTOR ||
56871 N10.getOpcode() != ISD::BUILD_VECTOR ||
56872      N11.getOpcode() != ISD::BUILD_VECTOR)
56873    return SDValue();
56874
56875 // For each element, we need to ensure we have an odd element from one vector
56876 // multiplied by the odd element of another vector and the even element from
56877 // one of the same vectors being multiplied by the even element from the
56878 // other vector. So we need to make sure for each element i, this operator
56879 // is being performed:
56880 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
56881 SDValue In0, In1;
56882 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
56883 SDValue N00Elt = N00.getOperand(i);
56884 SDValue N01Elt = N01.getOperand(i);
56885 SDValue N10Elt = N10.getOperand(i);
56886 SDValue N11Elt = N11.getOperand(i);
56887 // TODO: Be more tolerant to undefs.
56888 SDValue N00In, N01In, N10In, N11In;
56889 APInt IdxN00, IdxN01, IdxN10, IdxN11;
56890 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
56891 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
56892 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
56893 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
56894 return SDValue();
56895 // Add is commutative so indices can be reordered.
56896 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
56897 std::swap(IdxN00, IdxN10);
56898 std::swap(IdxN01, IdxN11);
56899 }
56900    // N0 indices must be the even element. N1 indices must be the next odd element.
56901 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
56902 IdxN11 != 2 * i + 1)
56903 return SDValue();
56904
56905 // First time we find an input capture it.
56906 if (!In0) {
56907 In0 = N00In;
56908 In1 = N01In;
56909
56910 // The input vectors must be at least as wide as the output.
56911      // If they are larger than the output, we extract a subvector below.
56912 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
56913 In1.getValueSizeInBits() < VT.getSizeInBits())
56914 return SDValue();
56915 }
56916 // Mul is commutative so the input vectors can be in any order.
56917 // Canonicalize to make the compares easier.
56918 if (In0 != N00In)
56919 std::swap(N00In, N01In);
56920 if (In0 != N10In)
56921 std::swap(N10In, N11In);
56922 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
56923 return SDValue();
56924 }
56925
56926 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
56927 ArrayRef<SDValue> Ops) {
56928 EVT OpVT = Ops[0].getValueType();
56929 assert(OpVT.getScalarType() == MVT::i16 &&
56930 "Unexpected scalar element type");
56931 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
56932 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
56933 OpVT.getVectorNumElements() / 2);
56934 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
56935 };
56936
56937 // If the output is narrower than an input, extract the low part of the input
56938 // vector.
56939 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56940 VT.getVectorNumElements() * 2);
56941 if (OutVT16.bitsLT(In0.getValueType())) {
56942 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
56943 DAG.getVectorIdxConstant(0, DL));
56944 }
56945 if (OutVT16.bitsLT(In1.getValueType())) {
56946 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
56947 DAG.getVectorIdxConstant(0, DL));
56948 }
56949 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
56950 PMADDBuilder);
56951}
56952
56953// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
56954// If upper element in each pair of both VPMADDWD are zero then we can merge
56955// the operand elements and use the implicit add of VPMADDWD.
56956// TODO: Add support for VPMADDUBSW (which isn't commutable).
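// For example (illustrative), if in every i32 pair only the even (lower) i16
// lane of X, Y, Z and W can be nonzero, then
//   add(vpmaddwd(X, Y), vpmaddwd(Z, W))
//     == vpmaddwd(interleave(even(X), even(Z)), interleave(even(Y), even(W)))
// because VPMADDWD's implicit pairwise add performs the outer addition.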
56958 const SDLoc &DL, EVT VT) {
56959 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
56960 return SDValue();
56961
56962 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
56963 if (VT.getSizeInBits() > 128)
56964 return SDValue();
56965
56966 unsigned NumElts = VT.getVectorNumElements();
56967 MVT OpVT = N0.getOperand(0).getSimpleValueType();
56969 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
56970
56971 bool Op0HiZero =
56972 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
56973 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
56974 bool Op1HiZero =
56975 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
56976 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
56977
56978 // TODO: Check for zero lower elements once we have actual codegen that
56979 // creates them.
56980 if (!Op0HiZero || !Op1HiZero)
56981 return SDValue();
56982
56983 // Create a shuffle mask packing the lower elements from each VPMADDWD.
56984 SmallVector<int> Mask;
56985 for (int i = 0; i != (int)NumElts; ++i) {
56986 Mask.push_back(2 * i);
56987 Mask.push_back(2 * (i + NumElts));
56988 }
56989
56990 SDValue LHS =
56991 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
56992 SDValue RHS =
56993 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
56994 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
56995}
56996
56997/// CMOV of constants requires materializing constant operands in registers.
56998/// Try to fold those constants into an 'add' instruction to reduce instruction
56999 /// count. We do this with CMOV rather than the generic 'select' because there are
57000/// earlier folds that may be used to turn select-of-constants into logic hacks.
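/// For example (illustrative): add (cmov 3, 7), x --> cmov (add x, 3), (add x, 7),
/// so both constants become add/LEA immediates instead of being materialized
/// into registers ahead of the cmov.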
57002 SelectionDAG &DAG,
57003 const X86Subtarget &Subtarget) {
57004 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57005 // better because we eliminate 1-2 instructions. This transform is still
57006 // an improvement without zero operands because we trade 2 move constants and
57007 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57008 // immediate asm operands (fit in 32-bits).
57009 auto isSuitableCmov = [](SDValue V) {
57010 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57011 return false;
57012 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57013 !isa<ConstantSDNode>(V.getOperand(1)))
57014 return false;
57015 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57016 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57017 V.getConstantOperandAPInt(1).isSignedIntN(32));
57018 };
57019
57020 // Match an appropriate CMOV as the first operand of the add.
57021 SDValue Cmov = N->getOperand(0);
57022 SDValue OtherOp = N->getOperand(1);
57023 if (!isSuitableCmov(Cmov))
57024 std::swap(Cmov, OtherOp);
57025 if (!isSuitableCmov(Cmov))
57026 return SDValue();
57027
57028 // Don't remove a load folding opportunity for the add. That would neutralize
57029 // any improvements from removing constant materializations.
57030 if (X86::mayFoldLoad(OtherOp, Subtarget))
57031 return SDValue();
57032
57033 EVT VT = N->getValueType(0);
57034 SDValue FalseOp = Cmov.getOperand(0);
57035 SDValue TrueOp = Cmov.getOperand(1);
57036
57037 // We will push the add through the select, but we can potentially do better
57038 // if we know there is another add in the sequence and this is pointer math.
57039 // In that case, we can absorb an add into the trailing memory op and avoid
57040 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57041 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
57042 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
57043 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
57044 all_of(N->users(), [&](SDNode *Use) {
57045 auto *MemNode = dyn_cast<MemSDNode>(Use);
57046 return MemNode && MemNode->getBasePtr().getNode() == N;
57047 })) {
57048 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
57049 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
57050 // it is possible that choosing op1 might be better.
57051 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
57052 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
57053 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
57054 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
57055 Cmov.getOperand(2), Cmov.getOperand(3));
57056 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
57057 }
57058
57059 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
57060 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
57061 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
57062 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
57063 Cmov.getOperand(3));
57064}
57065
57068 const X86Subtarget &Subtarget) {
57069 EVT VT = N->getValueType(0);
57070 SDValue Op0 = N->getOperand(0);
57071 SDValue Op1 = N->getOperand(1);
57072 SDLoc DL(N);
57073
57074 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
57075 return Select;
57076
57077 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
57078 return MAdd;
57079 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
57080 return MAdd;
57081 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
57082 return MAdd;
57083
57084 // Try to synthesize horizontal adds from adds of shuffles.
57085 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
57086 return V;
57087
57088 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
57089 // iff X and Y won't overflow.
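  // PSADBW of a vector against zero just sums the unsigned bytes of each
  // 64-bit lane, so adding the byte vectors first gives the same result as
  // long as the per-byte additions cannot wrap (checked via willNotOverflowAdd).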
57090 if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW &&
57093 if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) {
57094 MVT OpVT = Op0.getOperand(1).getSimpleValueType();
57095 SDValue Sum =
57096 DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0));
57097 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
57098 getZeroVector(OpVT, Subtarget, DAG, DL));
57099 }
57100 }
57101
57102 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
57103 // (sub Y, (sext (vXi1 X))).
57104 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
57105 // generic DAG combine without a legal type check, but adding this there
57106 // caused regressions.
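  // This holds because a vXi1 element zero-extends to 0/+1 and sign-extends to
  // 0/-1, so (add Y, zext(X)) is the same as (sub Y, sext(X)) element-wise.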
57107 if (VT.isVector()) {
57108 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57109 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
57110 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
57111 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
57112 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
57113 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
57114 }
57115
57116 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
57117 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
57118 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
57119 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
57120 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
57121 }
57122 }
57123
57124 // Peephole for 512-bit VPDPWSSD on non-VLX targets.
57125 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
57126 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
57127 using namespace SDPatternMatch;
57128 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
57129 if (sd_match(N, m_Add(m_Value(Accum),
57130 m_Node(ISD::CONCAT_VECTORS,
57132 m_Value(Lo1)),
57134 m_Value(Hi1)))))) {
57135 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
57136 concatSubVectors(Lo0, Hi0, DAG, DL),
57137 concatSubVectors(Lo1, Hi1, DAG, DL));
57138 }
57139 }
57140
57141 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
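  // ADC(Y,0,W) computes Y + carry, so adding X on top yields X + Y + carry,
  // which is exactly ADC(X,Y,W).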
57142 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
57143 X86::isZeroNode(Op0.getOperand(1))) {
57144 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
57145 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
57146 Op0.getOperand(0), Op0.getOperand(2));
57147 }
57148
57149 return combineAddOrSubToADCOrSBB(N, DL, DAG);
57150}
57151
57152// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
57153// condition comes from the subtract node that produced -X. This matches the
57154// cmov expansion for absolute value. By swapping the operands we convert abs
57155// to nabs.
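// For example (illustrative): Y - (X < 0 ? 0-X : X) == Y + (X < 0 ? X : 0-X),
// i.e. subtracting abs(X) is the same as adding nabs(X), which only requires
// swapping the cmov operands.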
57156static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
57157 SelectionDAG &DAG) {
57158 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
57159 return SDValue();
57160
57161 SDValue Cond = N1.getOperand(3);
57162 if (Cond.getOpcode() != X86ISD::SUB)
57163 return SDValue();
57164 assert(Cond.getResNo() == 1 && "Unexpected result number");
57165
57166 SDValue FalseOp = N1.getOperand(0);
57167 SDValue TrueOp = N1.getOperand(1);
57169
57170 // ABS condition should come from a negate operation.
57171 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
57172 isNullConstant(Cond.getOperand(0))) {
57173 // Get the X and -X from the negate.
57174 SDValue NegX = Cond.getValue(0);
57175 SDValue X = Cond.getOperand(1);
57176
57177 // Cmov operands should be X and NegX. Order doesn't matter.
57178 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
57179 return SDValue();
57180
57181 // Build a new CMOV with the operands swapped.
57182 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
57183 N1.getOperand(2), Cond);
57184 // Convert sub to add.
57185 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
57186 }
57187
57188 // Handle ABD special case:
57189 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
57190 // ABD condition should come from a pair of matching subtracts.
57191 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
57192 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
57193 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
57194 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
57195 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
57196 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
57197 // Build a new CMOV with the operands swapped.
57198 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
57199 Cond);
57200 }
57201
57202 return SDValue();
57203}
57204
57206 SDValue Op0 = N->getOperand(0);
57207 SDValue Op1 = N->getOperand(1);
57208
57209 // (sub C (zero_extend (setcc)))
57210 // =>
57211 // (add (zero_extend (setcc inverted)) (C-1)) if C is a nonzero immediate
57212 // Don't disturb (sub 0 setcc), which is easily done with neg.
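  // For example (illustrative): (sub 5, (zext (setcc e))) yields 4 or 5, which
  // matches (add (zext (setcc !e)), 4).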
57213 EVT VT = N->getValueType(0);
57214 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
57215 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
57216 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
57217 Op1.getOperand(0).hasOneUse()) {
57218 SDValue SetCC = Op1.getOperand(0);
57221 APInt NewImm = Op0C->getAPIntValue() - 1;
57222 SDLoc DL(Op1);
57223 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
57224 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
57225 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
57226 DAG.getConstant(NewImm, DL, VT));
57227 }
57228
57229 return SDValue();
57230}
57231
57233 // res, flags2 = sub 0, (setcc cc, flag)
57234 // cload/cstore ..., cond_ne, flag2
57235 // ->
57236 // cload/cstore cc, flag
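  // The sub-from-zero exists only to rematerialize EFLAGS, so testing its
  // result for "not equal to zero" is the same as testing the original
  // condition code directly.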
57237 if (N->getConstantOperandVal(3) != X86::COND_NE)
57238 return SDValue();
57239
57240 SDValue Sub = N->getOperand(4);
57241 if (Sub.getOpcode() != X86ISD::SUB)
57242 return SDValue();
57243
57244 SDValue SetCC = Sub.getOperand(1);
57245
57246 if (!X86::isZeroNode(Sub.getOperand(0)) || SetCC.getOpcode() != X86ISD::SETCC)
57247 return SDValue();
57248
57249 SmallVector<SDValue, 5> Ops(N->op_values());
57250 Ops[3] = SetCC.getOperand(0);
57251 Ops[4] = SetCC.getOperand(1);
57252
57253 return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
57254 cast<MemSDNode>(N)->getMemoryVT(),
57255 cast<MemSDNode>(N)->getMemOperand());
57256}
57257
57260 const X86Subtarget &Subtarget) {
57261 EVT VT = N->getValueType(0);
57262 SDValue Op0 = N->getOperand(0);
57263 SDValue Op1 = N->getOperand(1);
57264 SDLoc DL(N);
57265
57266 auto IsNonOpaqueConstant = [&](SDValue Op) {
57268 /*AllowOpaques*/ false);
57269 };
57270
57271 // X86 can't encode an immediate LHS of a sub. See if we can push the
57272 // negation into a preceding instruction. If the RHS of the sub is an XOR with
57273 // one use and a constant, invert the immediate, saving one register.
57274 // However, ignore cases where C1 is 0, as those will become a NEG.
57275 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
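  // This follows from -(X ^ C2) == (X ^ ~C2) + 1, hence
  // C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1).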
57276 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
57277 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
57278 Op1->hasOneUse()) {
57279 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
57280 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
57281 SDValue NewAdd =
57282 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
57283 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
57284 }
57285
57286 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
57287 return V;
57288
57289 // Try to synthesize horizontal subs from subs of shuffles.
57290 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
57291 return V;
57292
57293 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
57294 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
57295 X86::isZeroNode(Op1.getOperand(1))) {
57296 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
57297 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
57298 Op1.getOperand(0), Op1.getOperand(2));
57299 }
57300
57301 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
57302 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
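  // SBB(Y,Z,W) computes Y - Z - carry, so X - (Y - Z - carry) equals
  // (X + Z + carry) - Y, i.e. SUB(ADC(X,Z,W), Y).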
57303 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
57304 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
57305 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
57306 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
57307 Op1.getOperand(1), Op1.getOperand(2));
57308 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
57309 }
57310
57311 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
57312 return V;
57313
57314 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
57315 return V;
57316
57317 return combineSubSetcc(N, DAG);
57318}
57319
57321 const X86Subtarget &Subtarget) {
57322 unsigned Opcode = N->getOpcode();
57323 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
57324 "Unknown PCMP opcode");
57325
57326 SDValue LHS = N->getOperand(0);
57327 SDValue RHS = N->getOperand(1);
57328 MVT VT = N->getSimpleValueType(0);
57329 unsigned EltBits = VT.getScalarSizeInBits();
57330 unsigned NumElts = VT.getVectorNumElements();
57331 SDLoc DL(N);
57332
57333 if (LHS == RHS)
57334 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
57335 : DAG.getConstant(0, DL, VT);
57336
57337 // Constant Folding.
57338 // PCMPEQ(X,UNDEF) -> UNDEF
57339 // PCMPGT(X,UNDEF) -> 0
57340 // PCMPGT(UNDEF,X) -> 0
57341 APInt LHSUndefs, RHSUndefs;
57342 SmallVector<APInt> LHSBits, RHSBits;
57343 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
57344 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
57345 APInt Ones = APInt::getAllOnes(EltBits);
57346 APInt Zero = APInt::getZero(EltBits);
57347 SmallVector<APInt> Results(NumElts);
57348 for (unsigned I = 0; I != NumElts; ++I) {
57349 if (Opcode == X86ISD::PCMPEQ) {
57350 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
57351 } else {
57352 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
57353 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
57354 }
57355 }
57356 if (Opcode == X86ISD::PCMPEQ)
57357 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
57358 return getConstVector(Results, VT, DAG, DL);
57359 }
57360
57361 return SDValue();
57362}
57363
57364// Helper to determine if we can convert an integer comparison to a float
57365 // comparison by casting the operands.
57366static std::optional<unsigned>
57367CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
57368 unsigned NumSignificantBitsRHS) {
57369 MVT SVT = VT.getScalarType();
57370 assert(SVT == MVT::f32 && "Only tested for float so far");
57371 const fltSemantics &Sem = SVT.getFltSemantics();
57372 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
57373 "Only PCMPEQ/PCMPGT currently supported");
57374
57375 // TODO: Handle bitcastable integers.
57376
57377 // For cvt + signed compare we need lhs and rhs to be exactly representable as
57378 // a fp value.
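  // e.g. for f32 (24-bit significand) any integer with at most 24 significant
  // bits converts exactly, so equality/signed-greater-than compares are
  // preserved across the int->fp conversion.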
57379 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
57380 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
57381 return ISD::SINT_TO_FP;
57382
57383 return std::nullopt;
57384}
57385
57386/// Helper that combines an array of subvector ops as if they were the operands
57387/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
57388/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
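/// For example (illustrative): concatenating two v4f32 (unpckl a0,b0) and
/// (unpckl a1,b1) nodes can become a single v8f32
/// (unpckl (concat a0,a1), (concat b0,b1)), since the 256-bit form operates
/// per 128-bit lane.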
57392 const X86Subtarget &Subtarget) {
57393 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
57394 unsigned EltSizeInBits = VT.getScalarSizeInBits();
57395
57396 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
57397 return DAG.getUNDEF(VT);
57398
57399 if (llvm::all_of(Ops, [](SDValue Op) {
57400 return ISD::isBuildVectorAllZeros(Op.getNode());
57401 }))
57402 return getZeroVector(VT, Subtarget, DAG, DL);
57403
57404 SDValue Op0 = Ops[0];
57405 bool IsSplat = llvm::all_equal(Ops);
57406 unsigned NumOps = Ops.size();
57407 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57408 LLVMContext &Ctx = *DAG.getContext();
57409
57410 // Repeated subvectors.
57411 if (IsSplat &&
57412 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
57413 // If this broadcast is inserted into both halves, use a larger broadcast.
57414 if (Op0.getOpcode() == X86ISD::VBROADCAST)
57415 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
57416
57417 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
57418 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
57419 (Subtarget.hasAVX2() ||
57421 VT.getScalarType(), Subtarget)))
57422 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
57423 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
57424 Op0.getOperand(0),
57425 DAG.getVectorIdxConstant(0, DL)));
57426
57427 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
57428 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
57429 (Subtarget.hasAVX2() ||
57430 (EltSizeInBits >= 32 &&
57431 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
57432 Op0.getOperand(0).getValueType() == VT.getScalarType())
57433 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
57434
57435 // concat_vectors(extract_subvector(broadcast(x)),
57436 // extract_subvector(broadcast(x))) -> broadcast(x)
57437 // concat_vectors(extract_subvector(subv_broadcast(x)),
57438 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
57439 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
57440 Op0.getOperand(0).getValueType() == VT) {
57441 SDValue SrcVec = Op0.getOperand(0);
57442 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
57444 return Op0.getOperand(0);
57445 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
57446 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
57447 return Op0.getOperand(0);
57448 }
57449
57450 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
57451 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
57452 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
57453 return DAG.getNode(Op0.getOpcode(), DL, VT,
57455 Op0.getOperand(0), Op0.getOperand(0)),
57456 Op0.getOperand(1));
57457 }
57458
57459 // TODO: This should go in combineX86ShufflesRecursively eventually.
57460 if (NumOps == 2) {
57461 SDValue Src0 = peekThroughBitcasts(Ops[0]);
57462 SDValue Src1 = peekThroughBitcasts(Ops[1]);
57463 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
57465 EVT SrcVT0 = Src0.getOperand(0).getValueType();
57466 EVT SrcVT1 = Src1.getOperand(0).getValueType();
57467 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
57468 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
57469 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
57470 // Only handle concat of subvector high halves, which vperm2x128 is best at.
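        // Immediate 0x31 selects the high 128-bit half of each source, giving
        // { hi(Src0), hi(Src1) } for the concatenated result.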
57471 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
57472 SrcVT1.is256BitVector() &&
57473 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
57474 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
57475 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
57476 DAG.getBitcast(VT, Src0.getOperand(0)),
57477 DAG.getBitcast(VT, Src1.getOperand(0)),
57478 DAG.getTargetConstant(0x31, DL, MVT::i8));
57479 }
57480 // concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> x.
57481 if (Src0.getOperand(0) == Src1.getOperand(0) &&
57482 Src0.getConstantOperandAPInt(1) == 0 &&
57483 Src1.getConstantOperandAPInt(1) ==
57485 return DAG.getBitcast(VT, extractSubVector(Src0.getOperand(0), 0, DAG,
57486 DL, VT.getSizeInBits()));
57487 }
57488 }
57489 }
57490
57491 // Repeated opcode.
57492 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
57493 // but it currently struggles with different vector widths.
57494 if (llvm::all_of(Ops, [Op0](SDValue Op) {
57495 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
57496 })) {
57497 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
57499 for (SDValue SubOp : SubOps)
57500 Subs.push_back(SubOp.getOperand(I));
57501 // Attempt to peek through bitcasts and concat the original subvectors.
57502 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
57503 if (SubVT.isSimple() && SubVT.isVector()) {
57504 EVT ConcatVT =
57506 SubVT.getVectorElementCount() * Subs.size());
57507 for (SDValue &Sub : Subs)
57508 Sub = DAG.getBitcast(SubVT, Sub);
57509 return DAG.getBitcast(
57510 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
57511 }
57512 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
57513 };
57514 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
57515 bool AllConstants = true;
57516 bool AllSubs = true;
57517 unsigned VecSize = VT.getSizeInBits();
57518 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
57519 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
57520 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
57521 }))
57522 return true;
57523 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
57524 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
57525 unsigned SubSize = BC.getValueSizeInBits();
57526 unsigned EltSize = BC.getScalarValueSizeInBits();
57527 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
57529 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
57530 BC.getOperand(0).getValueSizeInBits() == VecSize &&
57531 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
57532 }
57533 return AllConstants || AllSubs;
57534 };
57535
57536 switch (Op0.getOpcode()) {
57537 case ISD::VECTOR_SHUFFLE: {
57538 if (NumOps == 2 && VT.is256BitVector() &&
57539 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
57540 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
57541 int NumSubElts = Op0.getValueType().getVectorNumElements();
57542 SmallVector<int> NewMask;
57543 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
57544 M = M >= NumSubElts ? M + NumSubElts : M;
57545 NewMask.push_back(M);
57546 }
57547 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
57548 if (0 <= M)
57549 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
57550 NewMask.push_back(M);
57551 }
57552 return DAG.getVectorShuffle(VT, DL, ConcatSubOperand(VT, Ops, 0),
57553 ConcatSubOperand(VT, Ops, 1), NewMask);
57554 }
57555 break;
57556 }
57557 case X86ISD::VBROADCAST: {
57558 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
57559 return Op.getOperand(0).getValueType().is128BitVector();
57560 })) {
57561 if (VT == MVT::v4f64 || VT == MVT::v4i64)
57562 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
57563 ConcatSubOperand(VT, Ops, 0),
57564 ConcatSubOperand(VT, Ops, 0));
57565 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
57566 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
57567 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
57569 DL, VT, ConcatSubOperand(VT, Ops, 0),
57570 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
57571 }
57572 break;
57573 }
57574 case X86ISD::MOVDDUP:
57575 case X86ISD::MOVSHDUP:
57576 case X86ISD::MOVSLDUP: {
57577 if (!IsSplat)
57578 return DAG.getNode(Op0.getOpcode(), DL, VT,
57579 ConcatSubOperand(VT, Ops, 0));
57580 break;
57581 }
57582 case X86ISD::SHUFP: {
57583 // Add SHUFPD support if/when necessary.
57584 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
57585 llvm::all_of(Ops, [Op0](SDValue Op) {
57586 return Op.getOperand(2) == Op0.getOperand(2);
57587 })) {
57588 return DAG.getNode(Op0.getOpcode(), DL, VT,
57589 ConcatSubOperand(VT, Ops, 0),
57590 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
57591 }
57592 break;
57593 }
57594 case X86ISD::UNPCKH:
57595 case X86ISD::UNPCKL: {
57596 // Don't concatenate build_vector patterns.
57597 if (!IsSplat && EltSizeInBits >= 32 &&
57598 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57599 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
57600 none_of(Ops, [](SDValue Op) {
57601 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
57603 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
57605 })) {
57606 return DAG.getNode(Op0.getOpcode(), DL, VT,
57607 ConcatSubOperand(VT, Ops, 0),
57608 ConcatSubOperand(VT, Ops, 1));
57609 }
57610 break;
57611 }
57612 case X86ISD::PSHUFHW:
57613 case X86ISD::PSHUFLW:
57614 case X86ISD::PSHUFD:
57615 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
57616 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
57617 return DAG.getNode(Op0.getOpcode(), DL, VT,
57618 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
57619 }
57620 [[fallthrough]];
57621 case X86ISD::VPERMILPI:
57622 if (!IsSplat && EltSizeInBits == 32 &&
57623 (VT.is256BitVector() ||
57624 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
57625 all_of(Ops, [&Op0](SDValue Op) {
57626 return Op0.getOperand(1) == Op.getOperand(1);
57627 })) {
57628 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
57629 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
57630 Res =
57631 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
57632 return DAG.getBitcast(VT, Res);
57633 }
57634 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
57635 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
57636 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
57637 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
57638 return DAG.getNode(Op0.getOpcode(), DL, VT,
57639 ConcatSubOperand(VT, Ops, 0),
57640 DAG.getTargetConstant(Idx, DL, MVT::i8));
57641 }
57642 break;
57643 case X86ISD::PSHUFB:
57644 case X86ISD::PSADBW:
57645 case X86ISD::VPMADDUBSW:
57646 case X86ISD::VPMADDWD:
57647 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57648 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
57649 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
57650 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
57651 NumOps * SrcVT.getVectorNumElements());
57652 return DAG.getNode(Op0.getOpcode(), DL, VT,
57653 ConcatSubOperand(SrcVT, Ops, 0),
57654 ConcatSubOperand(SrcVT, Ops, 1));
57655 }
57656 break;
57657 case X86ISD::VPERMV:
57658 if (!IsSplat && NumOps == 2 &&
57659 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
57660 MVT OpVT = Op0.getSimpleValueType();
57661 int NumSrcElts = OpVT.getVectorNumElements();
57662 SmallVector<int, 64> ConcatMask;
57663 for (unsigned i = 0; i != NumOps; ++i) {
57664 SmallVector<int, 64> SubMask;
57666 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
57667 break;
57668 for (int M : SubMask) {
57669 if (0 <= M)
57670 M += i * NumSrcElts;
57671 ConcatMask.push_back(M);
57672 }
57673 }
57674 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
57675 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
57676 Ops[1].getOperand(1), DAG, DL);
57677 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
57678 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
57679 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
57680 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
57681 }
57682 }
57683 break;
57684 case X86ISD::VPERMV3:
57685 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
57686 MVT OpVT = Op0.getSimpleValueType();
57687 int NumSrcElts = OpVT.getVectorNumElements();
57688 SmallVector<int, 64> ConcatMask;
57689 for (unsigned i = 0; i != NumOps; ++i) {
57690 SmallVector<int, 64> SubMask;
57692 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
57693 break;
57694 for (int M : SubMask) {
57695 if (0 <= M) {
57696 int Src = M < NumSrcElts ? 0 : 2;
57697 M += M < NumSrcElts ? 0 : NumSrcElts;
57698
57699 // Reference the lowest sub if the upper sub is the same.
57700 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
57701 M += i * NumSrcElts;
57702 }
57703 ConcatMask.push_back(M);
57704 }
57705 }
57706 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
57707 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
57708 Ops[1].getOperand(0), DAG, DL);
57709 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
57710 Ops[1].getOperand(2), DAG, DL);
57711 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
57712 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
57713 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
57714 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
57715 }
57716 }
57717 break;
57718 case X86ISD::VPERM2X128: {
57719 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
57720 assert(NumOps == 2 && "Bad concat_vectors operands");
57721 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
57722 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
57723 // TODO: Handle zero'd subvectors.
57724 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
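          // Each 2-bit mask entry below picks one of the four 128-bit lanes of
          // the concatenated 512-bit sources, mirroring how the VPERM2X128
          // immediates addressed the lanes of their 256-bit inputs.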
57725 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
57726 (int)((Imm1 >> 4) & 0x3)};
57727 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
57728 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
57729 Ops[0].getOperand(1), DAG, DL);
57730 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
57731 Ops[1].getOperand(1), DAG, DL);
57732 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
57733 DAG.getBitcast(ShuffleVT, LHS),
57734 DAG.getBitcast(ShuffleVT, RHS),
57735 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
57736 return DAG.getBitcast(VT, Res);
57737 }
57738 }
57739 break;
57740 }
57741 case X86ISD::SHUF128: {
57742 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
57743 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
57744 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
57745 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
57746 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
57747 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
57748 Ops[0].getOperand(1), DAG, DL);
57749 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
57750 Ops[1].getOperand(1), DAG, DL);
57751 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
57752 DAG.getTargetConstant(Imm, DL, MVT::i8));
57753 }
57754 break;
57755 }
57756 case ISD::TRUNCATE:
57757 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
57758 EVT SrcVT = Ops[0].getOperand(0).getValueType();
57759 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
57760 SrcVT == Ops[1].getOperand(0).getValueType() &&
57761 Subtarget.useAVX512Regs() &&
57762 Subtarget.getPreferVectorWidth() >= 512 &&
57763 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
57764 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
57765 return DAG.getNode(ISD::TRUNCATE, DL, VT,
57766 ConcatSubOperand(NewSrcVT, Ops, 0));
57767 }
57768 }
57769 break;
57770 case ISD::ANY_EXTEND:
57771 case ISD::SIGN_EXTEND:
57772 case ISD::ZERO_EXTEND:
57773 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
57774 if (!IsSplat && NumOps == 2 &&
57775 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57776 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57777 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
57778 EVT SrcVT = Ops[0].getOperand(0).getValueType();
57779 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
57780 SrcVT == Ops[1].getOperand(0).getValueType()) {
57781 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
57782 return DAG.getNode(Op0.getOpcode(), DL, VT,
57783 ConcatSubOperand(NewSrcVT, Ops, 0));
57784 }
57785 }
57786 break;
57787 case X86ISD::VSHLI:
57788 case X86ISD::VSRLI:
57789 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
57790 // TODO: Move this to LowerShiftByScalarImmediate?
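      // Shifting a 64-bit element left by 32 just moves its low 32-bit half
      // into the high half and zeroes the low half, so the whole op is an
      // interleave with a zero vector; the SRL case is the mirror image.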
57791 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
57792 llvm::all_of(Ops, [](SDValue Op) {
57793 return Op.getConstantOperandAPInt(1) == 32;
57794 })) {
57795 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
57796 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
57797 if (Op0.getOpcode() == X86ISD::VSHLI) {
57798 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
57799 {8, 0, 8, 2, 8, 4, 8, 6});
57800 } else {
57801 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
57802 {1, 8, 3, 8, 5, 8, 7, 8});
57803 }
57804 return DAG.getBitcast(VT, Res);
57805 }
57806 [[fallthrough]];
57807 case X86ISD::VSRAI:
57808 case X86ISD::VSHL:
57809 case X86ISD::VSRL:
57810 case X86ISD::VSRA:
57811 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
57812 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57813 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
57814 llvm::all_of(Ops, [Op0](SDValue Op) {
57815 return Op0.getOperand(1) == Op.getOperand(1);
57816 })) {
57817 return DAG.getNode(Op0.getOpcode(), DL, VT,
57818 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
57819 }
57820 break;
57821 case X86ISD::VPERMI:
57822 case X86ISD::VROTLI:
57823 case X86ISD::VROTRI:
57824 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57825 llvm::all_of(Ops, [Op0](SDValue Op) {
57826 return Op0.getOperand(1) == Op.getOperand(1);
57827 })) {
57828 return DAG.getNode(Op0.getOpcode(), DL, VT,
57829 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
57830 }
57831 break;
57832 case ISD::AND:
57833 case ISD::OR:
57834 case ISD::XOR:
57835 case X86ISD::ANDNP:
57836 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57837 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
57838 return DAG.getNode(Op0.getOpcode(), DL, VT,
57839 ConcatSubOperand(VT, Ops, 0),
57840 ConcatSubOperand(VT, Ops, 1));
57841 }
57842 break;
57843 case X86ISD::PCMPEQ:
57844 case X86ISD::PCMPGT:
57845 if (!IsSplat && VT.is256BitVector() &&
57846 (Subtarget.hasInt256() || VT == MVT::v8i32) &&
57847 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
57848 if (Subtarget.hasInt256())
57849 return DAG.getNode(Op0.getOpcode(), DL, VT,
57850 ConcatSubOperand(VT, Ops, 0),
57851 ConcatSubOperand(VT, Ops, 1));
57852
57853 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
57854 // TODO: Handle v4f64 as well?
57855 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
57856 for (unsigned I = 0; I != NumOps; ++I) {
57857 MaxSigBitsLHS =
57858 std::max(MaxSigBitsLHS,
57859 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
57860 MaxSigBitsRHS =
57861 std::max(MaxSigBitsRHS,
57862 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
57863 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
57864 break;
57865 }
57866
57867 ISD::CondCode ICC =
57869 ISD::CondCode FCC =
57871
57872 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
57873 MVT FpVT = VT.changeVectorElementType(FpSVT);
57874
57875 if (std::optional<unsigned> CastOpc =
57876 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
57877 SDValue LHS = ConcatSubOperand(VT, Ops, 0);
57878 SDValue RHS = ConcatSubOperand(VT, Ops, 1);
57879 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
57880 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
57881
57882 bool IsAlwaysSignaling;
57883 unsigned FSETCC =
57884 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
57885 return DAG.getBitcast(
57886 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
57887 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
57888 }
57889 }
57890 break;
57891 case ISD::CTPOP:
57892 case ISD::CTTZ:
57893 case ISD::CTLZ:
57896 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57897 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
57898 return DAG.getNode(Op0.getOpcode(), DL, VT,
57899 ConcatSubOperand(VT, Ops, 0));
57900 }
57901 break;
57903 if (!IsSplat &&
57904 (VT.is256BitVector() ||
57905 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
57906 llvm::all_of(Ops, [Op0](SDValue Op) {
57907 return Op0.getOperand(2) == Op.getOperand(2);
57908 })) {
57909 return DAG.getNode(Op0.getOpcode(), DL, VT,
57910 ConcatSubOperand(VT, Ops, 0),
57911 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
57912 }
57913 break;
57914 case ISD::ADD:
57915 case ISD::SUB:
57916 case ISD::MUL:
57917 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57918 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57919 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
57920 return DAG.getNode(Op0.getOpcode(), DL, VT,
57921 ConcatSubOperand(VT, Ops, 0),
57922 ConcatSubOperand(VT, Ops, 1));
57923 }
57924 break;
57925 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
57926 // their latencies are short, we don't replace them here unless we won't
57927 // introduce an extra VINSERT.
57928 case ISD::FADD:
57929 case ISD::FSUB:
57930 case ISD::FMUL:
57931 if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
57932 (VT.is256BitVector() ||
57933 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
57934 return DAG.getNode(Op0.getOpcode(), DL, VT,
57935 ConcatSubOperand(VT, Ops, 0),
57936 ConcatSubOperand(VT, Ops, 1));
57937 }
57938 break;
57939 case ISD::FDIV:
57940 if (!IsSplat && (VT.is256BitVector() ||
57941 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
57942 return DAG.getNode(Op0.getOpcode(), DL, VT,
57943 ConcatSubOperand(VT, Ops, 0),
57944 ConcatSubOperand(VT, Ops, 1));
57945 }
57946 break;
57947 case X86ISD::HADD:
57948 case X86ISD::HSUB:
57949 case X86ISD::FHADD:
57950 case X86ISD::FHSUB:
57951 if (!IsSplat && VT.is256BitVector() &&
57952 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
57953 return DAG.getNode(Op0.getOpcode(), DL, VT,
57954 ConcatSubOperand(VT, Ops, 0),
57955 ConcatSubOperand(VT, Ops, 1));
57956 }
57957 break;
57958 case X86ISD::PACKSS:
57959 case X86ISD::PACKUS:
57960 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57961 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
57962 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
57963 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
57964 NumOps * SrcVT.getVectorNumElements());
57965 return DAG.getNode(Op0.getOpcode(), DL, VT,
57966 ConcatSubOperand(SrcVT, Ops, 0),
57967 ConcatSubOperand(SrcVT, Ops, 1));
57968 }
57969 break;
57970 case X86ISD::PALIGNR:
57971 if (!IsSplat &&
57972 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57973 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
57974 llvm::all_of(Ops, [Op0](SDValue Op) {
57975 return Op0.getOperand(2) == Op.getOperand(2);
57976 })) {
57977 return DAG.getNode(Op0.getOpcode(), DL, VT,
57978 ConcatSubOperand(VT, Ops, 0),
57979 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
57980 }
57981 break;
57982 case X86ISD::BLENDI:
57983 if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
57984 uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
57985 uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
57986 // MVT::v16i16 has repeated blend mask.
57987 if (Op0.getSimpleValueType() == MVT::v16i16) {
57988 Mask0 = (Mask0 << 8) | Mask0;
57989 Mask1 = (Mask1 << 8) | Mask1;
57990 }
57991 uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
57993 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
57994 SDValue Sel =
57995 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
57996 return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
57997 ConcatSubOperand(VT, Ops, 0));
57998 }
57999 break;
58000 case ISD::VSELECT:
58001 if (!IsSplat && Subtarget.hasAVX512() &&
58002 (VT.is256BitVector() ||
58003 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58004 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
58005 EVT SelVT = Ops[0].getOperand(0).getValueType();
58006 if (SelVT.getVectorElementType() == MVT::i1) {
58007 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
58008 NumOps * SelVT.getVectorNumElements());
58009 if (TLI.isTypeLegal(SelVT))
58010 return DAG.getNode(Op0.getOpcode(), DL, VT,
58011 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
58012 ConcatSubOperand(VT, Ops, 1),
58013 ConcatSubOperand(VT, Ops, 2));
58014 }
58015 }
58016 [[fallthrough]];
58017 case X86ISD::BLENDV:
58018 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
58019 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
58020 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
58021 EVT SelVT = Ops[0].getOperand(0).getValueType();
58022 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
58023 if (TLI.isTypeLegal(SelVT))
58024 return DAG.getNode(Op0.getOpcode(), DL, VT,
58025 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
58026 ConcatSubOperand(VT, Ops, 1),
58027 ConcatSubOperand(VT, Ops, 2));
58028 }
58029 break;
58030 }
58031 }
58032
58033 // Fold subvector loads into one.
58034 // If needed, look through bitcasts to get to the load.
58035 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
58036 unsigned Fast;
58037 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
58038 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
58039 *FirstLd->getMemOperand(), &Fast) &&
58040 Fast) {
58041 if (SDValue Ld =
58042 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
58043 return Ld;
58044 }
58045 }
58046
58047 // Attempt to fold target constant loads.
58048 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
58049 SmallVector<APInt> EltBits;
58050 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
58051 for (unsigned I = 0; I != NumOps; ++I) {
58052 APInt OpUndefElts;
58053 SmallVector<APInt> OpEltBits;
58054 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
58055 OpEltBits, /*AllowWholeUndefs*/ true,
58056 /*AllowPartialUndefs*/ false))
58057 break;
58058 EltBits.append(OpEltBits);
58059 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
58060 }
58061 if (EltBits.size() == VT.getVectorNumElements()) {
58062 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
58063 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
58064 SDValue CV = DAG.getConstantPool(C, PVT);
58067 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
58068 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
58069 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
58070 return Ld;
58071 }
58072 }
58073
58074 // If this simple subvector or scalar/subvector broadcast_load is inserted
58075 // into both halves, use a larger broadcast_load. Update other uses to use
58076 // an extracted subvector.
58077 if (IsSplat &&
58078 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58079 if (ISD::isNormalLoad(Op0.getNode()) ||
58082 auto *Mem = cast<MemSDNode>(Op0);
58083 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
58086 if (SDValue BcastLd =
58087 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
58088 SDValue BcastSrc =
58089 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
58090 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
58091 return BcastLd;
58092 }
58093 }
58094 }
58095
58096 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
58097 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
58098 Subtarget.useAVX512Regs()) {
58099 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58100 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
58101 Res = DAG.getBitcast(ShuffleVT, Res);
58102 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
58103 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58104 return DAG.getBitcast(VT, Res);
58105 }
58106
58107 return SDValue();
58108}
58109
58112 const X86Subtarget &Subtarget) {
58113 EVT VT = N->getValueType(0);
58114 EVT SrcVT = N->getOperand(0).getValueType();
58115 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58116 SmallVector<SDValue, 4> Ops(N->ops());
58117
58118 if (VT.getVectorElementType() == MVT::i1) {
58119 // Attempt to constant fold.
58120 unsigned SubSizeInBits = SrcVT.getSizeInBits();
58122 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
58123 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
58124 if (!C) break;
58125 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
58126 if (I == (E - 1)) {
58127 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
58128 if (TLI.isTypeLegal(IntVT))
58129 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
58130 }
58131 }
58132
58133 // Don't do anything else for i1 vectors.
58134 return SDValue();
58135 }
58136
58137 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
58138 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
58139 DCI, Subtarget))
58140 return R;
58141 }
58142
58143 return SDValue();
58144}
58145
58148 const X86Subtarget &Subtarget) {
58149 if (DCI.isBeforeLegalizeOps())
58150 return SDValue();
58151
58152 MVT OpVT = N->getSimpleValueType(0);
58153
58154 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
58155
58156 SDLoc dl(N);
58157 SDValue Vec = N->getOperand(0);
58158 SDValue SubVec = N->getOperand(1);
58159
58160 uint64_t IdxVal = N->getConstantOperandVal(2);
58161 MVT SubVecVT = SubVec.getSimpleValueType();
58162
58163 if (Vec.isUndef() && SubVec.isUndef())
58164 return DAG.getUNDEF(OpVT);
58165
58166 // Inserting undefs/zeros into zeros/undefs is a zero vector.
58167 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
58168 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
58169 return getZeroVector(OpVT, Subtarget, DAG, dl);
58170
58172 // If we're inserting into a zero vector and then into a larger zero vector,
58173 // just insert into the larger zero vector directly.
58174 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
58176 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
58177 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
58178 getZeroVector(OpVT, Subtarget, DAG, dl),
58179 SubVec.getOperand(1),
58180 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
58181 }
58182
58183 // If we're inserting into a zero vector and our input was extracted from an
58184 // insert into a zero vector of the same type and the extraction was at
58185 // least as large as the original insertion, just insert the original
58186 // subvector into a zero vector.
58187 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
58188 isNullConstant(SubVec.getOperand(1)) &&
58190 SDValue Ins = SubVec.getOperand(0);
58191 if (isNullConstant(Ins.getOperand(2)) &&
58192 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
58193 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
58194 SubVecVT.getFixedSizeInBits())
58195 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
58196 getZeroVector(OpVT, Subtarget, DAG, dl),
58197 Ins.getOperand(1), N->getOperand(2));
58198 }
58199 }
58200
58201 // Stop here if this is an i1 vector.
58202 if (IsI1Vector)
58203 return SDValue();
58204
58205 // Eliminate an intermediate vector widening:
58206 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
58207 // insert_subvector X, Y, Idx
58208 // TODO: This is a more general version of a DAGCombiner fold, can we move it
58209 // there?
58210 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
58211 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
58212 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
58213 SubVec.getOperand(1), N->getOperand(2));
58214
58215 // If this is an insert of an extract, combine to a shuffle. Don't do this
58216 // if the insert or extract can be represented with a subregister operation.
58217 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58218 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
58219 (IdxVal != 0 ||
58220 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
58221 int ExtIdxVal = SubVec.getConstantOperandVal(1);
58222 if (ExtIdxVal != 0) {
58223 int VecNumElts = OpVT.getVectorNumElements();
58224 int SubVecNumElts = SubVecVT.getVectorNumElements();
58225 SmallVector<int, 64> Mask(VecNumElts);
58226 // First create an identity shuffle mask.
58227 for (int i = 0; i != VecNumElts; ++i)
58228 Mask[i] = i;
58229 // Now insert the extracted portion.
58230 for (int i = 0; i != SubVecNumElts; ++i)
58231 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
58232
58233 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
58234 }
58235 }
58236
58237 // Match concat_vector style patterns.
58238 SmallVector<SDValue, 2> SubVectorOps;
58239 if (collectConcatOps(N, SubVectorOps, DAG)) {
58240 if (SDValue Fold =
58241 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
58242 return Fold;
58243
58244 // If we're inserting all zeros into the upper half, change this to
58245 // a concat with zero. We will match this to a move
58246 // with implicit upper bit zeroing during isel.
58247 // We do this here because we don't want combineConcatVectorOps to
58248 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
58249 if (SubVectorOps.size() == 2 &&
58250 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
58251 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
58252 getZeroVector(OpVT, Subtarget, DAG, dl),
58253 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
58254
58255 // Attempt to recursively combine to a shuffle.
58256 if (all_of(SubVectorOps, [](SDValue SubOp) {
58257 return isTargetShuffle(SubOp.getOpcode());
58258 })) {
58259 SDValue Op(N, 0);
58260 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
58261 return Res;
58262 }
58263 }
58264
58265 // If this is a broadcast insert into an upper undef, use a larger broadcast.
58266 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
58267 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
58268
58269 // If this is a broadcast load inserted into an upper undef, use a larger
58270 // broadcast load.
58271 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
58272 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
58273 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
58274 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
58275 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
58276 SDValue BcastLd =
58278 MemIntr->getMemoryVT(),
58279 MemIntr->getMemOperand());
58280 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
58281 return BcastLd;
58282 }
58283
58284 // If we're splatting the lower half subvector of a full vector load into the
58285 // upper half, attempt to create a subvector broadcast.
58286 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
58287 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
58288 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
58289 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
58290 if (VecLd && SubLd &&
58291 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
58292 SubVec.getValueSizeInBits() / 8, 0))
58293 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
58294 SubLd, 0, DAG);
58295 }
58296
58297 return SDValue();
58298}
58299
58300/// If we are extracting a subvector of a vector select and the select condition
58301/// is composed of concatenated vectors, try to narrow the select width. This
58302/// is a common pattern for AVX1 integer code because 256-bit selects may be
58303/// legal, but there is almost no integer math/logic available for 256-bit.
58304/// This function should only be called with legal types (otherwise, the calls
58305/// to get simple value types will assert).
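/// For example (illustrative), on AVX1:
///   extract_subvector (vselect (concat C0,C1), T, F), 4
/// can become vselect C1, (extract_subvector T, 4), (extract_subvector F, 4),
/// keeping the select and any surrounding logic at 128 bits.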
58307 SelectionDAG &DAG) {
58308 SDValue Sel = Ext->getOperand(0);
58309 if (Sel.getOpcode() != ISD::VSELECT ||
58310 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
58311 return SDValue();
58312
58313 // Note: We assume simple value types because this should only be called with
58314 // legal operations/types.
58315 // TODO: This can be extended to handle extraction to 256-bits.
58316 MVT VT = Ext->getSimpleValueType(0);
58317 if (!VT.is128BitVector())
58318 return SDValue();
58319
58320 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
58321 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
58322 return SDValue();
58323
58324 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
58325 MVT SelVT = Sel.getSimpleValueType();
58326 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
58327 "Unexpected vector type with legal operations");
58328
58329 unsigned SelElts = SelVT.getVectorNumElements();
58330 unsigned CastedElts = WideVT.getVectorNumElements();
58331 unsigned ExtIdx = Ext->getConstantOperandVal(1);
58332 if (SelElts % CastedElts == 0) {
58333 // The select has the same or more (narrower) elements than the extract
58334 // operand. The extraction index gets scaled by that factor.
58335 ExtIdx *= (SelElts / CastedElts);
58336 } else if (CastedElts % SelElts == 0) {
58337 // The select has fewer (wider) elements than the extract operand. Make sure
58338 // that the extraction index can be divided evenly.
58339 unsigned IndexDivisor = CastedElts / SelElts;
58340 if (ExtIdx % IndexDivisor != 0)
58341 return SDValue();
58342 ExtIdx /= IndexDivisor;
58343 } else {
58344 llvm_unreachable("Element count of simple vector types are not divisible?");
58345 }
58346
58347 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
58348 unsigned NarrowElts = SelElts / NarrowingFactor;
58349 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
58350 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
58351 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
58352 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
58353 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
58354 return DAG.getBitcast(VT, NarrowSel);
58355}
58356
58359 const X86Subtarget &Subtarget) {
58360 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
58361 // eventually get combined/lowered into ANDNP) with a concatenated operand,
58362 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
58363 // We let generic combining take over from there to simplify the
58364 // insert/extract and 'not'.
58365 // This pattern emerges during AVX1 legalization. We handle it before lowering
58366 // to avoid complications like splitting constant vector loads.
58367
58368 // Capture the original wide type in the likely case that we need to bitcast
58369 // back to this type.
58370 if (!N->getValueType(0).isSimple())
58371 return SDValue();
58372
58373 MVT VT = N->getSimpleValueType(0);
58374 SDValue InVec = N->getOperand(0);
58375 unsigned IdxVal = N->getConstantOperandVal(1);
58376 SDValue InVecBC = peekThroughBitcasts(InVec);
58377 EVT InVecVT = InVec.getValueType();
58378 unsigned SizeInBits = VT.getSizeInBits();
58379 unsigned InSizeInBits = InVecVT.getSizeInBits();
58380 unsigned NumSubElts = VT.getVectorNumElements();
58381 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58382 SDLoc DL(N);
58383
58384 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
58385 TLI.isTypeLegal(InVecVT) &&
58386 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
58387 auto isConcatenatedNot = [](SDValue V) {
58388 V = peekThroughBitcasts(V);
58389 if (!isBitwiseNot(V))
58390 return false;
58391 SDValue NotOp = V->getOperand(0);
58393 };
58394 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
58395 isConcatenatedNot(InVecBC.getOperand(1))) {
58396 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
58397 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC));
58398 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
58399 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
58400 }
58401 }
58402
58403 if (DCI.isBeforeLegalizeOps())
58404 return SDValue();
58405
58406 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
58407 return V;
58408
58409 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
58410 return getZeroVector(VT, Subtarget, DAG, DL);
58411
58412 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
58413 if (VT.getScalarType() == MVT::i1)
58414 return DAG.getConstant(1, DL, VT);
58415 return getOnesVector(VT, DAG, DL);
58416 }
58417
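// Extracting a subvector of a BUILD_VECTOR is just a slice of its scalar operands.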
58418 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
58419 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
58420
58421 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) --> EXTRACT_SUBVECTOR(V,C1+C2)
58422 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58423 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
58424 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
58425 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
58426 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
58427 }
58428
58429 // If we are extracting from an insert into a larger vector, replace with a
58430 // smaller insert if the extracted region covers the inserted subvector.
58431 // Don't do this for i1 vectors.
58432 // TODO: Relax the matching indices requirement?
58433 if (VT.getVectorElementType() != MVT::i1 &&
58434 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
58435 IdxVal == InVec.getConstantOperandVal(2) &&
58436 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
58437 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
58438 InVec.getOperand(0), N->getOperand(1));
58439 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
58440 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
58441 InVec.getOperand(1),
58442 DAG.getVectorIdxConstant(NewIdxVal, DL));
58443 }
58444
58445 // If we're extracting an upper subvector from a broadcast, we should just
58446 // extract the lowest subvector instead, which should allow
58447 // SimplifyDemandedVectorElts to do more simplifications.
58448 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
58449 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
58450 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
58451 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
58452
58453 // If we're extracting a broadcasted subvector, just use the lowest subvector.
58454 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58455 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
58456 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
58457
58458 // Attempt to extract from the source of a shuffle vector.
58459 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
58460 SmallVector<int, 32> ShuffleMask;
58461 SmallVector<int, 32> ScaledMask;
58462 SmallVector<SDValue, 2> ShuffleInputs;
58463 unsigned NumSubVecs = InSizeInBits / SizeInBits;
58464 // Decode the shuffle mask and scale it so that it shuffles whole subvectors.
58465 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
58466 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
58467 unsigned SubVecIdx = IdxVal / NumSubElts;
58468 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
58469 return DAG.getUNDEF(VT);
58470 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
58471 return getZeroVector(VT, Subtarget, DAG, DL);
58472 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
58473 if (Src.getValueSizeInBits() == InSizeInBits) {
58474 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
58475 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
58476 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
58477 DL, SizeInBits);
58478 }
58479 }
58480 }
58481
58482 auto IsExtractFree = [](SDValue V) {
58483 if (V.hasOneUse()) {
58484 V = peekThroughOneUseBitcasts(V);
58485 if (V.getOpcode() == ISD::LOAD)
58486 return true;
58487 }
58488 V = peekThroughBitcasts(V);
58489 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
58490 return true;
58491 if (ISD::isBuildVectorAllZeros(V.getNode()))
58492 return true;
58493 return V.isUndef();
58494 };
58495
58496 // If we're extracting the lowest subvector and we're the only user,
58497 // we may be able to perform this with a smaller vector width.
58498 unsigned InOpcode = InVec.getOpcode();
58499 if (InVec.hasOneUse()) {
58500 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
58501 // v2f64 CVTDQ2PD(v4i32).
58502 if (InOpcode == ISD::SINT_TO_FP &&
58503 InVec.getOperand(0).getValueType() == MVT::v4i32) {
58504 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
58505 }
58506 // v2f64 CVTUDQ2PD(v4i32).
58507 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
58508 InVec.getOperand(0).getValueType() == MVT::v4i32) {
58509 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
58510 }
58511 // v2f64 CVTPS2PD(v4f32).
58512 if (InOpcode == ISD::FP_EXTEND &&
58513 InVec.getOperand(0).getValueType() == MVT::v4f32) {
58514 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
58515 }
58516 }
58517 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
58518 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
58519 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
58520 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
58521 Subtarget.hasVLX())) &&
58522 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
58523 SDValue Src = InVec.getOperand(0);
58524 if (Src.getValueType().getScalarSizeInBits() == 32)
58525 return DAG.getNode(InOpcode, DL, VT,
58526 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
58527 }
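// A lowest-subvector extract of an extend only needs the low bits of the
// extend's source, so redo it as the matching *_EXTEND_VECTOR_INREG node at
// the narrower width.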
58528 if (IdxVal == 0 &&
58529 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
58530 (SizeInBits == 128 || SizeInBits == 256) &&
58531 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
58532 SDValue Ext = InVec.getOperand(0);
58533 if (Ext.getValueSizeInBits() > SizeInBits)
58534 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
58535 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
58536 return DAG.getNode(ExtOp, DL, VT, Ext);
58537 }
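// Narrow a 256-bit VSELECT when only the low half of the result is used.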
58538 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
58539 InVec.getOperand(0).getValueType().is256BitVector() &&
58540 InVec.getOperand(1).getValueType().is256BitVector() &&
58541 InVec.getOperand(2).getValueType().is256BitVector()) {
58542 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
58543 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
58544 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
58545 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
58546 }
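// With AVX512VL we can truncate directly at the narrow width: extract the
// matching low portion of the truncate's source and truncate that instead.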
58547 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
58548 (SizeInBits == 128 || SizeInBits == 256)) {
58549 SDValue InVecSrc = InVec.getOperand(0);
58550 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
58551 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
58552 return DAG.getNode(InOpcode, DL, VT, Ext);
58553 }
58554
58555 if (SizeInBits == 128 || SizeInBits == 256) {
58556 switch (InOpcode) {
58557 case X86ISD::MOVDDUP:
58558 return DAG.getNode(
58559 InOpcode, DL, VT,
58560 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
58561 case X86ISD::PSHUFD:
58562 case X86ISD::VPERMILPI:
58563 if (InVec.getOperand(0).hasOneUse()) {
58564 uint64_t M = InVec.getConstantOperandVal(1) & 255;
58565 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
58566 return DAG.getNode(InOpcode, DL, VT,
58567 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58568 DL, SizeInBits),
58569 DAG.getTargetConstant(M, DL, MVT::i8));
58570 }
58571 break;
58572 case X86ISD::PCMPEQ:
58573 case X86ISD::PCMPGT:
58574 case X86ISD::UNPCKH:
58575 case X86ISD::UNPCKL:
58576 if (IsExtractFree(InVec.getOperand(0)) ||
58577 IsExtractFree(InVec.getOperand(1)))
58578 return DAG.getNode(InOpcode, DL, VT,
58579 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58580 DL, SizeInBits),
58581 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
58582 DL, SizeInBits));
58583 break;
58584 case X86ISD::CMPP:
58585 if (IsExtractFree(InVec.getOperand(0)) ||
58586 IsExtractFree(InVec.getOperand(1)))
58587 return DAG.getNode(InOpcode, DL, VT,
58588 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58589 DL, SizeInBits),
58590 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
58591 DL, SizeInBits),
58592 InVec.getOperand(2));
58593 break;
58594 case X86ISD::BLENDI:
58595 if (IsExtractFree(InVec.getOperand(0)) ||
58596 IsExtractFree(InVec.getOperand(1))) {
58597 uint64_t M = InVec.getConstantOperandVal(2) & 255;
58598 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
58599 return DAG.getNode(InOpcode, DL, VT,
58600 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58601 DL, SizeInBits),
58602 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
58603 DL, SizeInBits),
58604 DAG.getTargetConstant(M, DL, MVT::i8));
58605 }
58606 break;
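// For an upper-subvector extract of VPERMV3, extract the demanded portion of
// the index vector instead, re-widen it, and take the low subvector of the
// resulting shuffle.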
58607 case X86ISD::VPERMV3:
58608 if (IdxVal != 0) {
58609 SDValue Src0 = InVec.getOperand(0);
58610 SDValue Mask = InVec.getOperand(1);
58611 SDValue Src1 = InVec.getOperand(2);
58612 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
58613 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
58614 DL, InSizeInBits);
58615 SDValue Shuffle =
58616 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
58617 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
58618 }
58619 break;
58620 }
58621 }
58622 }
58623
58624 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
58625 // as this is very likely to fold into a shuffle/truncation.
58626 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
58627 InVecVT.getScalarSizeInBits() == 64 &&
58628 InVec.getConstantOperandAPInt(1) == 32) {
58629 SDValue Ext =
58630 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
58631 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
58632 }
58633
58634 return SDValue();
58635}
58636
58637static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
58638 const X86Subtarget &Subtarget) {
58639 using namespace SDPatternMatch;
58640 EVT VT = N->getValueType(0);
58641 SDValue Src = N->getOperand(0);
58642 SDLoc DL(N);
58643
58644 // If this is a SCALAR_TO_VECTOR to v1i1 from an AND with 1, bypass the AND.
58645 // This occurs frequently in our masked scalar intrinsic code and our
58646 // floating point select lowering with AVX512.
58647 // TODO: SimplifyDemandedBits instead?
58648 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
58649 isOneConstant(Src.getOperand(1)))
58650 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
58651
58652 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
58653 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
58654 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
58655 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
58656 isNullConstant(Src.getOperand(1)))
58657 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
58658 Src.getOperand(1));
58659
58660 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
58661 // TODO: Move to DAGCombine/SimplifyDemandedBits?
58662 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
58663 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
58664 if (Op.getValueType() != MVT::i64)
58665 return SDValue();
58666 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
58667 if (Op.getOpcode() == Opc &&
58668 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
58669 return Op.getOperand(0);
58670 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
58671 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
58672 if (Ld->getExtensionType() == Ext &&
58673 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
58674 return Op;
58675 if (IsZeroExt) {
58676 KnownBits Known = DAG.computeKnownBits(Op);
58677 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
58678 return Op;
58679 }
58680 return SDValue();
58681 };
58682
58683 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
58684 return DAG.getBitcast(
58685 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
58686 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
58687
58688 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
58689 return DAG.getBitcast(
58690 VT,
58691 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
58692 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
58693 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
58694 }
58695
58696 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST) {
58697 SDValue SrcOp = Src.getOperand(0);
58698 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
58699 if (SrcOp.getValueType() == MVT::f64)
58700 return DAG.getBitcast(
58701 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
58702 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
58703 if (SrcOp.getValueType() == MVT::x86mmx)
58704 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
58705 }
58706
58707 if (VT == MVT::v4i32) {
58708 SDValue HalfSrc;
58709 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
58710 // to remove XMM->GPR->XMM moves.
58711 if (sd_match(Src, m_AnyExt(m_BitCast(
58712 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
58713 return DAG.getBitcast(
58714 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
58715 }
58716
58717 // See if we're broadcasting the scalar value, in which case just reuse that.
58718 // Ensure the broadcast uses this exact SDValue, not just the same node.
58719 if (VT.getScalarType() == Src.getValueType())
58720 for (SDNode *User : Src->users())
58721 if (User->getOpcode() == X86ISD::VBROADCAST &&
58722 Src == User->getOperand(0)) {
58723 unsigned SizeInBits = VT.getFixedSizeInBits();
58724 unsigned BroadcastSizeInBits =
58725 User->getValueSizeInBits(0).getFixedValue();
58726 if (BroadcastSizeInBits == SizeInBits)
58727 return SDValue(User, 0);
58728 if (BroadcastSizeInBits > SizeInBits)
58729 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
58730 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
58731 // coverage.
58732 }
58733
58734 // Check for cases where we've ended up with a scalarized shift, typically
58735 // during type legalization.
58736 switch (Src.getOpcode()) {
58737 case ISD::SHL:
58738 case ISD::SRL:
58739 case ISD::SRA:
58740 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
58741 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
58742 Src.hasOneUse()) {
58743 SDValue SrcVec =
58744 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
58745 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
58746 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
58747 Amt->getZExtValue(), DAG);
58748 }
58749 }
58750 break;
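// Scalarized funnel shifts by a constant can also be re-vectorized; the shift
// amount is reduced modulo the scalar bit width first.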
58751 case ISD::FSHL:
58752 case ISD::FSHR:
58753 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
58754 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
58755 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
58756 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
58757 Src.hasOneUse()) {
58758 uint64_t AmtVal =
58759 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
58760 SDValue SrcVec0 =
58761 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
58762 SDValue SrcVec1 =
58763 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
58764 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
58765 DAG.getConstant(AmtVal, DL, VT));
58766 }
58767 }
58768 break;
58769 }
58770
58771 return SDValue();
58772}
58773
58774// Simplify PMULDQ and PMULUDQ operations.
58775static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
58776 TargetLowering::DAGCombinerInfo &DCI,
58777 const X86Subtarget &Subtarget) {
58778 SDValue LHS = N->getOperand(0);
58779 SDValue RHS = N->getOperand(1);
58780
58781 // Canonicalize constant to RHS.
58782 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
58783 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
58784 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
58785
58786 // Multiply by zero.
58787 // Don't return RHS as it may contain UNDEFs.
58788 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
58789 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
58790
58791 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
58792 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58793 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
58794 return SDValue(N, 0);
58795
58796 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
58797 // convert it to any_extend_invec, due to the LegalOperations check, do the
58798 // conversion directly to a vector shuffle manually. This exposes combine
58799 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
58800 // combineX86ShufflesRecursively on SSE4.1 targets.
58801 // FIXME: This is basically a hack around several other issues related to
58802 // ANY_EXTEND_VECTOR_INREG.
58803 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
58804 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
58805 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
58806 LHS.getOperand(0).getValueType() == MVT::v4i32) {
58807 SDLoc dl(N);
58808 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
58809 LHS.getOperand(0), { 0, -1, 1, -1 });
58810 LHS = DAG.getBitcast(MVT::v2i64, LHS);
58811 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58812 }
58813 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
58814 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
58815 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
58816 RHS.getOperand(0).getValueType() == MVT::v4i32) {
58817 SDLoc dl(N);
58818 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
58819 RHS.getOperand(0), { 0, -1, 1, -1 });
58820 RHS = DAG.getBitcast(MVT::v2i64, RHS);
58821 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58822 }
58823
58824 return SDValue();
58825}
58826
58827// Simplify VPMADDUBSW/VPMADDWD operations.
58828static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
58829 TargetLowering::DAGCombinerInfo &DCI) {
58830 MVT VT = N->getSimpleValueType(0);
58831 SDValue LHS = N->getOperand(0);
58832 SDValue RHS = N->getOperand(1);
58833 unsigned Opc = N->getOpcode();
58834 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
58835 assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) &&
58836 "Unexpected PMADD opcode");
58837
58838 // Multiply by zero.
58839 // Don't return LHS/RHS as it may contain UNDEFs.
58840 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
58841 ISD::isBuildVectorAllZeros(RHS.getNode()))
58842 return DAG.getConstant(0, SDLoc(N), VT);
58843
58844 // Constant folding.
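// Each result element is formed from a pair of adjacent source elements:
// extend both halves, multiply, then add the two products (saturating add
// for VPMADDUBSW).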
58845 APInt LHSUndefs, RHSUndefs;
58846 SmallVector<APInt> LHSBits, RHSBits;
58847 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
58848 unsigned DstEltBits = VT.getScalarSizeInBits();
58849 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
58850 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
58851 SmallVector<APInt> Result;
58852 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
58853 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
58854 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
58855 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
58856 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
58857 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
58858 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
58859 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
58860 Result.push_back(Res);
58861 }
58862 return getConstVector(Result, VT, DAG, SDLoc(N));
58863 }
58864
58865 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58866 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
58867 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
58868 return SDValue(N, 0);
58869
58870 return SDValue();
58871}
58872
58873static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
58874 TargetLowering::DAGCombinerInfo &DCI,
58875 const X86Subtarget &Subtarget) {
58876 EVT VT = N->getValueType(0);
58877 SDValue In = N->getOperand(0);
58878 unsigned Opcode = N->getOpcode();
58879 unsigned InOpcode = In.getOpcode();
58880 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58881 SDLoc DL(N);
58882
58883 // Try to merge vector loads and extend_inreg to an extload.
58884 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
58885 In.hasOneUse()) {
58886 auto *Ld = cast<LoadSDNode>(In);
58887 if (Ld->isSimple()) {
58888 MVT SVT = In.getSimpleValueType().getVectorElementType();
58889 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
58890 ? ISD::SEXTLOAD
58891 : ISD::ZEXTLOAD;
58892 EVT MemVT = VT.changeVectorElementType(SVT);
58893 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
58894 SDValue Load = DAG.getExtLoad(
58895 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
58896 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
58897 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
58898 return Load;
58899 }
58900 }
58901 }
58902
58903 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
58904 if (Opcode == InOpcode)
58905 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
58906
58907 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
58908 // -> EXTEND_VECTOR_INREG(X).
58909 // TODO: Handle non-zero subvector indices.
58910 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
58911 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
58912 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
58913 In.getValueSizeInBits())
58914 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
58915
58916 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
58917 // TODO: Move to DAGCombine?
58918 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
58919 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
58920 In.getValueSizeInBits() == VT.getSizeInBits()) {
58921 unsigned NumElts = VT.getVectorNumElements();
58922 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
58923 EVT EltVT = In.getOperand(0).getValueType();
58924 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
58925 for (unsigned I = 0; I != NumElts; ++I)
58926 Elts[I * Scale] = In.getOperand(I);
58927 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
58928 }
58929
58930 // Attempt to combine as a shuffle on SSE41+ targets.
58931 if (Subtarget.hasSSE41()) {
58932 SDValue Op(N, 0);
58933 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
58934 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
58935 return Res;
58936 }
58937
58938 return SDValue();
58939}
58940
58941static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
58942 TargetLowering::DAGCombinerInfo &DCI) {
58943 EVT VT = N->getValueType(0);
58944 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
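// Shifting an all-zeros mask still yields all zeros.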
58945 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
58946 return DAG.getConstant(0, SDLoc(N), VT);
58947
58948 // Fold kshiftr(extract_subvector(X,C1),C2)
58949 // --> extract_subvector(kshiftr(X,C1+C2),0)
58950 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
58951 if (N->getOpcode() == X86ISD::KSHIFTR) {
58952 SDLoc DL(N);
58953 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
58954 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
58955 SDValue Src = N->getOperand(0).getOperand(0);
58956 uint64_t Amt = N->getConstantOperandVal(1) +
58957 N->getOperand(0).getConstantOperandVal(1);
58958 EVT SrcVT = Src.getValueType();
58959 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
58960 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
58961 DAG.getTargetConstant(Amt, DL, MVT::i8));
58962 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
58963 DAG.getVectorIdxConstant(0, DL));
58964 }
58965 }
58966 }
58967
58968 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
58969 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
58970 return SDValue(N, 0);
58971
58972 return SDValue();
58973}
58974
58975// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
58976// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
58977// extra instructions between the conversions by going to scalar and back.
58978static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
58979 const X86Subtarget &Subtarget) {
58980 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
58981 return SDValue();
58982
58983 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
58984 return SDValue();
58985
58986 if (N->getValueType(0) != MVT::f32 ||
58987 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
58988 return SDValue();
58989
58990 SDLoc dl(N);
58991 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
58992 N->getOperand(0).getOperand(0));
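// The immediate 4 tells CVTPS2PH to use the current MXCSR rounding mode.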
58993 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
58994 DAG.getTargetConstant(4, dl, MVT::i32));
58995 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
58996 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
58997 DAG.getVectorIdxConstant(0, dl));
58998}
58999
59000static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
59001 TargetLowering::DAGCombinerInfo &DCI,
59002 const X86Subtarget &Subtarget) {
59003 EVT VT = N->getValueType(0);
59004 bool IsStrict = N->isStrictFPOpcode();
59005 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59006 EVT SrcVT = Src.getValueType();
59007
59008 SDLoc dl(N);
59009 if (SrcVT.getScalarType() == MVT::bf16) {
59010 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
59011 !IsStrict && Src.getOperand(0).getValueType() == VT)
59012 return Src.getOperand(0);
59013
59014 if (!SrcVT.isVector())
59015 return SDValue();
59016
59017 assert(!IsStrict && "Strict FP doesn't support BF16");
59018 if (VT.getVectorElementType() == MVT::f64) {
59019 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
59020 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
59021 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
59022 }
59023 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
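// bf16 -> f32 just places the 16-bit pattern in the high half of each 32-bit
// lane, so it can be done with an integer zero-extend and a left shift by 16.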
59024 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
59025 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
59026 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
59027 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
59028 return DAG.getBitcast(VT, Src);
59029 }
59030
59031 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
59032 return SDValue();
59033
59034 if (Subtarget.hasFP16())
59035 return SDValue();
59036
59037 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
59038 return SDValue();
59039
59040 if (VT.getVectorElementType() != MVT::f32 &&
59041 VT.getVectorElementType() != MVT::f64)
59042 return SDValue();
59043
59044 unsigned NumElts = VT.getVectorNumElements();
59045 if (NumElts == 1 || !isPowerOf2_32(NumElts))
59046 return SDValue();
59047
59048 // Convert the input to vXi16.
59049 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
59050 Src = DAG.getBitcast(IntVT, Src);
59051
59052 // Widen to at least 8 input elements.
59053 if (NumElts < 8) {
59054 unsigned NumConcats = 8 / NumElts;
59055 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
59056 : DAG.getConstant(0, dl, IntVT);
59057 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
59058 Ops[0] = Src;
59059 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
59060 }
59061
59062 // Destination is vXf32 with at least 4 elements.
59063 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
59064 std::max(4U, NumElts));
59065 SDValue Cvt, Chain;
59066 if (IsStrict) {
59067 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
59068 {N->getOperand(0), Src});
59069 Chain = Cvt.getValue(1);
59070 } else {
59071 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
59072 }
59073
59074 if (NumElts < 4) {
59075 assert(NumElts == 2 && "Unexpected size");
59076 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
59077 DAG.getVectorIdxConstant(0, dl));
59078 }
59079
59080 if (IsStrict) {
59081 // Extend to the original VT if necessary.
59082 if (Cvt.getValueType() != VT) {
59083 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
59084 {Chain, Cvt});
59085 Chain = Cvt.getValue(1);
59086 }
59087 return DAG.getMergeValues({Cvt, Chain}, dl);
59088 }
59089
59090 // Extend to the original VT if necessary.
59091 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
59092}
59093
59094// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
59095// from. Limit this to cases where the loads have the same input chain and the
59096// output chains are unused. This avoids any memory ordering issues.
59097static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
59098 TargetLowering::DAGCombinerInfo &DCI) {
59099 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
59100 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
59101 "Unknown broadcast load type");
59102
59103 // Only do this if the chain result is unused.
59104 if (N->hasAnyUseOfValue(1))
59105 return SDValue();
59106
59107 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
59108
59109 SDValue Ptr = MemIntrin->getBasePtr();
59110 SDValue Chain = MemIntrin->getChain();
59111 EVT VT = N->getSimpleValueType(0);
59112 EVT MemVT = MemIntrin->getMemoryVT();
59113
59114 // Look at other users of our base pointer and try to find a wider broadcast.
59115 // The input chain and the size of the memory VT must match.
59116 for (SDNode *User : Ptr->users())
59117 if (User != N && User->getOpcode() == N->getOpcode() &&
59118 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
59119 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
59120 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
59121 MemVT.getSizeInBits() &&
59122 !User->hasAnyUseOfValue(1) &&
59123 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
59124 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
59125 VT.getSizeInBits());
59126 Extract = DAG.getBitcast(VT, Extract);
59127 return DCI.CombineTo(N, Extract, SDValue(User, 1));
59128 }
59129
59130 return SDValue();
59131}
59132
59133static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
59134 const X86Subtarget &Subtarget) {
59135 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
59136 return SDValue();
59137
59138 bool IsStrict = N->isStrictFPOpcode();
59139 EVT VT = N->getValueType(0);
59140 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59141 EVT SrcVT = Src.getValueType();
59142
59143 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
59144 SrcVT.getVectorElementType() != MVT::f32)
59145 return SDValue();
59146
59147 SDLoc dl(N);
59148
59149 SDValue Cvt, Chain;
59150 unsigned NumElts = VT.getVectorNumElements();
59151 if (Subtarget.hasFP16()) {
59152 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
59153 // v4f32 (xint_to_fp v4i64))))
59154 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
59155 // v8f16 (CVTXI2P v4i64)))
59156 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
59157 Src.getNumOperands() == 2) {
59158 SDValue Cvt0, Cvt1;
59159 SDValue Op0 = Src.getOperand(0);
59160 SDValue Op1 = Src.getOperand(1);
59161 bool IsOp0Strict = Op0->isStrictFPOpcode();
59162 if (Op0.getOpcode() != Op1.getOpcode() ||
59163 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
59164 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
59165 return SDValue();
59166 }
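// Each v4i64 conversion only defines the low four f16 lanes, so the shuffle
// mask concatenates the low halves of the two results.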
59167 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
59168 if (IsStrict) {
59169 assert(IsOp0Strict && "Op0 must be strict node");
59170 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
59171 ? X86ISD::STRICT_CVTSI2P
59172 : X86ISD::STRICT_CVTUI2P;
59173 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
59174 {Op0.getOperand(0), Op0.getOperand(1)});
59175 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
59176 {Op1.getOperand(0), Op1.getOperand(1)});
59177 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
59178 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
59179 }
59180 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
59181 : X86ISD::CVTUI2P;
59182 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
59183 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
59184 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
59185 }
59186 return SDValue();
59187 }
59188
59189 if (NumElts == 1 || !isPowerOf2_32(NumElts))
59190 return SDValue();
59191
59192 // Widen to at least 4 input elements.
59193 if (NumElts < 4)
59194 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
59195 DAG.getConstantFP(0.0, dl, SrcVT));
59196
59197 // Destination is v8i16 with at least 8 elements.
59198 EVT CvtVT =
59199 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
59200 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
59201 if (IsStrict) {
59202 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
59203 {N->getOperand(0), Src, Rnd});
59204 Chain = Cvt.getValue(1);
59205 } else {
59206 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
59207 }
59208
59209 // Extract down to real number of elements.
59210 if (NumElts < 8) {
59211 EVT IntVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
59212 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
59213 DAG.getVectorIdxConstant(0, dl));
59214 }
59215
59216 Cvt = DAG.getBitcast(VT, Cvt);
59217
59218 if (IsStrict)
59219 return DAG.getMergeValues({Cvt, Chain}, dl);
59220
59221 return Cvt;
59222}
59223
59224static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
59225 SDValue Src = N->getOperand(0);
59226
59227 // Turn MOVDQ2Q+simple_load into an mmx load.
59228 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
59229 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
59230
59231 if (LN->isSimple()) {
59232 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
59233 LN->getBasePtr(),
59234 LN->getPointerInfo(),
59235 LN->getOriginalAlign(),
59236 LN->getMemOperand()->getFlags());
59237 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
59238 return NewLd;
59239 }
59240 }
59241
59242 return SDValue();
59243}
59244
59245static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
59246 TargetLowering::DAGCombinerInfo &DCI) {
59247 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
59248 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59249 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
59250 return SDValue(N, 0);
59251
59252 return SDValue();
59253}
59254
59255// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
59256// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
59257// use x86mmx instead.
59258static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
59259 SDLoc dl(N);
59260
59261 bool MadeChange = false, CastReturnVal = false;
59262 SmallVector<SDValue, 8> Args;
59263 for (const SDValue &Arg : N->op_values()) {
59264 if (Arg.getValueType() == MVT::v1i64) {
59265 MadeChange = true;
59266 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
59267 } else
59268 Args.push_back(Arg);
59269 }
59270 SDVTList VTs = N->getVTList();
59271 SDVTList NewVTs = VTs;
59272 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
59273 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
59274 NewVTArr[0] = MVT::x86mmx;
59275 NewVTs = DAG.getVTList(NewVTArr);
59276 MadeChange = true;
59277 CastReturnVal = true;
59278 }
59279
59280 if (MadeChange) {
59281 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
59282 if (CastReturnVal) {
59283 SmallVector<SDValue, 2> Returns;
59284 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
59285 Returns.push_back(Result.getValue(i));
59286 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
59287 return DAG.getMergeValues(Returns, dl);
59288 }
59289 return Result;
59290 }
59291 return SDValue();
59292}
59293static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
59294 TargetLowering::DAGCombinerInfo &DCI) {
59295 if (!DCI.isBeforeLegalize())
59296 return SDValue();
59297
59298 unsigned IntNo = N->getConstantOperandVal(0);
59299 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
59300
59301 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59302 return FixupMMXIntrinsicTypes(N, DAG);
59303
59304 return SDValue();
59305}
59306
59307static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
59308 TargetLowering::DAGCombinerInfo &DCI) {
59309 if (!DCI.isBeforeLegalize())
59310 return SDValue();
59311
59312 unsigned IntNo = N->getConstantOperandVal(1);
59313 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
59314
59315 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59316 return FixupMMXIntrinsicTypes(N, DAG);
59317
59318 return SDValue();
59319}
59320
59321static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
59322 TargetLowering::DAGCombinerInfo &DCI) {
59323 if (!DCI.isBeforeLegalize())
59324 return SDValue();
59325
59326 unsigned IntNo = N->getConstantOperandVal(1);
59327 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
59328
59329 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59330 return FixupMMXIntrinsicTypes(N, DAG);
59331
59332 return SDValue();
59333}
59334
59335SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
59336 DAGCombinerInfo &DCI) const {
59337 SelectionDAG &DAG = DCI.DAG;
59338 switch (N->getOpcode()) {
59339 // clang-format off
59340 default: break;
59341 case ISD::SCALAR_TO_VECTOR:
59342 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
59343 case ISD::EXTRACT_VECTOR_ELT:
59344 case X86ISD::PEXTRW:
59345 case X86ISD::PEXTRB:
59346 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
59347 case ISD::CONCAT_VECTORS:
59348 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
59349 case ISD::INSERT_SUBVECTOR:
59350 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
59351 case ISD::EXTRACT_SUBVECTOR:
59352 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
59353 case ISD::VSELECT:
59354 case ISD::SELECT:
59355 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
59356 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
59357 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
59358 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
59359 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
59360 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
59361 case X86ISD::ADD:
59362 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
59363 case X86ISD::CLOAD:
59364 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
59365 case X86ISD::SBB: return combineSBB(N, DAG);
59366 case X86ISD::ADC: return combineADC(N, DAG, DCI);
59367 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
59368 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
59369 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
59370 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
59371 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
59372 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
59373 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
59374 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
59375 case ISD::AVGCEILS:
59376 case ISD::AVGCEILU:
59377 case ISD::AVGFLOORS:
59378 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
59379 case X86ISD::BEXTR:
59380 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
59381 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
59382 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
59383 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
59384 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
59385 case X86ISD::VEXTRACT_STORE:
59386 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
59387 case ISD::SINT_TO_FP:
59388 case ISD::STRICT_SINT_TO_FP:
59389 return combineSIntToFP(N, DAG, DCI, Subtarget);
59390 case ISD::UINT_TO_FP:
59391 case ISD::STRICT_UINT_TO_FP:
59392 return combineUIntToFP(N, DAG, Subtarget);
59393 case ISD::LRINT:
59394 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
59395 case ISD::FADD:
59396 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
59397 case X86ISD::VFCMULC:
59398 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
59399 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
59400 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
59401 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
59402 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
59403 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
59404 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
59405 case X86ISD::FXOR:
59406 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
59407 case X86ISD::FMIN:
59408 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
59409 case ISD::FMINNUM:
59410 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
59411 case X86ISD::CVTSI2P:
59412 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
59413 case X86ISD::CVTP2SI:
59414 case X86ISD::CVTP2UI:
59415 case X86ISD::STRICT_CVTTP2SI:
59416 case X86ISD::CVTTP2SI:
59417 case X86ISD::STRICT_CVTTP2UI:
59418 case X86ISD::CVTTP2UI:
59419 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
59420 case X86ISD::STRICT_CVTPH2PS:
59421 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
59422 case X86ISD::BT: return combineBT(N, DAG, DCI);
59423 case ISD::ANY_EXTEND:
59424 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
59425 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
59426 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
59427 case ISD::ANY_EXTEND_VECTOR_INREG:
59428 case ISD::SIGN_EXTEND_VECTOR_INREG:
59429 case ISD::ZERO_EXTEND_VECTOR_INREG:
59430 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
59431 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
59432 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
59433 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
59434 case X86ISD::PACKSS:
59435 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
59436 case X86ISD::HADD:
59437 case X86ISD::HSUB:
59438 case X86ISD::FHADD:
59439 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
59440 case X86ISD::VSHL:
59441 case X86ISD::VSRA:
59442 case X86ISD::VSRL:
59443 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
59444 case X86ISD::VSHLI:
59445 case X86ISD::VSRAI:
59446 case X86ISD::VSRLI:
59447 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
59448 case ISD::INSERT_VECTOR_ELT:
59449 case X86ISD::PINSRB:
59450 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
59451 case X86ISD::SHUFP: // Handle all target specific shuffles
59452 case X86ISD::INSERTPS:
59453 case X86ISD::EXTRQI:
59454 case X86ISD::INSERTQI:
59455 case X86ISD::VALIGN:
59456 case X86ISD::PALIGNR:
59457 case X86ISD::VSHLDQ:
59458 case X86ISD::VSRLDQ:
59459 case X86ISD::BLENDI:
59460 case X86ISD::UNPCKH:
59461 case X86ISD::UNPCKL:
59462 case X86ISD::MOVHLPS:
59463 case X86ISD::MOVLHPS:
59464 case X86ISD::PSHUFB:
59465 case X86ISD::PSHUFD:
59466 case X86ISD::PSHUFHW:
59467 case X86ISD::PSHUFLW:
59468 case X86ISD::MOVSHDUP:
59469 case X86ISD::MOVSLDUP:
59470 case X86ISD::MOVDDUP:
59471 case X86ISD::MOVSS:
59472 case X86ISD::MOVSD:
59473 case X86ISD::MOVSH:
59474 case X86ISD::VBROADCAST:
59475 case X86ISD::VPPERM:
59476 case X86ISD::VPERMI:
59477 case X86ISD::VPERMV:
59478 case X86ISD::VPERMV3:
59479 case X86ISD::VPERMIL2:
59480 case X86ISD::VPERMILPI:
59481 case X86ISD::VPERMILPV:
59482 case X86ISD::VPERM2X128:
59483 case X86ISD::SHUF128:
59484 case X86ISD::VZEXT_MOVL:
59485 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
59486 case X86ISD::FMADD_RND:
59487 case X86ISD::FMSUB:
59488 case X86ISD::STRICT_FMSUB:
59489 case X86ISD::FMSUB_RND:
59490 case X86ISD::FNMADD:
59491 case X86ISD::STRICT_FNMADD:
59492 case X86ISD::FNMADD_RND:
59493 case X86ISD::FNMSUB:
59494 case X86ISD::STRICT_FNMSUB:
59495 case X86ISD::FNMSUB_RND:
59496 case ISD::FMA:
59497 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
59498 case X86ISD::FMADDSUB_RND:
59499 case X86ISD::FMSUBADD_RND:
59500 case X86ISD::FMADDSUB:
59501 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
59502 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
59503 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
59504 case X86ISD::MGATHER:
59505 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
59506 case ISD::MGATHER:
59507 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
59508 case X86ISD::PCMPEQ:
59509 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
59510 case X86ISD::PMULDQ:
59511 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
59512 case X86ISD::VPMADDUBSW:
59513 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
59514 case X86ISD::KSHIFTL:
59515 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
59516 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
59517 case ISD::STRICT_FP_EXTEND:
59518 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
59519 case ISD::STRICT_FP_ROUND:
59520 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
59521 case X86ISD::VBROADCAST_LOAD:
59522 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
59523 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
59524 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
59525 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
59526 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
59527 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
59528 case ISD::FP_TO_SINT_SAT:
59529 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
59530 // clang-format on
59531 }
59532
59533 return SDValue();
59534}
59535
59537 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
59538}
59539
59540// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
59541bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
59542 EVT ExtVT) const {
59543 return Subtarget.hasAVX512() || !VT.isVector();
59544}
59545
59546bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
59547 if (!isTypeLegal(VT))
59548 return false;
59549
59550 // There are no vXi8 shifts.
59551 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
59552 return false;
59553
59554 // TODO: Almost no 8-bit ops are desirable because they have no actual
59555 // size/speed advantages vs. 32-bit ops, but they do have a major
59556 // potential disadvantage by causing partial register stalls.
59557 //
59558 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
59559 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
59560 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
59561 // check for a constant operand to the multiply.
59562 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
59563 return false;
59564
59565 // i16 instruction encodings are longer and some i16 instructions are slow,
59566 // so those are not desirable.
59567 if (VT == MVT::i16) {
59568 switch (Opc) {
59569 default:
59570 break;
59571 case ISD::LOAD:
59572 case ISD::SIGN_EXTEND:
59573 case ISD::ZERO_EXTEND:
59574 case ISD::ANY_EXTEND:
59575 case ISD::MUL:
59576 return false;
59577 case ISD::SHL:
59578 case ISD::SRA:
59579 case ISD::SRL:
59580 case ISD::SUB:
59581 case ISD::ADD:
59582 case ISD::AND:
59583 case ISD::OR:
59584 case ISD::XOR:
59585 // NDD instructions never have the "partial register write" issue because
59586 // the destination register's upper bits [63:OSIZE] are zeroed even when
59587 // OSIZE=8/16.
59588 return Subtarget.hasNDD();
59589 }
59590 }
59591
59592 // Any legal type not explicitly accounted for above here is desirable.
59593 return true;
59594}
59595
59596SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
59597 SDValue Value, SDValue Addr,
59598 int JTI,
59599 SelectionDAG &DAG) const {
59600 const Module *M = DAG.getMachineFunction().getFunction().getParent();
59601 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
59602 if (IsCFProtectionSupported) {
59603 // When control-flow branch protection is enabled, we need to add a
59604 // notrack prefix to the indirect branch.
59605 // To do that we create an NT_BRIND SDNode.
59606 // Upon ISel, the pattern will convert it to a jmp with the NoTrack prefix.
59607 SDValue Chain = Value;
59608 // Jump table debug info is only needed if CodeView is enabled.
59609 if (DAG.getTarget().getTargetTriple().isOSBinFormatCOFF())
59610 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
59611 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
59612 }
59613
59614 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
59615}
59616
59617TargetLowering::AndOrSETCCFoldKind
59618X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
59619 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
59621 EVT VT = LogicOp->getValueType(0);
59622 EVT OpVT = SETCC0->getOperand(0).getValueType();
59623 if (!VT.isInteger())
59624 return AndOrSETCCFoldKind::None;
59625
59626 if (VT.isVector())
59631
59632 // Don't use `NotAnd` as even though `not` is generally shorter code size than
59633 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
59634 // `NotAnd` applies, `AddAnd` does as well.
59635 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
59636 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
59637 return AndOrSETCCFoldKind::AddAnd;
59638}
59639
59640bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
59641 EVT VT = Op.getValueType();
59642 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
59643 isa<ConstantSDNode>(Op.getOperand(1));
59644
59645 // i16 is legal, but undesirable since i16 instruction encodings are longer
59646 // and some i16 instructions are slow.
59647 // 8-bit multiply-by-constant can usually be expanded to something cheaper
59648 // using LEA and/or other ALU ops.
59649 if (VT != MVT::i16 && !Is8BitMulByConstant)
59650 return false;
59651
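// A load feeding Op whose only user stores back to the same address is a
// read-modify-write candidate; promoting Op to i32 would break that fold.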
59652 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
59653 if (!Op.hasOneUse())
59654 return false;
59655 SDNode *User = *Op->user_begin();
59656 if (User->getOpcode() != ISD::STORE)
59657 return false;
59658 auto *Ld = cast<LoadSDNode>(Load);
59659 auto *St = cast<StoreSDNode>(User);
59660 return Ld->getBasePtr() == St->getBasePtr();
59661 };
59662
59663 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
59664 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
59665 return false;
59666 if (!Op.hasOneUse())
59667 return false;
59668 SDNode *User = *Op->user_begin();
59669 if (User->getOpcode() != ISD::ATOMIC_STORE)
59670 return false;
59671 auto *Ld = cast<AtomicSDNode>(Load);
59672 auto *St = cast<AtomicSDNode>(User);
59673 return Ld->getBasePtr() == St->getBasePtr();
59674 };
59675
59676 auto IsFoldableZext = [](SDValue Op) {
59677 if (!Op.hasOneUse())
59678 return false;
59679 SDNode *User = *Op->user_begin();
59680 EVT VT = User->getValueType(0);
59681 return (User->getOpcode() == ISD::ZERO_EXTEND &&
59682 (VT == MVT::i32 || VT == MVT::i64));
59683 };
59684
59685 bool Commute = false;
59686 switch (Op.getOpcode()) {
59687 default: return false;
59688 case ISD::SIGN_EXTEND:
59689 case ISD::ZERO_EXTEND:
59690 case ISD::ANY_EXTEND:
59691 break;
59692 case ISD::SHL:
59693 case ISD::SRA:
59694 case ISD::SRL: {
59695 SDValue N0 = Op.getOperand(0);
59696 // Look out for (store (shl (load), x)).
59697 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
59698 return false;
59699 break;
59700 }
59701 case ISD::MUL:
59702 // When ZU is enabled, we prefer to not promote for MUL by a constant
59703 // when there is an opportunity to fold a zext with imulzu.
59704 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
59705 (isa<ConstantSDNode>(Op.getOperand(0)) ||
59706 isa<ConstantSDNode>(Op.getOperand(1))))
59707 return false;
59708 [[fallthrough]];
59709 case ISD::ADD:
59710 case ISD::AND:
59711 case ISD::OR:
59712 case ISD::XOR:
59713 Commute = true;
59714 [[fallthrough]];
59715 case ISD::SUB: {
59716 SDValue N0 = Op.getOperand(0);
59717 SDValue N1 = Op.getOperand(1);
59718 // Avoid disabling potential load folding opportunities.
59719 if (X86::mayFoldLoad(N1, Subtarget) &&
59720 (!Commute || !isa<ConstantSDNode>(N0) ||
59721 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
59722 return false;
59723 if (X86::mayFoldLoad(N0, Subtarget) &&
59724 ((Commute && !isa<ConstantSDNode>(N1)) ||
59725 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
59726 return false;
59727 if (IsFoldableAtomicRMW(N0, Op) ||
59728 (Commute && IsFoldableAtomicRMW(N1, Op)))
59729 return false;
59730 }
59731 }
59732
59733 PVT = MVT::i32;
59734 return true;
59735}
59736
59737//===----------------------------------------------------------------------===//
59738// X86 Inline Assembly Support
59739//===----------------------------------------------------------------------===//
59740
59741// Helper to match a string against a sequence of whitespace-separated pieces.
59742static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
59743 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
59744
59745 for (StringRef Piece : Pieces) {
59746 if (!S.starts_with(Piece)) // Check if the piece matches.
59747 return false;
59748
59749 S = S.substr(Piece.size());
59750 StringRef::size_type Pos = S.find_first_not_of(" \t");
59751 if (Pos == 0) // We matched a prefix.
59752 return false;
59753
59754 S = S.substr(Pos);
59755 }
59756
59757 return S.empty();
59758}
59759
59759
59760static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
59761
59762 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
59763 if (llvm::is_contained(AsmPieces, "~{cc}") &&
59764 llvm::is_contained(AsmPieces, "~{flags}") &&
59765 llvm::is_contained(AsmPieces, "~{fpsr}")) {
59766
59767 if (AsmPieces.size() == 3)
59768 return true;
59769 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
59770 return true;
59771 }
59772 }
59773 return false;
59774}
59775
59776bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
59777 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
59778
59779 const std::string &AsmStr = IA->getAsmString();
59780
59781 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
59782 if (!Ty || Ty->getBitWidth() % 16 != 0)
59783 return false;
59784
59785 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
59786 SmallVector<StringRef, 4> AsmPieces;
59787 SplitString(AsmStr, AsmPieces, ";\n");
59788
59789 switch (AsmPieces.size()) {
59790 default: return false;
59791 case 1:
59792 // FIXME: this should verify that we are targeting a 486 or better. If not,
59793 // we will turn this bswap into something that will be lowered to logical
59794 // ops instead of emitting the bswap asm. For now, we don't support 486 or
59795 // lower so don't worry about this.
59796 // bswap $0
59797 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
59798 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
59799 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
59800 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
59801 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
59802 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
59803 // No need to check constraints, nothing other than the equivalent of
59804 // "=r,0" would be valid here.
59805 return IntrinsicLowering::LowerToByteSwap(CI);
59806 }
59807
59808 // rorw $$8, ${0:w} --> llvm.bswap.i16
59809 if (CI->getType()->isIntegerTy(16) &&
59810 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
59811 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
59812 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
59813 AsmPieces.clear();
59814 StringRef ConstraintsStr = IA->getConstraintString();
59815 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
59816 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
59817 if (clobbersFlagRegisters(AsmPieces))
59818 return IntrinsicLowering::LowerToByteSwap(CI);
59819 }
59820 break;
59821 case 3:
59822 if (CI->getType()->isIntegerTy(32) &&
59823 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
59824 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
59825 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
59826 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
59827 AsmPieces.clear();
59828 StringRef ConstraintsStr = IA->getConstraintString();
59829 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
59830 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
59831 if (clobbersFlagRegisters(AsmPieces))
59832 return IntrinsicLowering::LowerToByteSwap(CI);
59833 }
59834
59835 if (CI->getType()->isIntegerTy(64)) {
59836 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
59837 if (Constraints.size() >= 2 &&
59838 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
59839 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
59840 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
59841 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
59842 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
59843 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
59844 return IntrinsicLowering::LowerToByteSwap(CI);
59845 }
59846 }
59847 break;
59848 }
59849 return false;
59850}
59851
59852static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
59853 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
59854 .Case("{@cca}", X86::COND_A)
59855 .Case("{@ccae}", X86::COND_AE)
59856 .Case("{@ccb}", X86::COND_B)
59857 .Case("{@ccbe}", X86::COND_BE)
59858 .Case("{@ccc}", X86::COND_B)
59859 .Case("{@cce}", X86::COND_E)
59860 .Case("{@ccz}", X86::COND_E)
59861 .Case("{@ccg}", X86::COND_G)
59862 .Case("{@ccge}", X86::COND_GE)
59863 .Case("{@ccl}", X86::COND_L)
59864 .Case("{@ccle}", X86::COND_LE)
59865 .Case("{@ccna}", X86::COND_BE)
59866 .Case("{@ccnae}", X86::COND_B)
59867 .Case("{@ccnb}", X86::COND_AE)
59868 .Case("{@ccnbe}", X86::COND_A)
59869 .Case("{@ccnc}", X86::COND_AE)
59870 .Case("{@ccne}", X86::COND_NE)
59871 .Case("{@ccnz}", X86::COND_NE)
59872 .Case("{@ccng}", X86::COND_LE)
59873 .Case("{@ccnge}", X86::COND_L)
59874 .Case("{@ccnl}", X86::COND_GE)
59875 .Case("{@ccnle}", X86::COND_G)
59876 .Case("{@ccno}", X86::COND_NO)
59877 .Case("{@ccnp}", X86::COND_NP)
59878 .Case("{@ccns}", X86::COND_NS)
59879 .Case("{@cco}", X86::COND_O)
59880 .Case("{@ccp}", X86::COND_P)
59881 .Case("{@ccs}", X86::COND_S)
59882 .Default(X86::COND_INVALID);
59883 return Cond;
59884}
59885
59886/// Given a constraint letter, return the type of constraint for this target.
59887X86TargetLowering::ConstraintType
59888X86TargetLowering::getConstraintType(StringRef Constraint) const {
59889 if (Constraint.size() == 1) {
59890 switch (Constraint[0]) {
59891 case 'R':
59892 case 'q':
59893 case 'Q':
59894 case 'f':
59895 case 't':
59896 case 'u':
59897 case 'y':
59898 case 'x':
59899 case 'v':
59900 case 'l':
59901 case 'k': // AVX512 masking registers.
59902 return C_RegisterClass;
59903 case 'a':
59904 case 'b':
59905 case 'c':
59906 case 'd':
59907 case 'S':
59908 case 'D':
59909 case 'A':
59910 return C_Register;
59911 case 'I':
59912 case 'J':
59913 case 'K':
59914 case 'N':
59915 case 'G':
59916 case 'L':
59917 case 'M':
59918 return C_Immediate;
59919 case 'C':
59920 case 'e':
59921 case 'Z':
59922 return C_Other;
59923 default:
59924 break;
59925 }
59926 }
59927 else if (Constraint.size() == 2) {
59928 switch (Constraint[0]) {
59929 default:
59930 break;
59931 case 'W':
59932 if (Constraint[1] != 's')
59933 break;
59934 return C_Other;
59935 case 'Y':
59936 switch (Constraint[1]) {
59937 default:
59938 break;
59939 case 'z':
59940 return C_Register;
59941 case 'i':
59942 case 'm':
59943 case 'k':
59944 case 't':
59945 case '2':
59946 return C_RegisterClass;
59947 }
59948 break;
59949 case 'j':
59950 switch (Constraint[1]) {
59951 default:
59952 break;
59953 case 'r':
59954 case 'R':
59955 return C_RegisterClass;
59956 }
59957 }
59958 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
59959 return C_Other;
59960 return TargetLowering::getConstraintType(Constraint);
59961}
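// A few illustrative classifications from the switch above: "a" (the
// RAX/EAX/AX/AL family) is a single specific register (C_Register), "I" (an
// immediate in [0, 31], e.g. a 32-bit shift count) is C_Immediate, and
// "x" / "v" (SSE/AVX vector registers) are C_RegisterClass.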
59962
59963/// Examine constraint type and operand type and determine a weight value.
59964/// This object must already have been set up with the operand type
59965/// and the current alternative constraint selected.
59966 TargetLowering::ConstraintWeight
59967 X86TargetLowering::getSingleConstraintMatchWeight(
59968 AsmOperandInfo &Info, const char *Constraint) const {
59969 ConstraintWeight Wt = CW_Invalid;
59970 Value *CallOperandVal = Info.CallOperandVal;
59971 // If we don't have a value, we can't do a match,
59972 // but allow it at the lowest weight.
59973 if (!CallOperandVal)
59974 return CW_Default;
59975 Type *Ty = CallOperandVal->getType();
59976 // Look at the constraint type.
59977 switch (*Constraint) {
59978 default:
59979 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
59980 [[fallthrough]];
59981 case 'R':
59982 case 'q':
59983 case 'Q':
59984 case 'a':
59985 case 'b':
59986 case 'c':
59987 case 'd':
59988 case 'S':
59989 case 'D':
59990 case 'A':
59991 if (CallOperandVal->getType()->isIntegerTy())
59992 Wt = CW_SpecificReg;
59993 break;
59994 case 'f':
59995 case 't':
59996 case 'u':
59997 if (Ty->isFloatingPointTy())
59998 Wt = CW_SpecificReg;
59999 break;
60000 case 'y':
60001 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
60002 Wt = CW_SpecificReg;
60003 break;
60004 case 'Y':
60005 if (StringRef(Constraint).size() != 2)
60006 break;
60007 switch (Constraint[1]) {
60008 default:
60009 return CW_Invalid;
60010 // XMM0
60011 case 'z':
60012 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
60013 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
60014 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
60015 return CW_SpecificReg;
60016 return CW_Invalid;
60017 // Conditional OpMask regs (AVX512)
60018 case 'k':
60019 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
60020 return CW_Register;
60021 return CW_Invalid;
60022 // Any MMX reg
60023 case 'm':
60024 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
60025 return CW_SpecificReg;
60026 return CW_Invalid;
60027 // Any SSE reg when ISA >= SSE2, same as 'x'
60028 case 'i':
60029 case 't':
60030 case '2':
60031 if (!Subtarget.hasSSE2())
60032 return CW_Invalid;
60033 break;
60034 }
60035 break;
60036 case 'j':
60037 if (StringRef(Constraint).size() != 2)
60038 break;
60039 switch (Constraint[1]) {
60040 default:
60041 return CW_Invalid;
60042 case 'r':
60043 case 'R':
60044 if (CallOperandVal->getType()->isIntegerTy())
60045 Wt = CW_SpecificReg;
60046 break;
60047 }
60048 break;
60049 case 'v':
60050 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
60051 Wt = CW_Register;
60052 [[fallthrough]];
60053 case 'x':
60054 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
60055 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
60056 Wt = CW_Register;
60057 break;
60058 case 'k':
60059 // Enable conditional vector operations using %k<#> registers.
60060 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
60061 Wt = CW_Register;
60062 break;
60063 case 'I':
60064 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
60065 if (C->getZExtValue() <= 31)
60066 Wt = CW_Constant;
60067 break;
60068 case 'J':
60069 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60070 if (C->getZExtValue() <= 63)
60071 Wt = CW_Constant;
60072 break;
60073 case 'K':
60074 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60075 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
60076 Wt = CW_Constant;
60077 break;
60078 case 'L':
60079 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60080 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
60081 Wt = CW_Constant;
60082 break;
60083 case 'M':
60084 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60085 if (C->getZExtValue() <= 3)
60086 Wt = CW_Constant;
60087 break;
60088 case 'N':
60089 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60090 if (C->getZExtValue() <= 0xff)
60091 Wt = CW_Constant;
60092 break;
60093 case 'G':
60094 case 'C':
60095 if (isa<ConstantFP>(CallOperandVal))
60096 Wt = CW_Constant;
60097 break;
60098 case 'e':
60099 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60100 if ((C->getSExtValue() >= -0x80000000LL) &&
60101 (C->getSExtValue() <= 0x7fffffffLL))
60102 Wt = CW_Constant;
60103 break;
60104 case 'Z':
60105 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60106 if (C->getZExtValue() <= 0xffffffff)
60107 Wt = CW_Constant;
60108 break;
60109 }
60110 return Wt;
60111}
60112
60113/// Try to replace an X constraint, which matches anything, with another that
60114/// has more specific requirements based on the type of the corresponding
60115/// operand.
60116 const char *X86TargetLowering::
60117 LowerXConstraint(EVT ConstraintVT) const {
60118 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
60119 // 'f' like normal targets.
60120 if (ConstraintVT.isFloatingPoint()) {
60121 if (Subtarget.hasSSE1())
60122 return "x";
60123 }
60124
60125 return TargetLowering::LowerXConstraint(ConstraintVT);
60126}
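// For illustration: an operand constrained with "X" (anything) whose value is
// floating point, e.g. asm("..." : "=X"(F)) with F a float, is rewritten to
// the "x" constraint (an SSE register) when SSE1 is available; otherwise the
// generic TargetLowering handling (typically "f", the FP stack) applies.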
60127
60128// Lower @cc targets via setcc.
60129 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
60130 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
60131 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
60132 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
60133 if (Cond == X86::COND_INVALID)
60134 return SDValue();
60135 // Check that return type is valid.
60136 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
60137 OpInfo.ConstraintVT.getSizeInBits() < 8)
60138 report_fatal_error("Glue output operand is of invalid type");
60139
60140 // Get EFLAGS register. Only update chain when copyfrom is glued.
60141 if (Glue.getNode()) {
60142 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
60143 Chain = Glue.getValue(1);
60144 } else
60145 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
60146 // Extract CC code.
60147 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
60148 // Extend to 32-bits
60149 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
60150
60151 return Result;
60152}
60153
60154/// Lower the specified operand into the Ops vector.
60155/// If it is invalid, don't add anything to Ops.
60156 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
60157 StringRef Constraint,
60158 std::vector<SDValue> &Ops,
60159 SelectionDAG &DAG) const {
60160 SDValue Result;
60161 char ConstraintLetter = Constraint[0];
60162 switch (ConstraintLetter) {
60163 default: break;
60164 case 'I':
60165 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60166 if (C->getZExtValue() <= 31) {
60167 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60168 Op.getValueType());
60169 break;
60170 }
60171 }
60172 return;
60173 case 'J':
60174 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60175 if (C->getZExtValue() <= 63) {
60176 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60177 Op.getValueType());
60178 break;
60179 }
60180 }
60181 return;
60182 case 'K':
60183 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60184 if (isInt<8>(C->getSExtValue())) {
60185 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60186 Op.getValueType());
60187 break;
60188 }
60189 }
60190 return;
60191 case 'L':
60192 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60193 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
60194 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
60195 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
60196 Op.getValueType());
60197 break;
60198 }
60199 }
60200 return;
60201 case 'M':
60202 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60203 if (C->getZExtValue() <= 3) {
60204 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60205 Op.getValueType());
60206 break;
60207 }
60208 }
60209 return;
60210 case 'N':
60211 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60212 if (C->getZExtValue() <= 255) {
60213 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60214 Op.getValueType());
60215 break;
60216 }
60217 }
60218 return;
60219 case 'O':
60220 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60221 if (C->getZExtValue() <= 127) {
60222 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60223 Op.getValueType());
60224 break;
60225 }
60226 }
60227 return;
60228 case 'e': {
60229 // 32-bit signed value
60230 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60231 if (ConstantSDNode::isValueValidForType(MVT::i32,
60232 C->getSExtValue())) {
60233 // Widen to 64 bits here to get it sign extended.
60234 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
60235 break;
60236 }
60237 // FIXME gcc accepts some relocatable values here too, but only in certain
60238 // memory models; it's complicated.
60239 }
60240 return;
60241 }
60242 case 'W': {
60243 assert(Constraint[1] == 's');
60244 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
60245 // offset.
60246 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
60247 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
60248 BA->getValueType(0)));
60249 } else {
60250 int64_t Offset = 0;
60251 if (Op->getOpcode() == ISD::ADD &&
60252 isa<ConstantSDNode>(Op->getOperand(1))) {
60253 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
60254 Op = Op->getOperand(0);
60255 }
60256 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
60257 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
60258 GA->getValueType(0), Offset));
60259 }
60260 return;
60261 }
60262 case 'Z': {
60263 // 32-bit unsigned value
60264 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60265 if (ConstantSDNode::isValueValidForType(MVT::i32,
60266 C->getZExtValue())) {
60267 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60268 Op.getValueType());
60269 break;
60270 }
60271 }
60272 // FIXME gcc accepts some relocatable values here too, but only in certain
60273 // memory models; it's complicated.
60274 return;
60275 }
60276 case 'i': {
60277 // Literal immediates are always ok.
60278 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
60279 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
60280 BooleanContent BCont = getBooleanContents(MVT::i64);
60281 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
60282 : ISD::SIGN_EXTEND;
60283 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
60284 : CST->getSExtValue();
60285 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
60286 break;
60287 }
60288
60289 // In any sort of PIC mode addresses need to be computed at runtime by
60290 // adding in a register or some sort of table lookup. These can't
60291 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
60292 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
60293 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
60294 return;
60295
60296 // If we are in non-pic codegen mode, we allow the address of a global (with
60297 // an optional displacement) to be used with 'i'.
60298 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
60299 // If we require an extra load to get this address, as in PIC mode, we
60300 // can't accept it.
60301 if (isGlobalStubReference(
60302 Subtarget.classifyGlobalReference(GA->getGlobal())))
60303 return;
60304 break;
60305 }
60306 }
60307
60308 if (Result.getNode()) {
60309 Ops.push_back(Result);
60310 return;
60311 }
60312 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
60313}
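// Illustrative uses of the immediate constraints handled above:
//   asm("shll %1, %0" : "+r"(V) : "I"(3));        // 'I': constant in [0, 31]
//   asm("outb %0, %1" : : "a"(Data), "N"(0x80));  // 'N': constant in [0, 255]
// An out-of-range constant adds nothing to Ops, and constraints not handled
// here fall through to TargetLowering::LowerAsmOperandForConstraint.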
60314
60315/// Check if \p RC is a general purpose register class.
60316/// I.e., GR* or one of their variant.
60317static bool isGRClass(const TargetRegisterClass &RC) {
60318 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
60319 RC.hasSuperClassEq(&X86::GR16RegClass) ||
60320 RC.hasSuperClassEq(&X86::GR32RegClass) ||
60321 RC.hasSuperClassEq(&X86::GR64RegClass) ||
60322 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
60323}
60324
60325/// Check if \p RC is a vector register class.
60326/// I.e., FR* / VR* or one of their variant.
60327static bool isFRClass(const TargetRegisterClass &RC) {
60328 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
60329 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
60330 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
60331 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
60332 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
60333 RC.hasSuperClassEq(&X86::VR512RegClass);
60334}
60335
60336/// Check if \p RC is a mask register class.
60337/// I.e., VK* or one of their variant.
60338static bool isVKClass(const TargetRegisterClass &RC) {
60339 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
60340 RC.hasSuperClassEq(&X86::VK2RegClass) ||
60341 RC.hasSuperClassEq(&X86::VK4RegClass) ||
60342 RC.hasSuperClassEq(&X86::VK8RegClass) ||
60343 RC.hasSuperClassEq(&X86::VK16RegClass) ||
60344 RC.hasSuperClassEq(&X86::VK32RegClass) ||
60345 RC.hasSuperClassEq(&X86::VK64RegClass);
60346}
60347
60348static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
60349 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
60350}
60351
60352std::pair<unsigned, const TargetRegisterClass *>
60353 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
60354 StringRef Constraint,
60355 MVT VT) const {
60356 // First, see if this is a constraint that directly corresponds to an LLVM
60357 // register class.
60358 if (Constraint.size() == 1) {
60359 // GCC Constraint Letters
60360 switch (Constraint[0]) {
60361 default: break;
60362 // 'A' means [ER]AX + [ER]DX.
60363 case 'A':
60364 if (Subtarget.is64Bit())
60365 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
60366 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
60367 "Expecting 64, 32 or 16 bit subtarget");
60368 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
60369
60370 // TODO: Slight differences here in allocation order and leaving
60371 // RIP in the class. Do they matter any more here than they do
60372 // in the normal allocation?
60373 case 'k':
60374 if (Subtarget.hasAVX512()) {
60375 if (VT == MVT::v1i1 || VT == MVT::i1)
60376 return std::make_pair(0U, &X86::VK1RegClass);
60377 if (VT == MVT::v8i1 || VT == MVT::i8)
60378 return std::make_pair(0U, &X86::VK8RegClass);
60379 if (VT == MVT::v16i1 || VT == MVT::i16)
60380 return std::make_pair(0U, &X86::VK16RegClass);
60381 }
60382 if (Subtarget.hasBWI()) {
60383 if (VT == MVT::v32i1 || VT == MVT::i32)
60384 return std::make_pair(0U, &X86::VK32RegClass);
60385 if (VT == MVT::v64i1 || VT == MVT::i64)
60386 return std::make_pair(0U, &X86::VK64RegClass);
60387 }
60388 break;
60389 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
60390 if (Subtarget.is64Bit()) {
60391 if (VT == MVT::i8 || VT == MVT::i1)
60392 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60393 ? &X86::GR8RegClass
60394 : &X86::GR8_NOREX2RegClass);
60395 if (VT == MVT::i16)
60396 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60397 ? &X86::GR16RegClass
60398 : &X86::GR16_NOREX2RegClass);
60399 if (VT == MVT::i32 || VT == MVT::f32)
60400 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60401 ? &X86::GR32RegClass
60402 : &X86::GR32_NOREX2RegClass);
60403 if (VT != MVT::f80 && !VT.isVector())
60404 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60405 ? &X86::GR64RegClass
60406 : &X86::GR64_NOREX2RegClass);
60407 break;
60408 }
60409 [[fallthrough]];
60410 // 32-bit fallthrough
60411 case 'Q': // Q_REGS
60412 if (VT == MVT::i8 || VT == MVT::i1)
60413 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
60414 if (VT == MVT::i16)
60415 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
60416 if (VT == MVT::i32 || VT == MVT::f32 ||
60417 (!VT.isVector() && !Subtarget.is64Bit()))
60418 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
60419 if (VT != MVT::f80 && !VT.isVector())
60420 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
60421 break;
60422 case 'r': // GENERAL_REGS
60423 case 'l': // INDEX_REGS
60424 if (VT == MVT::i8 || VT == MVT::i1)
60425 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60426 ? &X86::GR8RegClass
60427 : &X86::GR8_NOREX2RegClass);
60428 if (VT == MVT::i16)
60429 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60430 ? &X86::GR16RegClass
60431 : &X86::GR16_NOREX2RegClass);
60432 if (VT == MVT::i32 || VT == MVT::f32 ||
60433 (!VT.isVector() && !Subtarget.is64Bit()))
60434 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60435 ? &X86::GR32RegClass
60436 : &X86::GR32_NOREX2RegClass);
60437 if (VT != MVT::f80 && !VT.isVector())
60438 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60439 ? &X86::GR64RegClass
60440 : &X86::GR64_NOREX2RegClass);
60441 break;
60442 case 'R': // LEGACY_REGS
60443 if (VT == MVT::i8 || VT == MVT::i1)
60444 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
60445 if (VT == MVT::i16)
60446 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
60447 if (VT == MVT::i32 || VT == MVT::f32 ||
60448 (!VT.isVector() && !Subtarget.is64Bit()))
60449 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
60450 if (VT != MVT::f80 && !VT.isVector())
60451 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
60452 break;
60453 case 'f': // FP Stack registers.
60454 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
60455 // value to the correct fpstack register class.
60456 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
60457 return std::make_pair(0U, &X86::RFP32RegClass);
60458 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
60459 return std::make_pair(0U, &X86::RFP64RegClass);
60460 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
60461 return std::make_pair(0U, &X86::RFP80RegClass);
60462 break;
60463 case 'y': // MMX_REGS if MMX allowed.
60464 if (!Subtarget.hasMMX()) break;
60465 return std::make_pair(0U, &X86::VR64RegClass);
60466 case 'v':
60467 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
60468 if (!Subtarget.hasSSE1()) break;
60469 bool VConstraint = (Constraint[0] == 'v');
60470
60471 switch (VT.SimpleTy) {
60472 default: break;
60473 // Scalar SSE types.
60474 case MVT::f16:
60475 if (VConstraint && Subtarget.hasFP16())
60476 return std::make_pair(0U, &X86::FR16XRegClass);
60477 break;
60478 case MVT::f32:
60479 case MVT::i32:
60480 if (VConstraint && Subtarget.hasVLX())
60481 return std::make_pair(0U, &X86::FR32XRegClass);
60482 return std::make_pair(0U, &X86::FR32RegClass);
60483 case MVT::f64:
60484 case MVT::i64:
60485 if (VConstraint && Subtarget.hasVLX())
60486 return std::make_pair(0U, &X86::FR64XRegClass);
60487 return std::make_pair(0U, &X86::FR64RegClass);
60488 case MVT::i128:
60489 if (Subtarget.is64Bit()) {
60490 if (VConstraint && Subtarget.hasVLX())
60491 return std::make_pair(0U, &X86::VR128XRegClass);
60492 return std::make_pair(0U, &X86::VR128RegClass);
60493 }
60494 break;
60495 // Vector types and fp128.
60496 case MVT::v8f16:
60497 if (!Subtarget.hasFP16())
60498 break;
60499 if (VConstraint)
60500 return std::make_pair(0U, &X86::VR128XRegClass);
60501 return std::make_pair(0U, &X86::VR128RegClass);
60502 case MVT::v8bf16:
60503 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60504 break;
60505 if (VConstraint)
60506 return std::make_pair(0U, &X86::VR128XRegClass);
60507 return std::make_pair(0U, &X86::VR128RegClass);
60508 case MVT::f128:
60509 case MVT::v16i8:
60510 case MVT::v8i16:
60511 case MVT::v4i32:
60512 case MVT::v2i64:
60513 case MVT::v4f32:
60514 case MVT::v2f64:
60515 if (VConstraint && Subtarget.hasVLX())
60516 return std::make_pair(0U, &X86::VR128XRegClass);
60517 return std::make_pair(0U, &X86::VR128RegClass);
60518 // AVX types.
60519 case MVT::v16f16:
60520 if (!Subtarget.hasFP16())
60521 break;
60522 if (VConstraint)
60523 return std::make_pair(0U, &X86::VR256XRegClass);
60524 return std::make_pair(0U, &X86::VR256RegClass);
60525 case MVT::v16bf16:
60526 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60527 break;
60528 if (VConstraint)
60529 return std::make_pair(0U, &X86::VR256XRegClass);
60530 return std::make_pair(0U, &X86::VR256RegClass);
60531 case MVT::v32i8:
60532 case MVT::v16i16:
60533 case MVT::v8i32:
60534 case MVT::v4i64:
60535 case MVT::v8f32:
60536 case MVT::v4f64:
60537 if (VConstraint && Subtarget.hasVLX())
60538 return std::make_pair(0U, &X86::VR256XRegClass);
60539 if (Subtarget.hasAVX())
60540 return std::make_pair(0U, &X86::VR256RegClass);
60541 break;
60542 case MVT::v32f16:
60543 if (!Subtarget.hasFP16())
60544 break;
60545 if (VConstraint)
60546 return std::make_pair(0U, &X86::VR512RegClass);
60547 return std::make_pair(0U, &X86::VR512_0_15RegClass);
60548 case MVT::v32bf16:
60549 if (!Subtarget.hasBF16())
60550 break;
60551 if (VConstraint)
60552 return std::make_pair(0U, &X86::VR512RegClass);
60553 return std::make_pair(0U, &X86::VR512_0_15RegClass);
60554 case MVT::v64i8:
60555 case MVT::v32i16:
60556 case MVT::v8f64:
60557 case MVT::v16f32:
60558 case MVT::v16i32:
60559 case MVT::v8i64:
60560 if (!Subtarget.hasAVX512()) break;
60561 if (VConstraint)
60562 return std::make_pair(0U, &X86::VR512RegClass);
60563 return std::make_pair(0U, &X86::VR512_0_15RegClass);
60564 }
60565 break;
60566 }
60567 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
60568 switch (Constraint[1]) {
60569 default:
60570 break;
60571 case 'i':
60572 case 't':
60573 case '2':
60574 return getRegForInlineAsmConstraint(TRI, "x", VT);
60575 case 'm':
60576 if (!Subtarget.hasMMX()) break;
60577 return std::make_pair(0U, &X86::VR64RegClass);
60578 case 'z':
60579 if (!Subtarget.hasSSE1()) break;
60580 switch (VT.SimpleTy) {
60581 default: break;
60582 // Scalar SSE types.
60583 case MVT::f16:
60584 if (!Subtarget.hasFP16())
60585 break;
60586 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
60587 case MVT::f32:
60588 case MVT::i32:
60589 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
60590 case MVT::f64:
60591 case MVT::i64:
60592 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
60593 case MVT::v8f16:
60594 if (!Subtarget.hasFP16())
60595 break;
60596 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
60597 case MVT::v8bf16:
60598 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60599 break;
60600 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
60601 case MVT::f128:
60602 case MVT::v16i8:
60603 case MVT::v8i16:
60604 case MVT::v4i32:
60605 case MVT::v2i64:
60606 case MVT::v4f32:
60607 case MVT::v2f64:
60608 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
60609 // AVX types.
60610 case MVT::v16f16:
60611 if (!Subtarget.hasFP16())
60612 break;
60613 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
60614 case MVT::v16bf16:
60615 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60616 break;
60617 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
60618 case MVT::v32i8:
60619 case MVT::v16i16:
60620 case MVT::v8i32:
60621 case MVT::v4i64:
60622 case MVT::v8f32:
60623 case MVT::v4f64:
60624 if (Subtarget.hasAVX())
60625 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
60626 break;
60627 case MVT::v32f16:
60628 if (!Subtarget.hasFP16())
60629 break;
60630 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
60631 case MVT::v32bf16:
60632 if (!Subtarget.hasBF16())
60633 break;
60634 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
60635 case MVT::v64i8:
60636 case MVT::v32i16:
60637 case MVT::v8f64:
60638 case MVT::v16f32:
60639 case MVT::v16i32:
60640 case MVT::v8i64:
60641 if (Subtarget.hasAVX512())
60642 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
60643 break;
60644 }
60645 break;
60646 case 'k':
60647 // This register class doesn't allocate k0 for masked vector operation.
60648 if (Subtarget.hasAVX512()) {
60649 if (VT == MVT::v1i1 || VT == MVT::i1)
60650 return std::make_pair(0U, &X86::VK1WMRegClass);
60651 if (VT == MVT::v8i1 || VT == MVT::i8)
60652 return std::make_pair(0U, &X86::VK8WMRegClass);
60653 if (VT == MVT::v16i1 || VT == MVT::i16)
60654 return std::make_pair(0U, &X86::VK16WMRegClass);
60655 }
60656 if (Subtarget.hasBWI()) {
60657 if (VT == MVT::v32i1 || VT == MVT::i32)
60658 return std::make_pair(0U, &X86::VK32WMRegClass);
60659 if (VT == MVT::v64i1 || VT == MVT::i64)
60660 return std::make_pair(0U, &X86::VK64WMRegClass);
60661 }
60662 break;
60663 }
60664 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
60665 switch (Constraint[1]) {
60666 default:
60667 break;
60668 case 'r':
60669 if (VT == MVT::i8 || VT == MVT::i1)
60670 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
60671 if (VT == MVT::i16)
60672 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
60673 if (VT == MVT::i32 || VT == MVT::f32)
60674 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
60675 if (VT != MVT::f80 && !VT.isVector())
60676 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
60677 break;
60678 case 'R':
60679 if (VT == MVT::i8 || VT == MVT::i1)
60680 return std::make_pair(0U, &X86::GR8RegClass);
60681 if (VT == MVT::i16)
60682 return std::make_pair(0U, &X86::GR16RegClass);
60683 if (VT == MVT::i32 || VT == MVT::f32)
60684 return std::make_pair(0U, &X86::GR32RegClass);
60685 if (VT != MVT::f80 && !VT.isVector())
60686 return std::make_pair(0U, &X86::GR64RegClass);
60687 break;
60688 }
60689 }
60690
60691 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
60692 return std::make_pair(0U, &X86::GR32RegClass);
60693
60694 // Use the default implementation in TargetLowering to convert the register
60695 // constraint into a member of a register class.
60696 std::pair<Register, const TargetRegisterClass*> Res;
60697 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
60698
60699 // Not found as a standard register?
60700 if (!Res.second) {
60701 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
60702 // to/from f80.
60703 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
60704 // Map st(0) -> st(7) -> ST0
60705 if (Constraint.size() == 7 && Constraint[0] == '{' &&
60706 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
60707 Constraint[3] == '(' &&
60708 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
60709 Constraint[5] == ')' && Constraint[6] == '}') {
60710 // st(7) is not allocatable and thus not a member of RFP80. Return
60711 // singleton class in cases where we have a reference to it.
60712 if (Constraint[4] == '7')
60713 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
60714 return std::make_pair(X86::FP0 + Constraint[4] - '0',
60715 &X86::RFP80RegClass);
60716 }
60717
60718 // GCC allows "st(0)" to be called just plain "st".
60719 if (StringRef("{st}").equals_insensitive(Constraint))
60720 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
60721 }
60722
60723 // flags -> EFLAGS
60724 if (StringRef("{flags}").equals_insensitive(Constraint))
60725 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
60726
60727 // dirflag -> DF
60728 // Only allow for clobber.
60729 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
60730 VT == MVT::Other)
60731 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
60732
60733 // fpsr -> FPSW
60734 // Only allow for clobber.
60735 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
60736 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
60737
60738 return Res;
60739 }
60740
60741 // Make sure it isn't a register that requires 64-bit mode.
60742 if (!Subtarget.is64Bit() &&
60743 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
60744 TRI->getEncodingValue(Res.first) >= 8) {
60745 // Register requires REX prefix, but we're in 32-bit mode.
60746 return std::make_pair(0, nullptr);
60747 }
60748
60749 // Make sure it isn't a register that requires AVX512.
60750 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
60751 TRI->getEncodingValue(Res.first) & 0x10) {
60752 // Register requires EVEX prefix.
60753 return std::make_pair(0, nullptr);
60754 }
60755
60756 // Otherwise, check to see if this is a register class of the wrong value
60757 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
60758 // turn into {ax},{dx}.
60759 // MVT::Other is used to specify clobber names.
60760 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
60761 return Res; // Correct type already, nothing to do.
60762
60763 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
60764 // return "eax". This should even work for things like getting 64bit integer
60765 // registers when given an f64 type.
60766 const TargetRegisterClass *Class = Res.second;
60767 // The generic code will match the first register class that contains the
60768 // given register. Thus, based on the ordering of the tablegened file,
60769 // the "plain" GR classes might not come first.
60770 // Therefore, use a helper method.
60771 if (isGRClass(*Class)) {
60772 unsigned Size = VT.getSizeInBits();
60773 if (Size == 1) Size = 8;
60774 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
60775 return std::make_pair(0, nullptr);
60776 Register DestReg = getX86SubSuperRegister(Res.first, Size);
60777 if (DestReg.isValid()) {
60778 bool is64Bit = Subtarget.is64Bit();
60779 const TargetRegisterClass *RC =
60780 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
60781 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
60782 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
60783 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
60784 if (Size == 64 && !is64Bit) {
60785 // Model GCC's behavior here and select a fixed pair of 32-bit
60786 // registers.
60787 switch (DestReg) {
60788 case X86::RAX:
60789 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
60790 case X86::RDX:
60791 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
60792 case X86::RCX:
60793 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
60794 case X86::RBX:
60795 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
60796 case X86::RSI:
60797 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
60798 case X86::RDI:
60799 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
60800 case X86::RBP:
60801 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
60802 default:
60803 return std::make_pair(0, nullptr);
60804 }
60805 }
60806 if (RC && RC->contains(DestReg))
60807 return std::make_pair(DestReg, RC);
60808 return Res;
60809 }
60810 // No register found/type mismatch.
60811 return std::make_pair(0, nullptr);
60812 } else if (isFRClass(*Class)) {
60813 // Handle references to XMM physical registers that got mapped into the
60814 // wrong class. This can happen with constraints like {xmm0} where the
60815 // target independent register mapper will just pick the first match it can
60816 // find, ignoring the required type.
60817
60818 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
60819 if (VT == MVT::f16)
60820 Res.second = &X86::FR16XRegClass;
60821 else if (VT == MVT::f32 || VT == MVT::i32)
60822 Res.second = &X86::FR32XRegClass;
60823 else if (VT == MVT::f64 || VT == MVT::i64)
60824 Res.second = &X86::FR64XRegClass;
60825 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
60826 Res.second = &X86::VR128XRegClass;
60827 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
60828 Res.second = &X86::VR256XRegClass;
60829 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
60830 Res.second = &X86::VR512RegClass;
60831 else {
60832 // Type mismatch and not a clobber: Return an error;
60833 Res.first = 0;
60834 Res.second = nullptr;
60835 }
60836 } else if (isVKClass(*Class)) {
60837 if (VT == MVT::v1i1 || VT == MVT::i1)
60838 Res.second = &X86::VK1RegClass;
60839 else if (VT == MVT::v8i1 || VT == MVT::i8)
60840 Res.second = &X86::VK8RegClass;
60841 else if (VT == MVT::v16i1 || VT == MVT::i16)
60842 Res.second = &X86::VK16RegClass;
60843 else if (VT == MVT::v32i1 || VT == MVT::i32)
60844 Res.second = &X86::VK32RegClass;
60845 else if (VT == MVT::v64i1 || VT == MVT::i64)
60846 Res.second = &X86::VK64RegClass;
60847 else {
60848 // Type mismatch and not a clobber: Return an error;
60849 Res.first = 0;
60850 Res.second = nullptr;
60851 }
60852 }
60853
60854 return Res;
60855}
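// Illustrative results of the mapping above (assuming a 64-bit subtarget):
//   "r" with i32      -> some GR32 register class
//   "{ax}" with i32   -> EAX (the "{ax},i32 -> {eax}" resizing noted above)
//   "{xmm0}" with f32 -> XMM0 in a scalar FP register class
// Constraints that would need REX or EVEX encodings on a subtarget without
// them are rejected by returning {0, nullptr}.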
60856
60857 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
60858 // Integer division on x86 is expensive. However, when aggressively optimizing
60859 // for code size, we prefer to use a div instruction, as it is usually smaller
60860 // than the alternative sequence.
60861 // The exception to this is vector division. Since x86 doesn't have vector
60862 // integer division, leaving the division as-is is a loss even in terms of
60863 // size, because it will have to be scalarized, while the alternative code
60864 // sequence can be performed in vector form.
60865 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
60866 return OptSize && !VT.isVector();
60867}
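// Illustrative consequence: under minsize, a scalar division such as X / 10
// is left as a single div instruction, while at other optimization levels the
// DAG combiner remains free to expand it into the larger but faster
// multiply-by-magic-constant sequence. Vector divisions are always reported
// as expensive since they would have to be scalarized anyway.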
60868
60869void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
60870 if (!Subtarget.is64Bit())
60871 return;
60872
60873 // Update IsSplitCSR in X86MachineFunctionInfo.
60874 X86MachineFunctionInfo *AFI =
60875 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
60876 AFI->setIsSplitCSR(true);
60877}
60878
60879void X86TargetLowering::insertCopiesSplitCSR(
60880 MachineBasicBlock *Entry,
60881 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
60882 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
60883 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
60884 if (!IStart)
60885 return;
60886
60887 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
60888 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
60889 MachineBasicBlock::iterator MBBI = Entry->begin();
60890 for (const MCPhysReg *I = IStart; *I; ++I) {
60891 const TargetRegisterClass *RC = nullptr;
60892 if (X86::GR64RegClass.contains(*I))
60893 RC = &X86::GR64RegClass;
60894 else
60895 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
60896
60897 Register NewVR = MRI->createVirtualRegister(RC);
60898 // Create copy from CSR to a virtual register.
60899 // FIXME: this currently does not emit CFI pseudo-instructions, it works
60900 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
60901 // nounwind. If we want to generalize this later, we may need to emit
60902 // CFI pseudo-instructions.
60903 assert(
60904 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
60905 "Function should be nounwind in insertCopiesSplitCSR!");
60906 Entry->addLiveIn(*I);
60907 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
60908 .addReg(*I);
60909
60910 // Insert the copy-back instructions right before the terminator.
60911 for (auto *Exit : Exits)
60912 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
60913 TII->get(TargetOpcode::COPY), *I)
60914 .addReg(NewVR);
60915 }
60916}
60917
60918 bool X86TargetLowering::supportSwiftError() const {
60919 return Subtarget.is64Bit();
60920}
60921
60922 MachineInstr *
60923 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
60924 MachineBasicBlock::iterator &MBBI,
60925 const TargetInstrInfo *TII) const {
60926 assert(MBBI->isCall() && MBBI->getCFIType() &&
60927 "Invalid call instruction for a KCFI check");
60928
60929 MachineFunction &MF = *MBB.getParent();
60930 // If the call target is a memory operand, unfold it and use R11 for the
60931 // call, so KCFI_CHECK won't have to recompute the address.
60932 switch (MBBI->getOpcode()) {
60933 case X86::CALL64m:
60934 case X86::CALL64m_NT:
60935 case X86::TAILJMPm64:
60936 case X86::TAILJMPm64_REX: {
60937 MachineBasicBlock::instr_iterator OrigCall = MBBI.getInstrIterator();
60938 SmallVector<MachineInstr *, 2> NewMIs;
60939 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
60940 /*UnfoldStore=*/false, NewMIs))
60941 report_fatal_error("Failed to unfold memory operand for a KCFI check");
60942 for (auto *NewMI : NewMIs)
60943 MBBI = MBB.insert(OrigCall, NewMI);
60944 assert(MBBI->isCall() &&
60945 "Unexpected instruction after memory operand unfolding");
60946 if (OrigCall->shouldUpdateAdditionalCallInfo())
60947 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
60948 MBBI->setCFIType(MF, OrigCall->getCFIType());
60949 OrigCall->eraseFromParent();
60950 break;
60951 }
60952 default:
60953 break;
60954 }
60955
60956 MachineOperand &Target = MBBI->getOperand(0);
60957 Register TargetReg;
60958 switch (MBBI->getOpcode()) {
60959 case X86::CALL64r:
60960 case X86::CALL64r_NT:
60961 case X86::TAILJMPr64:
60962 case X86::TAILJMPr64_REX:
60963 assert(Target.isReg() && "Unexpected target operand for an indirect call");
60964 Target.setIsRenamable(false);
60965 TargetReg = Target.getReg();
60966 break;
60967 case X86::CALL64pcrel32:
60968 case X86::TAILJMPd64:
60969 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
60970 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
60971 // 64-bit indirect thunk calls.
60972 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
60973 "Unexpected register for an indirect thunk call");
60974 TargetReg = X86::R11;
60975 break;
60976 default:
60977 llvm_unreachable("Unexpected CFI call opcode");
60978 break;
60979 }
60980
60981 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
60982 .addReg(TargetReg)
60983 .addImm(MBBI->getCFIType())
60984 .getInstr();
60985}
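// Rough sketch of the resulting code (assuming the usual KCFI_CHECK expansion
// in the asm printer): an indirect call carrying a !kcfi_type id, e.g.
//   callq *%r11
// is preceded by a check that loads the type hash placed in front of the
// callee's entry point and traps if it does not match the expected id.
// Memory-operand calls are first unfolded through R11 above precisely so the
// check and the call test the same register.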
60986
60987/// Returns true if stack probing through a function call is requested.
60988 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
60989 return !getStackProbeSymbolName(MF).empty();
60990}
60991
60992/// Returns true if stack probing through inline assembly is requested.
60993 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
60994
60995 // No inline stack probe for Windows, they have their own mechanism.
60996 if (Subtarget.isOSWindows() ||
60997 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
60998 return false;
60999
61000 // If the function specifically requests inline stack probes, emit them.
61001 if (MF.getFunction().hasFnAttribute("probe-stack"))
61002 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
61003 "inline-asm";
61004
61005 return false;
61006}
61007
61008/// Returns the name of the symbol used to emit stack probes or the empty
61009/// string if not applicable.
61010 StringRef
61011 X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
61012 // Inline Stack probes disable stack probe call
61013 if (hasInlineStackProbe(MF))
61014 return "";
61015
61016 // If the function specifically requests stack probes, emit them.
61017 if (MF.getFunction().hasFnAttribute("probe-stack"))
61018 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
61019
61020 // Generally, if we aren't on Windows, the platform ABI does not include
61021 // support for stack probes, so don't emit them.
61022 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
61023 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
61024 return "";
61025
61026 // We need a stack probe to conform to the Windows ABI. Choose the right
61027 // symbol.
61028 if (Subtarget.is64Bit())
61029 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
61030 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
61031}
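// Illustrative results: a 64-bit MinGW target gets "___chkstk_ms", a 64-bit
// MSVC target gets "__chkstk", and 32-bit targets get "_alloca" / "_chkstk"
// respectively. A function carrying "probe-stack"="inline-asm" gets no symbol
// at all because its probes are emitted inline.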
61032
61033unsigned
61034 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
61035 // The default stack probe size is 4096 if the function has no stackprobesize
61036 // attribute.
61037 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
61038 4096);
61039}
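// For illustration, a function declared in IR with
//   attributes #0 = { "stack-probe-size"="8192" }
// uses a probe interval of 8192 bytes instead of the default 4096.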
61040
61041 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
61042 if (ML && ML->isInnermost() &&
61043 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
61044 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
61045 return TargetLowering::getPrefLoopAlignment(ML);
61046 }
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
static const LLT S1
static const LLT F64
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
Definition: BitTracker.cpp:73
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Looks at all the uses of the given value Returns the Liveness deduced from the uses of this value Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses If the result is MaybeLiveUses might be modified but its content should be ignored(since it might not be complete). DeadArgumentEliminationPass
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:557
Live Register Matrix
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static bool isUndef(const MachineInstr &MI)
unsigned const TargetRegisterInfo * TRI
#define R2(n)
#define T1
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr Register SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static StringRef substr(StringRef Str, uint64_t Len)
This file implements the SmallBitVector class.
This file defines the SmallSet class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:245
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
@ ShiftBit
@ UndefBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static bool isX86CCSigned(unsigned X86CC)
Return true if the condition is an signed comparison operation.
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128 bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256 bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
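PACKSS packs each wide element to half width with signed saturation, so sign-extending the low bits in-register first makes the saturation a no-op and turns the pack into a plain truncation. A scalar model of one PACKSSWB-style lane (an illustration of the semantics, not the DAG code):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of one PACKSS lane: saturate a signed 16-bit value to 8 bits.
static int8_t packssLane(int16_t V) {
  return int8_t(std::clamp<int16_t>(V, -128, 127));
}

int main() {
  // After sign-extending the low byte in-register, the 16-bit value already
  // lies in [-128, 127], so the saturating pack just recovers that byte.
  int16_t Wide = int16_t(int8_t(0xAB));
  assert(packssLane(Wide) == int8_t(0xAB));
  return 0;
}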
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue V)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::P...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips the sign of an FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a broadcast of a single, truncated integer element coming from a scalar_to_vector/buil...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as a zero or any extension.
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
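Intrinsics such as RDTSC deliver a 64-bit result split across EDX (high half) and EAX (low half); rebuilding the value is a shift and an OR. A scalar sketch of that recombination with made-up register values (not the DAG lowering itself):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t EAX = 0x89ABCDEFu; // low 32 bits of the result
  uint32_t EDX = 0x01234567u; // high 32 bits of the result
  uint64_t Result = (uint64_t(EDX) << 32) | EAX;
  assert(Result == 0x0123456789ABCDEFull);
  return 0;
}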
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to map a 128-bit or larger integer comparison to vector instructions before type legalization spl...
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
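A common way for x86 to compute this cheaply is PSADBW against an all-zero vector, which adds up the byte values within each 64-bit group. A scalar model of that per-element byte sum (an illustration of the semantics rather than the exact lowering chosen here):

#include <cassert>
#include <cstdint>

// Sum the 8 bytes of one 64-bit element, as PSADBW against zero would.
static uint64_t horizontalByteSum(uint64_t Elt) {
  uint64_t Sum = 0;
  for (int I = 0; I < 8; ++I)
    Sum += (Elt >> (8 * I)) & 0xFF;
  return Sum;
}

int main() {
  assert(horizontalByteSum(0x0102030405060708ull) == 36); // 1+2+...+8
  return 0;
}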
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a 256/512-bit VSETCC vector into two new 128/256-bit ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, SDValue And1_L, SDValue And1_R, const SDLoc &DL, SelectionDAG &DAG)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into two half-sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, unsigned Reg)
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
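For the 4-lane shuffles that take an 8-bit immediate (PSHUFD/SHUFPS style), each output lane selects its source with two bits of the immediate, lane 0 in the lowest bits. A sketch of that encoding, assuming every mask entry is already in [0, 3] (the helper below is hypothetical and for illustration only):

#include <array>
#include <cassert>

// Sketch of the 2-bits-per-lane immediate encoding; assumes Mask[i] is in [0, 3].
static unsigned shuffleImm8ForMask(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int I = 0; I < 4; ++I)
    Imm |= unsigned(Mask[I]) << (2 * I);
  return Imm;
}

int main() {
  assert(shuffleImm8ForMask({0, 1, 2, 3}) == 0xE4); // identity order
  assert(shuffleImm8ForMask({3, 2, 1, 0}) == 0x1B); // reversed order
  return 0;
}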
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, bool AllowTruncate)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG)
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isLegalConversion(MVT VT, bool IsSigned, const X86Subtarget &Subtarget)
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
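In scalar terms, ((uint32_t)X >> 31) ^ 1 is nonzero exactly when X is non-negative, i.e. when X > -1, so the shift/truncate/xor chain is really a sign-bit test; a compare form of that test is assumed here, since the description above is cut off. A minimal check of the equivalence:

#include <cassert>
#include <cstdint>

// The xor'd sign-bit extract is 1 exactly for non-negative X, the same
// predicate as X > -1.
static bool sameAnswer(int32_t X) {
  bool SignTest = ((uint32_t(X) >> 31) ^ 1u) != 0;
  return SignTest == (X > -1);
}

int main() {
  assert(sameAnswer(0) && sameAnswer(-1) && sameAnswer(INT32_MIN) && sameAnswer(42));
  return 0;
}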
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, bool HasVariableMask, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Handles the case where a zero/all-bits result is bitwise-ANDed with a low-bits mask.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector with a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
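The fold relies on De Morgan's law: (or Y, ~Z) is the same value as ~(and ~Y, Z), so ANDing X with either form gives the same result. A minimal scalar sketch (plain C++, not the DAG combine itself) checking the identity:

#include <cassert>
#include <cstdint>

// Scalar model of the fold: (X & (Y | ~Z)) == (X & ~(~Y & Z)).
static bool foldHolds(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & (Y | ~Z)) == (X & ~(~Y & Z));
}

int main() {
  assert(foldHolds(0xDEADBEEFu, 0x12345678u, 0x0F0F0F0Fu));
  assert(foldHolds(0u, ~0u, 0x55555555u));
  return 0;
}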
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
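In scalar terms the rewrite is sound because a no-signed-wrap (or no-unsigned-wrap) add in the narrow type yields the same value whether the extension happens before or after the addition. A small C++ illustration, assuming the narrow add does not overflow (it is not the DAG combine itself):

#include <cassert>
#include <cstdint>

int main() {
  int8_t X = 100;
  const int8_t C = 20;                              // 100 + 20 fits in int8_t, so "add_nsw" holds
  int32_t AddThenExtend = int32_t(int8_t(X + C));   // sext(add_nsw(x, C))
  int32_t ExtendThenAdd = int32_t(X) + int32_t(C);  // add(sext(x), C_sext)
  assert(AddThenExtend == ExtendThenAdd);
  return 0;
}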
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget)
Try to lower a vector shuffle as a bit shift (shifts in zeros).
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If a vector select has an operand that is -1 or 0, try to simplify the select to a bitwise logic oper...
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
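Concretely, fabs clears the IEEE-754 sign bit (an AND with a mask whose sign bit is zero) while fneg toggles it (an XOR with a mask that has only the sign bit set), which is why the two lowerings can share nearly all of their code. A scalar sketch of the two bit patterns, assuming binary32:

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) { uint32_t B; std::memcpy(&B, &F, sizeof(B)); return B; }
static float fromBits(uint32_t B) { float F; std::memcpy(&F, &B, sizeof(F)); return F; }

int main() {
  const uint32_t SignBit = 0x80000000u;
  float V = -3.5f;
  float Abs = fromBits(bitsOf(V) & ~SignBit); // FABS: AND with the inverted sign mask
  float Neg = fromBits(bitsOf(V) ^ SignBit);  // FNEG: XOR with the sign mask
  assert(Abs == 3.5f && Neg == 3.5f);
  return 0;
}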
ShrinkMode
Different mul shrinking modes.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value, potentially returning a new CC and replacing uses of chain values.
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, SDValue Root, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size, falls within the specified sequential range (Low, Low+Size], or is undef or zero.
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shuffled.
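A simplified standalone sketch of that widening test (it ignores the undef/zero sentinels the real helper also accepts, and assumes an even-sized mask): each pair of mask elements must address one whole element of the twice-as-wide type.

  #include <cassert>
  #include <vector>

  static bool canWiden(const std::vector<int> &Mask, std::vector<int> &Widened) {
    Widened.clear();
    for (size_t I = 0; I + 1 < Mask.size(); I += 2) {
      // A pair {2k, 2k+1} maps to wide element k; anything else blocks widening.
      if (Mask[I] % 2 != 0 || Mask[I + 1] != Mask[I] + 1)
        return false;
      Widened.push_back(Mask[I] / 2);
    }
    return true;
  }

  int main() {
    std::vector<int> W;
    assert(canWiden({0, 1, 4, 5, 2, 3, 6, 7}, W) && W == std::vector<int>({0, 2, 1, 3}));
    assert(!canWiden({0, 2, 1, 3}, W)); // pairs straddle wide-element boundaries
  }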
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an unary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
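The scalar identity behind this fold, checked over a small range in plain C++ (not the DAG combine itself): flipping the low bit of a setcc result is the same as evaluating the inverted condition.

  #include <cassert>

  int main() {
    for (int A = -4; A <= 4; ++A)
      for (int B = -4; B <= 4; ++B)
        assert(((A < B) ^ 1) == (A >= B)); // xor(setcc(lt), 1) == setcc(ge)
  }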
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to lower a vector shuffle as a byte rotation.
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first, then perform the math as a scalar op.
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
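A standalone sketch of that mask check (hypothetical helper, no undef handling): lane I must read lane I of one of the two inputs, and consecutive lanes must come from different inputs; Op0Even reports whether the even lanes come from the first input.

  #include <cassert>
  #include <vector>

  static bool isAlternating(const std::vector<int> &Mask, bool &Op0Even) {
    int NumElts = (int)Mask.size();
    for (int I = 0; I != NumElts; ++I) {
      if (Mask[I] != I && Mask[I] != I + NumElts)
        return false; // must read its own lane from V1 or V2
      if (I > 0 && (Mask[I] < NumElts) == (Mask[I - 1] < NumElts))
        return false; // must alternate between the two inputs
    }
    Op0Even = Mask[0] < NumElts;
    return true;
  }

  int main() {
    bool Op0Even;
    assert(isAlternating({0, 5, 2, 7}, Op0Even) && Op0Even);  // V1 on even lanes
    assert(isAlternating({4, 1, 6, 3}, Op0Even) && !Op0Even); // V2 on even lanes
    assert(!isAlternating({0, 1, 6, 7}, Op0Even));
  }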
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp).
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expansion.
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of the result.
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
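For orientation, a sketch of what that mask looks like for a single 128-bit lane and a single pack stage (little-endian element numbering; packMask below is a hypothetical model, not the LLVM helper): PACKSS/PACKUS writes the truncated elements of the first input followed by those of the second, which as a narrow-element shuffle of concat(V1, V2) selects every even element.

  #include <cassert>
  #include <vector>

  static std::vector<int> packMask(unsigned NumDstElts) {
    std::vector<int> Mask;
    unsigned SrcEltsPerInput = NumDstElts / 2;
    for (unsigned Input = 0; Input != 2; ++Input)
      for (unsigned I = 0; I != SrcEltsPerInput; ++I)
        Mask.push_back(Input * NumDstElts + 2 * I); // low half of each wide element
    return Mask;
  }

  int main() {
    // v8i16 + v8i16 -> v16i8 pack, expressed as a byte shuffle of the 32 input bytes.
    std::vector<int> Expected = {0, 2, 4, 6, 8, 10, 12, 14,
                                 16, 18, 20, 22, 24, 26, 28, 30};
    assert(packMask(16) == Expected);
  }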
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size, is the undef sentinel value or is equal to CmpVal.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
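The reason the AND mask makes PACKUS usable as a truncation (scalar sketch, with a hypothetical packus helper modeling one PACKUSWB element): unsigned saturation to [0, 255] is a no-op once the high bits have been masked off, so the pack produces exactly the truncated value.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  static uint8_t packus(uint16_t V) {
    return (uint8_t)std::min<uint16_t>(V, 255); // PACKUSWB: unsigned saturate i16 -> i8
  }

  int main() {
    for (uint32_t V = 0; V <= 0xFFFF; ++V)
      assert(packus((uint16_t)V & 0x00FF) == (uint8_t)V); // mask first => saturation == truncation
  }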
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB)
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using a natively supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input is used.
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size, falls within the specified sequential range (Low, Low + Size], or is undef.
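A standalone sketch of this predicate, using -1 for the undef sentinel as LLVM's shuffle code does (hypothetical helper, not the in-tree one):

  #include <cassert>
  #include <vector>

  static bool isSeqOrUndefInRange(const std::vector<int> &Mask, unsigned Pos,
                                  unsigned Size, int Low, int Step = 1) {
    for (unsigned I = Pos; I != Pos + Size; ++I, Low += Step)
      if (Mask[I] != -1 && Mask[I] != Low) // -1 models SM_SentinelUndef
        return false;
    return true;
  }

  int main() {
    // Elements 2..5 must be the sequence 4,5,6,7; undef slots are accepted.
    assert(isSeqOrUndefInRange({0, 1, 4, -1, 6, 7, 0, 0}, 2, 4, 4));
    assert(!isSeqOrUndefInRange({0, 1, 4, 6, 6, 7, 0, 0}, 2, 4, 4));
  }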
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from their inputs.
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG)
Fold "masked merge" expressions like (m & x) | (~m & y) into the equivalent (((x ^ y) & m) ^ y) pattern.
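The identity can be checked exhaustively at byte width (plain C++, independent of the DAG combine); the folded form trades a NOT, two ANDs, and an OR for two XORs and one AND, and also drops the second use of the mask.

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned M = 0; M < 256; ++M)
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned Y = 0; Y < 256; ++Y) {
          uint8_t Merge  = (uint8_t)((M & X) | (~M & Y));
          uint8_t Folded = (uint8_t)(((X ^ Y) & M) ^ Y);
          assert(Merge == Folded);
        }
  }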
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
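This works because ANDNP/PANDN compute (~X) & Y directly; a one-line scalar sketch (hypothetical andnp helper):

  #include <cassert>
  #include <cstdint>

  static uint32_t andnp(uint32_t X, uint32_t Y) { return ~X & Y; } // x86 ANDN semantics

  int main() {
    uint32_t X = 0xF0F0F0F0u, Y = 0x12345678u;
    assert(((X ^ 0xFFFFFFFFu) & Y) == andnp(X, Y)); // (and (xor X, -1), Y) == andnp(X, Y)
  }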
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code? The current x86 ISA includes the following FP cmov instructions: fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true, return the operand index of that true element.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
auto IsFreeTruncation
static const unsigned FramePtr
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5463
static APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition: APFloat.cpp:5488
void clearSign()
Definition: APFloat.h:1300
opStatus next(bool nextDown)
Definition: APFloat.h:1256
void changeSign()
Definition: APFloat.h:1299
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1081
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:493
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1007
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1492
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:910
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:206
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
APInt abs() const
Get the absolute value.
Definition: APInt.h:1773
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are both set.
Definition: APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1079
int32_t exactLogBase2() const
Definition: APInt.h:1761
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1607
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1577
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:624
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1511
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1434
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1594
void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:370
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1417
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:471
bool isMask(unsigned numBits) const
Definition: APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1150
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:959
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1389
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:455
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1635
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:399
APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:947
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:652
An instruction that atomically reads a memory location, combines it with another value, and then stores the result back. Returns the old value.
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:764
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:768
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:396
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:893
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
Value * getCalledOperand() const
Definition: InstrTypes.h:1334
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
Definition: CmpPredicate.h:22
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matching the ArrayRef passed in.
Definition: Constants.cpp:3007
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1597
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value.
Definition: Constants.cpp:403
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if possible, or null if not.
Definition: Constants.cpp:435
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Tagged union holding either a T or a Error.
Definition: Error.h:481
This is a fast-path instruction selection class that generates poor code and doesn't support illegal types or non-trivial lowering, but runs quickly.
Definition: FastISel.h:66
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
FunctionLoweringInfo - This contains information that is global to a function that is used when lowering a region of the function.
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:128
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:719
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:778
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:716
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition: Function.h:277
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:917
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1048
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:365
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1', drop it.
Definition: GlobalValue.h:568
bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:405
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:272
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2705
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:121
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other instructions.
Definition: Instruction.h:169
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
Class to represent integer types.
Definition: DerivedTypes.h:42
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:74
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:176
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:661
MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:241
MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:246
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created and uniqued by the MCContext class.
Definition: MCSymbol.h:41
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:307
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor blocks which refer to FromMBB to refer to this.
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
unsigned succ_size() const
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminator.
bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:71
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc, or post-dec.
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:354
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:310
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Definition: DerivedTypes.h:686
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:121
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the node is an UNDEF value.
iterator_range< user_iterator > users()
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:371
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:953
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value cast to the target's desired shift amount type.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:751
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:983
SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:499
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:802
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
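A minimal usage sketch, assuming DAG, DL, LHS and RHS come from the surrounding lowering code:
  // Build an i1-typed equality compare; real lowering code would usually take
  // the result type from getSetCCResultType instead of hard-coding MVT::i1.
  SDValue Cmp = DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETEQ);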
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
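A minimal sketch, assuming DAG, DL, Val and VT exist in the caller:
  // Emits (XOR Val, AllOnes) of type VT.
  SDValue NotVal = DAG.getNOT(DL, Val, VT);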
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:458
SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:761
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:857
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
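A minimal sketch, assuming DAG and a v2i64-typed value V:
  // Reinterpret the 128 bits of V as four i32 lanes; only the type changes.
  SDValue AsV4I32 = DAG.getBitcast(MVT::v4i32, V);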
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:828
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:505
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
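A minimal sketch, assuming DAG and DL from the surrounding lowering code:
  // A scalar zero and a splat of all-ones lanes (see getAllOnesConstant above).
  SDValue Zero    = DAG.getConstant(0, DL, MVT::i32);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, MVT::v4i32);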
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
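A minimal sketch, assuming Chain, DL, Val and Ptr are provided by the caller:
  // Store Val through Ptr with a known 4-byte alignment and no extra pointer
  // info; the returned value is the new chain.
  SDValue NewChain = DAG.getStore(Chain, DL, Val, Ptr, MachinePointerInfo(), Align(4));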
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts a new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
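A minimal sketch, assuming DAG, DL, VT and operands A and B:
  // Create (or reuse, via CSE) an ADD node, then freeze its result.
  SDValue Sum    = DAG.getNode(ISD::ADD, DL, VT, A, B);
  SDValue Frozen = DAG.getFreeze(Sum);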
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:701
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:797
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:874
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
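A minimal sketch, assuming DAG and an SDValue Op (KnownBits comes from llvm/Support/KnownBits.h):
  // Ask the DAG which bits of Op are provably zero or one.
  KnownBits Known = DAG.computeKnownBits(Op);
  unsigned MinLeadingZeros = Known.countMinLeadingZeros();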
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
std::optional< uint64_t > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
Definition: SelectionDAG.h:907
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:937
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
void reserve(size_type NumEntries)
Definition: SmallPtrSet.h:112
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
iterator erase(const_iterator CI)
Definition: SmallVector.h:737
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:578
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:286
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
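A minimal sketch of typical in-tree usage:
  // Shuffle masks are commonly accumulated in a stack-allocated SmallVector.
  SmallVector<int, 8> Mask;
  for (int I = 0; I != 8; ++I)
    Mask.push_back(I ^ 1); // swap adjacent lanes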
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:571
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
size_t size_type
Definition: StringRef.h:57
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
static constexpr size_t npos
Definition: StringRef.h:53
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:176
size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C, or npos if not found.
Definition: StringRef.cpp:253
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
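A minimal usage sketch, assuming a StringRef Name; the strings and values are illustrative:
  // Map a register-name string to an arbitrary code.
  unsigned Kind = StringSwitch<unsigned>(Name)
                      .Case("eax", 0)
                      .Case("xmm0", 1)
                      .Default(~0u);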
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:406
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
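A minimal sketch of how a target constructor typically calls this; the opcodes and types are illustrative, not taken from this file:
  // Let legalization expand 64-bit signed division and route v4i32
  // multiplies to the target's custom lowering hook.
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::MUL, MVT::v4i32, Custom);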
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
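A minimal sketch of how a target constructor typically calls this; the extension kinds and types are illustrative:
  // Sign-extending v8i8->v8i16 loads must be expanded; any-extending
  // i16->i32 loads are directly legal.
  setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::i32, MVT::i16, Legal);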
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:80
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:701
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition: Triple.h:758
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:588
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
static IntegerType * getInt1Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:411
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
uint64_t getArrayNumElements() const
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:64
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
use_iterator use_begin()
Definition: Value.h:360
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1094
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const
bool hasBasePointer(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:236
bool hasAnyFMA() const
Definition: X86Subtarget.h:203
bool isOSWindows() const
Definition: X86Subtarget.h:329
bool isTargetMachO() const
Definition: X86Subtarget.h:293
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:221
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool hasBitScanPassThrough() const
Definition: X86Subtarget.h:269
bool isPICStyleGOT() const
Definition: X86Subtarget.h:337
bool hasSSE42() const
Definition: X86Subtarget.h:198
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:118
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:281
bool canUseCMOV() const
Definition: X86Subtarget.h:192
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:340
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:305
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:185
bool isTargetDarwin() const
Definition: X86Subtarget.h:285
bool isTargetWin64() const
Definition: X86Subtarget.h:333
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:178
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:283
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:122
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:346
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:232
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool isTargetELF() const
Definition: X86Subtarget.h:291
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:209
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:186
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasInt256() const
Definition: X86Subtarget.h:202
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:338
bool isTargetCygMing() const
Definition: X86Subtarget.h:325
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:289
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:132
bool hasAVX() const
Definition: X86Subtarget.h:199
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:317
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:321
bool isTargetNaCl64() const
Definition: X86Subtarget.h:301
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:124
bool useBWIRegs() const
Definition: X86Subtarget.h:262
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:200
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y -> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition: APInt.cpp:2982
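A minimal sketch of the widening direction of ScaleBitMask; the 4-bit input and the expected 8-bit result are illustrative assumptions, not taken from this file:

  #include "llvm/ADT/APInt.h"
  using namespace llvm;
  // Splat each bit of a 4-bit mask into two neighbouring bits of an 8-bit
  // mask; 0b1010 widens to 0b11001100.
  static APInt widenExampleMask() {
    return APIntOps::ScaleBitMask(APInt(4, 0b1010), /*NewBitWidth=*/8);
  }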
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1197
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:491
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1360
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:153
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:512
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1226
@ ConstantFP
Definition: ISDOpcodes.h:77
@ STRICT_FATAN2
Definition: ISDOpcodes.h:428
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ STRICT_FCEIL
Definition: ISDOpcodes.h:441
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:130
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1073
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ STRICT_FTANH
Definition: ISDOpcodes.h:431
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:999
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:157
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:451
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1299
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1304
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:871
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:492
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ STRICT_FLOG2
Definition: ISDOpcodes.h:436
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1270
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1494
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1173
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:141
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ STRICT_FASIN
Definition: ISDOpcodes.h:425
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:685
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:465
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ STRICT_FATAN
Definition: ISDOpcodes.h:427
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:788
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1391
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:642
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1222
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:445
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:931
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:967
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:966
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:450
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:439
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:907
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:440
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ STRICT_FSINH
Definition: ISDOpcodes.h:429
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:120
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1286
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ ConstantPool
Definition: ISDOpcodes.h:82
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:860
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ STRICT_FROUND
Definition: ISDOpcodes.h:443
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:464
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1372
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:975
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:442
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:444
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:458
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:457
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1078
@ STRICT_FCOSH
Definition: ISDOpcodes.h:430
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:976
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:680
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:435
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition: ISDOpcodes.h:223
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_FEXP2
Definition: ISDOpcodes.h:433
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ ExternalSymbol
Definition: ISDOpcodes.h:83
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition: ISDOpcodes.h:669
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:882
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:438
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:906
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:147
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1217
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1392
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:437
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1083
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:692
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1276
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ STRICT_FACOS
Definition: ISDOpcodes.h:426
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1686
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1681
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1498
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1668
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
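A minimal sketch of how these two condition-code helpers relate; the concrete predicates are illustrative:

  using namespace llvm;
  static void setCCExample() {
    // Inverting SETLT over i32 gives SETGE; swapping its operands gives SETGT.
    ISD::CondCode Inv  = ISD::getSetCCInverse(ISD::SETLT, MVT::i32);
    ISD::CondCode Swap = ISD::getSetCCSwappedOperands(ISD::SETLT);
    (void)Inv; (void)Swap;
  }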
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1643
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
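A minimal sketch of the predicate hook; Op is an assumed SDValue and is not defined here:

  // Assuming Op is an SDValue for a BUILD_VECTOR/SPLAT_VECTOR of constants:
  // check that every defined element is non-zero, tolerating undef lanes.
  bool AllNonZero = ISD::matchUnaryPredicate(
      Op, [](ConstantSDNode *C) { return !C->isZero(); },
      /*AllowUndefs=*/true);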
bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1610
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1590
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1649
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:732
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:664
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
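Taken together, the matchers above compose into predicates over IR values. A minimal sketch of that composition; the helper name matchAndNot is hypothetical:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;
  // Recognise "A & ~B" with the operands in either order; on success A and B
  // are bound to the matched values.
  static bool matchAndNot(Value *V, Value *&A, Value *&B) {
    return match(V, m_c_And(m_Value(A), m_Not(m_Value(B))));
  }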
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
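A minimal sketch of the libcall lookups above; the concrete types are illustrative:

  // Select the runtime-library call for an f128 -> i64 conversion;
  // UNKNOWN_LIBCALL would be returned if no such call existed.
  RTLIB::Libcall LC = RTLIB::getFPTOSINT(MVT::f128, MVT::i64);
  assert(LC != RTLIB::UNKNOWN_LIBCALL);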
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:214
@ FS
Definition: X86.h:211
@ PTR64
Definition: X86.h:215
@ PTR32_SPTR
Definition: X86.h:213
@ GS
Definition: X86.h:210
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:411
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:391
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:488
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:450
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:432
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:456
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:438
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:476
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:403
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:363
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:472
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:460
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:425
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:480
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:444
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:419
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:387
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask, and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true of the given offset can be fit into displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:47
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:121
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
@ Length
Definition: DWP.cpp:480
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:139
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1565
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:360
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg)
Replace the address used in the instruction with the direct memory reference.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
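A minimal sketch of decoding a 4-element BLENDI immediate; the concrete immediate and the expected mask are illustrative assumptions:

  // Bit i of Imm selects element i of the second source, so Imm = 0b0101
  // yields the shuffle mask <4, 1, 6, 3>.
  SmallVector<int, 4> Mask;
  DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);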
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
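A minimal sketch of combining BuildMI with addFrameReference; MF, MIMD, TII, DstReg and FrameIdx are assumed to already be in scope:

  // Build a 32-bit load from a stack object: the frame-index helper appends
  // the five X86 memory operands (base, scale, index, disp, segment).
  // MF, MIMD, TII, DstReg and FrameIdx are assumed to be in scope.
  addFrameReference(
      BuildMI(MF, MIMD, TII->get(X86::MOV32rm), DstReg), FrameIdx);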
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:297
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition: STLExtras.h:2055
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1547
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:347
bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
unsigned M1(unsigned Val)
Definition: VE.h:376
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
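A minimal sketch of the demanded-elements split; the concrete mask is an illustrative assumption:

  // For a 4-wide shuffle with mask <0, 5, 1, 4> and all output lanes demanded,
  // lanes {0,1} of both inputs are reported as demanded.
  APInt DemandedLHS, DemandedRHS;
  bool Ok = getShuffleDemandedElts(/*SrcWidth=*/4, {0, 5, 1, 4},
                                   APInt::getAllOnes(4), DemandedLHS,
                                   DemandedRHS);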
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
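A few of the bit-manipulation helpers above on concrete values; the inputs are illustrative, not taken from this file:

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  // Spot checks of the helpers' results.
  assert(llvm::popcount(0xF0u) == 4);   // four bits set
  assert(llvm::Log2_32(32u) == 5u);     // floor log2
  assert(llvm::bit_ceil(5u) == 8u);     // next power of two
  assert(llvm::isPowerOf2_32(64u));     // exact power of two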
void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
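A minimal sketch of the generated unpacklo mask; the element type and the expected mask are illustrative assumptions:

  // The unpacklo pattern for v4i32 interleaves the low halves of the two
  // sources, producing the mask <0, 4, 1, 5>.
  SmallVector<int, 4> Mask;
  createUnpackShuffleMask(MVT::v4i32, Mask, /*Lo=*/true, /*Unary=*/false);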
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:155
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:160
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1978
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
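A minimal sketch of the mask scaling; the input mask and expected output are illustrative assumptions:

  // Scaling the 2-element mask <1, 0> by 2 produces the equivalent
  // 4-element mask <2, 3, 0, 1>.
  SmallVector<int, 4> Scaled;
  narrowShuffleMaskElts(/*Scale=*/2, {1, 0}, Scaled);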
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1866
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ SM_SentinelUndef
@ SM_SentinelZero
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
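DecodeUNPCKLMask/DecodeUNPCKHMask turn the implicit interleaving of unpcklps/unpckhps (and the punpckl*/punpckh* integer forms) into explicit mask indices. A minimal sketch; these decoders live in the target-internal X86ShuffleDecode.h, so the include path below is an assumption and the code is meant to be built inside the X86 target.

// Sketch only: decoding v4i32/v4f32 unpack operations into index masks.
// Assumption: MCTargetDesc/X86ShuffleDecode.h is reachable on the include
// path; only the signatures indexed above are taken from this page.
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  llvm::SmallVector<int, 8> Lo, Hi;

  // unpcklps interleaves the low halves of both sources: {0, 4, 1, 5}.
  llvm::DecodeUNPCKLMask(/*NumElts=*/4, /*ScalarBits=*/32, Lo);
  assert(Lo.size() == 4 && Lo[0] == 0 && Lo[1] == 4);

  // unpckhps interleaves the high halves: {2, 6, 3, 7}.
  llvm::DecodeUNPCKHMask(/*NumElts=*/4, /*ScalarBits=*/32, Hi);
  assert(Hi.size() == 4 && Hi[0] == 2 && Hi[1] == 6);
  return 0;
}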
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
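Several of the entries above (none_of, count, count_if, find_if, is_contained, all_equal) are the range-based wrappers from llvm/ADT/STLExtras.h that the lowering code applies to shuffle masks and operand lists. A minimal sketch over a SmallVector of mask indices; the data is illustrative only:

// Sketch only: range-based STLExtras helpers over a shuffle-mask-like vector.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  llvm::SmallVector<int, 8> Mask = {0, 1, -1, 3};

  // none_of / count_if take the whole range instead of begin()/end().
  assert(!llvm::none_of(Mask, [](int M) { return M < 0; }));
  assert(llvm::count_if(Mask, [](int M) { return M >= 0; }) == 3);

  // is_contained and find_if locate individual elements.
  assert(llvm::is_contained(Mask, 3));
  auto It = llvm::find_if(Mask, [](int M) { return M < 0; });
  assert(It != Mask.end() && *It == -1);

  // all_equal is true when every value in the list matches (or it is empty).
  assert(llvm::all_equal({2, 2, 2}));
  return 0;
}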
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
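commonAlignment is how known alignment is propagated across a byte offset: the result is the largest alignment guaranteed for an access at Base + Offset. A minimal sketch with illustrative values:

// Sketch only: Align and commonAlignment from llvm/Support/Alignment.h.
#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align Base(16);

  // An access at Base + 8 can only be assumed 8-byte aligned.
  assert(llvm::commonAlignment(Base, /*Offset=*/8) == llvm::Align(8));
  // An offset that is a multiple of 16 keeps the full alignment.
  assert(llvm::commonAlignment(Base, /*Offset=*/32) == llvm::Align(16));
  assert(Base.value() == 16);
  return 0;
}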
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
const char * toString(DWARFSectionKind Kind)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2087
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ TRUNCATE2_TO_REG
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1624
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
#define EQ(a, b)
Definition: regexec.c:112
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:257
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:306
static const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:280
static const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:259
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:315
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:258
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:256
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:318
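The fltSemantics accessors, rounding modes, and opStatus entries above are the APFloat machinery used when folding floating-point constants to narrower types. A minimal sketch of a double-to-half conversion; the specific result checks reflect my understanding of the API rather than anything stated on this page:

// Sketch only: converting an APFloat between semantics with an explicit
// rounding mode and observing the opStatus / losesInfo results.
#include "llvm/ADT/APFloat.h"
#include <cassert>

int main() {
  llvm::APFloat Val(1.0 / 3.0); // IEEEdouble semantics by default.

  bool LosesInfo = false;
  llvm::APFloat::opStatus Status = Val.convert(
      llvm::APFloat::IEEEhalf(), llvm::APFloat::rmNearestTiesToEven,
      &LosesInfo);

  // 1/3 is not exactly representable in half precision, so the conversion
  // rounds and reports that information was lost.
  assert(Status != llvm::APFloat::opOK);
  assert(LosesInfo);

  // semanticsPrecision reports the significand width of each format.
  assert(llvm::APFloat::semanticsPrecision(llvm::APFloat::IEEEhalf()) == 11);
  return 0;
}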
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
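The EVT interface summarized above is how the lowering code reasons about value types before they are mapped to register classes. A minimal sketch, assuming a standalone LLVMContext; the chosen types are illustrative only:

// Sketch only: building and querying EVTs the way the lowering code does.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;

  // v4f32: four 32-bit floats, i.e. one 128-bit XMM register's worth.
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  assert(V4F32.isVector() && V4F32.is128BitVector());
  assert(V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarSizeInBits() == 32);

  // Bitcast-style query: same layout, integer elements (v4i32).
  llvm::EVT V4I32 = V4F32.changeVectorElementTypeToInteger();
  assert(V4I32.isInteger() && V4I32.getSizeInBits() == 128);

  // Halving the element count gives v2i32, a 64-bit vector.
  llvm::EVT V2I32 = V4I32.getHalfNumVectorElementsVT(Ctx);
  assert(V2I32.is64BitVector());
  return 0;
}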
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
Definition: KnownBits.cpp:765
static std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:488
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:178
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:100
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:79
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:234
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition: KnownBits.h:266
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:153
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:281
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:85
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:43
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:164
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:53
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition: KnownBits.h:103
static KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:228
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:217
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:288
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:303
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:172
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:188
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:137
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:97
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:91
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:804
static std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:526
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:82
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:59
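The KnownBits queries listed above are what the known-bits analysis and the target combines use to prove facts about individual bits of a value. A minimal sketch over an 8-bit constant; the values are illustrative only:

// Sketch only: building KnownBits and querying the derived facts.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  // A fully known 8-bit constant: 0b00011000 (24).
  llvm::KnownBits K = llvm::KnownBits::makeConstant(llvm::APInt(8, 24));
  assert(K.isConstant() && K.getConstant() == 24);
  assert(K.countMinTrailingZeros() == 3);
  assert(K.isNonNegative());

  // Adding two copies keeps at least one low bit known zero.
  llvm::KnownBits Sum = llvm::KnownBits::add(K, K);
  assert(Sum.countMinTrailingZeros() >= 1);

  // Zero-extending to 16 bits makes the new high bits known zero.
  llvm::KnownBits Wide = K.zext(16);
  assert(Wide.getBitWidth() == 16 && Wide.countMinLeadingZeros() >= 8);
  return 0;
}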
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.