LLVM 21.0.0git
X86ISelLowering.cpp
Go to the documentation of this file.
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
24#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/CallingConv.h"
42#include "llvm/IR/Constants.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalAlias.h"
48#include "llvm/IR/IRBuilder.h"
50#include "llvm/IR/Intrinsics.h"
52#include "llvm/MC/MCAsmInfo.h"
53#include "llvm/MC/MCContext.h"
54#include "llvm/MC/MCExpr.h"
55#include "llvm/MC/MCSymbol.h"
57#include "llvm/Support/Debug.h"
62#include <algorithm>
63#include <bitset>
64#include <cctype>
65#include <numeric>
66using namespace llvm;
67
68#define DEBUG_TYPE "x86-isel"
69
71 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
73 "Sets the preferable loop alignment for experiments (as log2 bytes) "
74 "for innermost loops only. If specified, this option overrides "
75 "alignment set by x86-experimental-pref-loop-alignment."),
77
79 "x86-br-merging-base-cost", cl::init(2),
81 "Sets the cost threshold for when multiple conditionals will be merged "
82 "into one branch versus be split in multiple branches. Merging "
83 "conditionals saves branches at the cost of additional instructions. "
84 "This value sets the instruction cost limit, below which conditionals "
85 "will be merged, and above which conditionals will be split. Set to -1 "
86 "to never merge branches."),
88
90 "x86-br-merging-ccmp-bias", cl::init(6),
91 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
92 "supports conditional compare instructions."),
94
95static cl::opt<bool>
96 WidenShift("x86-widen-shift", cl::init(true),
97 cl::desc("Replace narrow shifts with wider shifts."),
99
101 "x86-br-merging-likely-bias", cl::init(0),
102 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
103 "that all conditionals will be executed. For example for merging "
104 "the conditionals (a == b && c > d), if its known that a == b is "
105 "likely, then it is likely that if the conditionals are split "
106 "both sides will be executed, so it may be desirable to increase "
107 "the instruction cost threshold. Set to -1 to never merge likely "
108 "branches."),
109 cl::Hidden);
110
112 "x86-br-merging-unlikely-bias", cl::init(-1),
113 cl::desc(
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
115 "that all conditionals will be executed. For example for merging "
116 "the conditionals (a == b && c > d), if its known that a == b is "
117 "unlikely, then it is unlikely that if the conditionals are split "
118 "both sides will be executed, so it may be desirable to decrease "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
120 "branches."),
121 cl::Hidden);
122
124 "mul-constant-optimization", cl::init(true),
125 cl::desc("Replace 'mul x, Const' with more effective instructions like "
126 "SHIFT, LEA, etc."),
127 cl::Hidden);
128
130 const X86Subtarget &STI)
131 : TargetLowering(TM), Subtarget(STI) {
132 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
133 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
134
135 // Set up the TargetLowering object.
136
137 // X86 is weird. It always uses i8 for shift amounts and setcc results.
139 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
141
142 // X86 instruction cache is coherent with its data cache so we can use the
143 // default expansion to a no-op.
145
146 // For 64-bit, since we have so many registers, use the ILP scheduler.
147 // For 32-bit, use the register pressure specific scheduling.
148 // For Atom, always use ILP scheduling.
149 if (Subtarget.isAtom())
151 else if (Subtarget.is64Bit())
153 else
155 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
157
158 // Bypass expensive divides and use cheaper ones.
159 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
160 if (Subtarget.hasSlowDivide32())
161 addBypassSlowDiv(32, 8);
162 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
163 addBypassSlowDiv(64, 32);
164 }
165
166 // Setup Windows compiler runtime calls.
167 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
168 static const struct {
169 const RTLIB::Libcall Op;
170 const char * const Name;
171 const CallingConv::ID CC;
172 } LibraryCalls[] = {
173 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
174 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
175 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
176 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
177 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
178 };
179
180 for (const auto &LC : LibraryCalls) {
181 setLibcallName(LC.Op, LC.Name);
182 setLibcallCallingConv(LC.Op, LC.CC);
183 }
184 }
185
186 if (Subtarget.canUseCMPXCHG16B())
188 else if (Subtarget.canUseCMPXCHG8B())
190 else
192
193 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
194
196
197 // Set up the register classes.
198 addRegisterClass(MVT::i8, &X86::GR8RegClass);
199 addRegisterClass(MVT::i16, &X86::GR16RegClass);
200 addRegisterClass(MVT::i32, &X86::GR32RegClass);
201 if (Subtarget.is64Bit())
202 addRegisterClass(MVT::i64, &X86::GR64RegClass);
203
204 for (MVT VT : MVT::integer_valuetypes())
206
207 // We don't accept any truncstore of integer registers.
208 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
209 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
211 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
212 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
213 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
214
215 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
216
217 // SETOEQ and SETUNE require checking two conditions.
218 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
221 }
222
223 // Integer absolute.
224 if (Subtarget.canUseCMOV()) {
225 setOperationAction(ISD::ABS , MVT::i16 , Custom);
226 setOperationAction(ISD::ABS , MVT::i32 , Custom);
227 if (Subtarget.is64Bit())
228 setOperationAction(ISD::ABS , MVT::i64 , Custom);
229 }
230
231 // Absolute difference.
232 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
233 setOperationAction(Op , MVT::i8 , Custom);
234 setOperationAction(Op , MVT::i16 , Custom);
235 setOperationAction(Op , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(Op , MVT::i64 , Custom);
238 }
239
240 // Signed saturation subtraction.
244 if (Subtarget.is64Bit())
246
247 // Funnel shifts.
248 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
249 // For slow shld targets we only lower for code size.
250 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
251
252 setOperationAction(ShiftOp , MVT::i8 , Custom);
253 setOperationAction(ShiftOp , MVT::i16 , Custom);
254 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
255 if (Subtarget.is64Bit())
256 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
257 }
258
259 if (!Subtarget.useSoftFloat()) {
260 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
261 // operation.
266 // We have an algorithm for SSE2, and we turn this into a 64-bit
267 // FILD or VCVTUSI2SS/SD for other targets.
270 // We have an algorithm for SSE2->double, and we turn this into a
271 // 64-bit FILD followed by conditional FADD for other targets.
274
275 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
276 // this operation.
279 // SSE has no i16 to fp conversion, only i32. We promote in the handler
280 // to allow f80 to use i16 and f64 to use i16 with sse1 only
283 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
287 // are Legal, f80 is custom lowered.
290
291 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
292 // this operation.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
301 // are Legal, f80 is custom lowered.
304
305 // Handle FP_TO_UINT by promoting the destination to a larger signed
306 // conversion.
308 // FIXME: This doesn't generate invalid exception when it should. PR44019.
311 // FIXME: This doesn't generate invalid exception when it should. PR44019.
317
322
323 if (!Subtarget.is64Bit()) {
326 }
327 }
328
329 if (Subtarget.hasSSE2()) {
330 // Custom lowering for saturating float to int conversions.
331 // We handle promotion to larger result types manually.
332 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
335 }
338 if (Subtarget.is64Bit()) {
341 }
342 }
343 if (Subtarget.hasAVX10_2()) {
346 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
347 MVT::v4i64}) {
350 }
351 if (Subtarget.hasAVX10_2_512()) {
354 }
355 if (Subtarget.is64Bit()) {
358 }
359 }
360
361 // Handle address space casts between mixed sized pointers.
364
365 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
366 if (!Subtarget.hasSSE2()) {
371 if (Subtarget.is64Bit()) {
373 // Without SSE, i64->f64 goes through memory.
375 }
376 } else if (!Subtarget.is64Bit())
378
379 // Scalar integer divide and remainder are lowered to use operations that
380 // produce two results, to match the available instructions. This exposes
381 // the two-result form to trivial CSE, which is able to combine x/y and x%y
382 // into a single instruction.
383 //
384 // Scalar integer multiply-high is also lowered to use two-result
385 // operations, to match the available instructions. However, plain multiply
386 // (low) operations are left as Legal, as there are single-result
387 // instructions for this in x86. Using the two-result multiply instructions
388 // when both high and low results are needed must be arranged by dagcombine.
389 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
396 }
397
398 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
400 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
401 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
404 }
405 if (Subtarget.is64Bit())
410
411 setOperationAction(ISD::FREM , MVT::f32 , Expand);
412 setOperationAction(ISD::FREM , MVT::f64 , Expand);
413 setOperationAction(ISD::FREM , MVT::f80 , Expand);
414 setOperationAction(ISD::FREM , MVT::f128 , Expand);
415
416 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
422 }
423
424 // Promote the i8 variants and force them on up to i32 which has a shorter
425 // encoding.
426 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
428 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
429 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
430 // promote that too.
431 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
433
434 if (!Subtarget.hasBMI()) {
435 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
437 if (Subtarget.is64Bit()) {
438 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
440 }
441 }
442
443 if (Subtarget.hasLZCNT()) {
444 // When promoting the i8 variants, force them to i32 for a shorter
445 // encoding.
446 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
448 } else {
449 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
450 if (VT == MVT::i64 && !Subtarget.is64Bit())
451 continue;
454 }
455 }
456
459 // Special handling for half-precision floating point conversions.
460 // If we don't have F16C support, then lower half float conversions
461 // into library calls.
463 Op, MVT::f32,
464 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
465 // There's never any support for operations beyond MVT::f32.
466 setOperationAction(Op, MVT::f64, Expand);
467 setOperationAction(Op, MVT::f80, Expand);
468 setOperationAction(Op, MVT::f128, Expand);
469 }
470
471 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
474 }
475
476 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
477 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
478 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
479 setTruncStoreAction(VT, MVT::f16, Expand);
480 setTruncStoreAction(VT, MVT::bf16, Expand);
481
484 }
485
489 if (Subtarget.is64Bit())
491 if (Subtarget.hasPOPCNT()) {
492 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
493 // popcntw is longer to encode than popcntl and also has a false dependency
494 // on the dest that popcntl hasn't had since Cannon Lake.
495 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
496 } else {
501 }
502
504
505 if (!Subtarget.hasMOVBE())
507
508 // X86 wants to expand cmov itself.
509 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
514 }
515 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
520 }
521
522 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
525
527 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
528 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
532 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
533 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
534
535 // Darwin ABI issue.
536 for (auto VT : { MVT::i32, MVT::i64 }) {
537 if (VT == MVT::i64 && !Subtarget.is64Bit())
538 continue;
545 }
546
547 // 64-bit shl, sra, srl (iff 32-bit x86)
548 for (auto VT : { MVT::i32, MVT::i64 }) {
549 if (VT == MVT::i64 && !Subtarget.is64Bit())
550 continue;
554 }
555
556 if (Subtarget.hasSSEPrefetch())
558
560
561 // Expand certain atomics
562 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
570 }
571
572 if (!Subtarget.is64Bit())
574
575 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
576 // All CPUs supporting AVX will atomically load/store aligned 128-bit
577 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
580 }
581
582 if (Subtarget.canUseCMPXCHG16B())
584
585 // FIXME - use subtarget debug flags
586 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
587 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
588 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
590 }
591
594
597
598 setOperationAction(ISD::TRAP, MVT::Other, Legal);
600 if (Subtarget.isTargetPS())
602 else
604
605 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
607 setOperationAction(ISD::VAEND , MVT::Other, Expand);
608 bool Is64Bit = Subtarget.is64Bit();
609 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
610 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
611
614
616
617 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
620
622
623 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
624 setOperationAction(ISD::FABS, VT, Action);
625 setOperationAction(ISD::FNEG, VT, Action);
627 setOperationAction(ISD::FREM, VT, Action);
628 setOperationAction(ISD::FMA, VT, Action);
629 setOperationAction(ISD::FMINNUM, VT, Action);
630 setOperationAction(ISD::FMAXNUM, VT, Action);
635 setOperationAction(ISD::FSIN, VT, Action);
636 setOperationAction(ISD::FCOS, VT, Action);
637 setOperationAction(ISD::FSINCOS, VT, Action);
638 setOperationAction(ISD::FTAN, VT, Action);
639 setOperationAction(ISD::FSQRT, VT, Action);
640 setOperationAction(ISD::FPOW, VT, Action);
641 setOperationAction(ISD::FPOWI, VT, Action);
642 setOperationAction(ISD::FLOG, VT, Action);
643 setOperationAction(ISD::FLOG2, VT, Action);
644 setOperationAction(ISD::FLOG10, VT, Action);
645 setOperationAction(ISD::FEXP, VT, Action);
646 setOperationAction(ISD::FEXP2, VT, Action);
647 setOperationAction(ISD::FEXP10, VT, Action);
648 setOperationAction(ISD::FCEIL, VT, Action);
649 setOperationAction(ISD::FFLOOR, VT, Action);
651 setOperationAction(ISD::FRINT, VT, Action);
652 setOperationAction(ISD::BR_CC, VT, Action);
653 setOperationAction(ISD::SETCC, VT, Action);
656 setOperationAction(ISD::FROUND, VT, Action);
658 setOperationAction(ISD::FTRUNC, VT, Action);
659 setOperationAction(ISD::FLDEXP, VT, Action);
660 };
661
662 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
663 // f16, f32 and f64 use SSE.
664 // Set up the FP register classes.
665 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
666 : &X86::FR16RegClass);
667 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
668 : &X86::FR32RegClass);
669 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
670 : &X86::FR64RegClass);
671
672 // Disable f32->f64 extload as we can only generate this in one instruction
673 // under optsize. So its easier to pattern match (fpext (load)) for that
674 // case instead of needing to emit 2 instructions for extload in the
675 // non-optsize case.
676 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
677
678 for (auto VT : { MVT::f32, MVT::f64 }) {
679 // Use ANDPD to simulate FABS.
681
682 // Use XORP to simulate FNEG.
684
685 // Use ANDPD and ORPD to simulate FCOPYSIGN.
687
688 // These might be better off as horizontal vector ops.
691
692 // We don't support sin/cos/fmod
696 }
697
698 // Half type will be promoted by default.
699 setF16Action(MVT::f16, Promote);
707
738
739 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
740 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
741
742 // Lower this to MOVMSK plus an AND.
745
746 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
747 (UseX87 || Is64Bit)) {
748 // Use SSE for f32, x87 for f64.
749 // Set up the FP register classes.
750 addRegisterClass(MVT::f32, &X86::FR32RegClass);
751 if (UseX87)
752 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
753
754 // Use ANDPS to simulate FABS.
756
757 // Use XORP to simulate FNEG.
759
760 if (UseX87)
762
763 // Use ANDPS and ORPS to simulate FCOPYSIGN.
764 if (UseX87)
767
768 // We don't support sin/cos/fmod
772
773 if (UseX87) {
774 // Always expand sin/cos functions even though x87 has an instruction.
778 }
779 } else if (UseX87) {
780 // f32 and f64 in x87.
781 // Set up the FP register classes.
782 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
783 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
784
785 for (auto VT : { MVT::f32, MVT::f64 }) {
788
789 // Always expand sin/cos functions even though x87 has an instruction.
793 }
794 }
795
796 // Expand FP32 immediates into loads from the stack, save special cases.
797 if (isTypeLegal(MVT::f32)) {
798 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
799 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
800 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
801 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
802 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
803 } else // SSE immediates.
804 addLegalFPImmediate(APFloat(+0.0f)); // xorps
805 }
806 // Expand FP64 immediates into loads from the stack, save special cases.
807 if (isTypeLegal(MVT::f64)) {
808 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
809 addLegalFPImmediate(APFloat(+0.0)); // FLD0
810 addLegalFPImmediate(APFloat(+1.0)); // FLD1
811 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
812 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
813 } else // SSE immediates.
814 addLegalFPImmediate(APFloat(+0.0)); // xorpd
815 }
816 // Support fp16 0 immediate.
817 if (isTypeLegal(MVT::f16))
818 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
819
820 // Handle constrained floating-point operations of scalar.
833
834 // We don't support FMA.
837
838 // f80 always uses X87.
839 if (UseX87) {
840 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
843 {
845 addLegalFPImmediate(TmpFlt); // FLD0
846 TmpFlt.changeSign();
847 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
848
849 bool ignored;
850 APFloat TmpFlt2(+1.0);
852 &ignored);
853 addLegalFPImmediate(TmpFlt2); // FLD1
854 TmpFlt2.changeSign();
855 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
856 }
857
858 // Always expand sin/cos functions even though x87 has an instruction.
859 // clang-format off
871 // clang-format on
872
884
885 // Handle constrained floating-point operations of scalar.
892 if (isTypeLegal(MVT::f16)) {
895 } else {
897 }
898 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
899 // as Custom.
901 }
902
903 // f128 uses xmm registers, but most operations require libcalls.
904 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
905 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
906 : &X86::VR128RegClass);
907
908 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
909
920
924
925 // clang-format off
933 // clang-format on
934 // No STRICT_FSINCOS
937
940 // We need to custom handle any FP_ROUND with an f128 input, but
941 // LegalizeDAG uses the result type to know when to run a custom handler.
942 // So we have to list all legal floating point result types here.
943 if (isTypeLegal(MVT::f32)) {
946 }
947 if (isTypeLegal(MVT::f64)) {
950 }
951 if (isTypeLegal(MVT::f80)) {
955 }
956
958
959 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
960 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
961 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
962 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
963 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
964 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
965 }
966
967 // Always use a library call for pow.
968 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
969 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
970 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
971 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
972
981
982 // Some FP actions are always expanded for vector types.
983 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
984 MVT::v4f32, MVT::v8f32, MVT::v16f32,
985 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
986 // clang-format off
1000 // clang-format on
1001 }
1002
1003 // First set operation action for all vector types to either promote
1004 // (for widening) or expand (for scalarization). Then we will selectively
1005 // turn on ones that can be effectively codegen'd.
1045 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1046 setTruncStoreAction(InnerVT, VT, Expand);
1047
1048 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1049 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1050
1051 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1052 // types, we have to deal with them whether we ask for Expansion or not.
1053 // Setting Expand causes its own optimisation problems though, so leave
1054 // them legal.
1055 if (VT.getVectorElementType() == MVT::i1)
1056 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1057
1058 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1059 // split/scalarized right now.
1060 if (VT.getVectorElementType() == MVT::f16 ||
1061 VT.getVectorElementType() == MVT::bf16)
1062 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1063 }
1064 }
1065
1066 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1067 // with -msoft-float, disable use of MMX as well.
1068 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1069 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1070 // No operations on x86mmx supported, everything uses intrinsics.
1071 }
1072
1073 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1074 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1075 : &X86::VR128RegClass);
1076
1081
1082 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1083 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1091
1092 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1093 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1095
1101 }
1102
1103 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1104 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1105 : &X86::VR128RegClass);
1106
1107 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1108 // registers cannot be used even for integer operations.
1109 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1110 : &X86::VR128RegClass);
1111 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1112 : &X86::VR128RegClass);
1113 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1114 : &X86::VR128RegClass);
1115 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1116 : &X86::VR128RegClass);
1117 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1118 : &X86::VR128RegClass);
1119
1120 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1125 }
1126
1127 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1128 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1133 }
1134
1135 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1136 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1137 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1138
1139 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1140 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1141 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1142 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1143 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1144 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1145 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1146 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1147 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1148 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1151
1152 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1153 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1154 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1155
1156 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1158 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1160
1161 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1162 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1163
1164 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1165 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1166 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1167 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1168 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1169 }
1170
1181
1186
1187 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1193
1194 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1195 // setcc all the way to isel and prefer SETGT in some isel patterns.
1198 }
1199
1200 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1201 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1206
1207 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1213 }
1214
1215 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1219
1220 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1221 continue;
1222
1225 }
1226 setF16Action(MVT::v8f16, Expand);
1227 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1228 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1229 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1230 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1231 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1232 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1234
1235 // Custom lower v2i64 and v2f64 selects.
1242
1249
1250 // Custom legalize these to avoid over promotion or custom promotion.
1251 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1256 }
1257
1262
1265
1268
1269 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1274
1279
1280 // We want to legalize this to an f64 load rather than an i64 load on
1281 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1282 // store.
1283 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1284 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1285 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1286 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1287 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1289
1290 // Add 32-bit vector stores to help vectorization opportunities.
1291 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1293
1297 if (!Subtarget.hasAVX512())
1299
1303
1305
1322
1323 // In the customized shift lowering, the legal v4i32/v2i64 cases
1324 // in AVX2 will be recognized.
1325 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1329 if (VT == MVT::v2i64) continue;
1334 }
1335
1341 }
1342
1343 if (Subtarget.hasGFNI()) {
1348 }
1349
1350 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1351 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1352 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1353 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1354
1355 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1358 }
1359
1360 // These might be better off as horizontal vector ops.
1365 }
1366
1367 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1368 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1371 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1375 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1381
1383 }
1384
1385 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1386 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1387 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1388 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1389 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1390 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1391 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1392 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1393
1397
1398 // FIXME: Do we need to handle scalar-to-vector here?
1399 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1400 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1401
1402 // We directly match byte blends in the backend as they match the VSELECT
1403 // condition form.
1405
1406 // SSE41 brings specific instructions for doing vector sign extend even in
1407 // cases where we don't have SRA.
1408 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1411 }
1412
1413 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1414 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1415 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1416 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1417 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1418 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1419 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1420 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1421 }
1422
1423 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1424 // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can
1425 // do the pre and post work in the vector domain.
1428 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1429 // so that DAG combine doesn't try to turn it into uint_to_fp.
1432 }
1433 }
1434
1435 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1437 }
1438
1439 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1440 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1441 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1444 }
1445
1446 // XOP can efficiently perform BITREVERSE with VPPERM.
1447 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1449 }
1450
1451 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1452 bool HasInt256 = Subtarget.hasInt256();
1453
1454 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1455 : &X86::VR256RegClass);
1456 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1457 : &X86::VR256RegClass);
1458 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1459 : &X86::VR256RegClass);
1460 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1461 : &X86::VR256RegClass);
1462 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1463 : &X86::VR256RegClass);
1464 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1465 : &X86::VR256RegClass);
1466 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1467 : &X86::VR256RegClass);
1468
1469 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1482
1484
1488
1494 }
1495
1496 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1497 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1498
1499 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1500 // even though v8i16 is a legal type.
1501 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1502 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1503 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1504 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1508
1515
1527
1528 if (!Subtarget.hasAVX512())
1530
1531 // In the customized shift lowering, the legal v8i32/v4i64 cases
1532 // in AVX2 will be recognized.
1533 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1539 if (VT == MVT::v4i64) continue;
1544 }
1545
1546 // These types need custom splitting if their input is a 128-bit vector.
1551
1555 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1556 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1559
1560 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1564 }
1565
1570
1571 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1576
1577 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1578 // setcc all the way to isel and prefer SETGT in some isel patterns.
1581 }
1582
1583 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1584 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1589
1590 if (Subtarget.hasAnyFMA()) {
1591 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1592 MVT::v2f64, MVT::v4f64 }) {
1595 }
1596 }
1597
1598 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1599 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1601 }
1602
1603 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1604 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1605 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1606 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1607
1608 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1609 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1610 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1611 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1613 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1614 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1616
1617 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1618 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1619
1620 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1621 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1622 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1623 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1624 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1625
1626 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1630 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1631 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1632 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1633 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1638
1639 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1640 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1641 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1642 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1643 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1644 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1645 }
1646
1647 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1650 }
1651
1652 if (HasInt256) {
1653 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1654 // when we have a 256bit-wide blend with immediate.
1657
1658 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1659 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1660 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1661 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1662 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1663 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1664 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1665 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1666 }
1667 }
1668
1669 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1670 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1671 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1673 }
1674
1675 // Extract subvector is special because the value type
1676 // (result) is 128-bit but the source is 256-bit wide.
1677 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1678 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1680 }
1681
1682 // Custom lower several nodes for 256-bit types.
1683 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1684 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1694 }
1695 setF16Action(MVT::v16f16, Expand);
1696 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1697 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1699 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1700 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1701 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1702 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1703
1704 if (HasInt256) {
1706
1707 // Custom legalize 2x32 to get a little better code.
1710
1711 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1712 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1714 }
1715 }
1716
1717 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1718 Subtarget.hasF16C()) {
1719 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1722 }
1723 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1726 }
1727 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1728 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1729 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1730 }
1731 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1732 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1733 }
1734
1735 // This block controls legalization of the mask vector sizes that are
1736 // available with AVX512. 512-bit vectors are in a separate block controlled
1737 // by useAVX512Regs.
1738 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1739 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1740 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1741 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1742 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1743 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1744
1748
1749 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1750 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1751 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1752 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1753 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1754 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1755 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1756 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1764
1765 // There is no byte sized k-register load or store without AVX512DQ.
1766 if (!Subtarget.hasDQI()) {
1767 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1768 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1769 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1770 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1771
1776 }
1777
1778 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1779 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1783 }
1784
1785 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1787
1788 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1792
1799 }
1800
1801 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1803 }
1804 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1805 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1808 }
1809 }
1810
1811 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1812 // elements. 512-bits can be disabled based on prefer-vector-width and
1813 // required-vector-width function attributes.
1814 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1815 bool HasBWI = Subtarget.hasBWI();
1816
1817 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1818 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1819 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1820 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1821 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1822 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1823 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1824
1825 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1826 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1827 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1828 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1829 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1830 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1831 if (HasBWI)
1832 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1833 }
1834
1835 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1846 }
1847 setOperationAction(ISD::LRINT, MVT::v16f32,
1848 Subtarget.hasDQI() ? Legal : Custom);
1849 setOperationAction(ISD::LRINT, MVT::v8f64,
1850 Subtarget.hasDQI() ? Legal : Custom);
1851 if (Subtarget.hasDQI())
1852 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1853
1854 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1859 }
1860
1861 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1866 }
1867
1874
1886
1887 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1888 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1889 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1890 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1891 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1892 if (HasBWI)
1893 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1894
1895 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1896 // to 512-bit rather than use the AVX2 instructions so that we can use
1897 // k-masks.
1898 if (!Subtarget.hasVLX()) {
1899 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1900 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1903 }
1904 }
1905
1907 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1908 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1918
1919 if (HasBWI) {
1920 // Extends from v64i1 masks to 512-bit vectors.
1924 }
1925
1926 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1939
1941 }
1942
1943 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1946 }
1947
1948 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1949 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1951 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1952
1953 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1954 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1955 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1956 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1957
1958 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1959 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1960 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1961 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1962 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1963 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1964 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1965 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1966
1967 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1968 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1969
1970 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1980
1981 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1982 // setcc all the way to isel and prefer SETGT in some isel patterns.
1985 }
1986
1987 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1988 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1993
1994 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2001 }
2002
2003 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2004 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2005 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2007 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2008 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2009 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2010 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2015 }
2016
2017 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2018 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2019 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2020 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2021 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2022 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2023
2024 if (Subtarget.hasDQI()) {
2028 setOperationAction(Opc, MVT::v8i64, Custom);
2029 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2030 }
2031
2032 if (Subtarget.hasCDI()) {
2033 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2034 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2036 }
2037 } // Subtarget.hasCDI()
2038
2039 if (Subtarget.hasVPOPCNTDQ()) {
2040 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2042 }
2043
2044 // Extract subvector is special because the value type
2045 // (result) is 256-bit but the source is 512-bit wide.
2046 // 128-bit was made Legal under AVX1.
2047 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2048 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2050
2051 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2052 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2062 }
2063 setF16Action(MVT::v32f16, Expand);
2068 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2069 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2070 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2071
2072 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2077 }
2078 if (HasBWI) {
2079 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2082 }
2083 } else {
2084 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2085 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2086 }
2087
2088 if (Subtarget.hasVBMI2()) {
2089 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2092 }
2093
2094 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2095 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2096 }
2097
2098 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2099 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2101 }// useAVX512Regs
2102
2103 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2104 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2105 MVT::v4i64}) {
2108 }
2109 }
2110
2111 // This block controls legalization for operations that don't have
2112 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2113 // narrower widths.
2114 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2115 // These operations are handled on non-VLX by artificially widening in
2116 // isel patterns.
2117
2121
2122 if (Subtarget.hasDQI()) {
2123 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2124 // v2f32 UINT_TO_FP is already custom under SSE2.
2127 "Unexpected operation action!");
2128 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2133 }
2134
2135 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2141 }
2142
2143 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2146 }
2147
2148 // Custom legalize 2x32 to get a little better code.
2151
2152 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2153 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2155
2156 if (Subtarget.hasDQI()) {
2160 setOperationAction(Opc, MVT::v2i64, Custom);
2161 setOperationAction(Opc, MVT::v4i64, Custom);
2162 }
2163 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2164 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2165 }
2166
2167 if (Subtarget.hasCDI()) {
2168 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2170 }
2171 } // Subtarget.hasCDI()
2172
2173 if (Subtarget.hasVPOPCNTDQ()) {
2174 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2176 }
2177
2178 // We can try to convert vectors to different sizes to leverage legal
2179 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2180 // then specialize to Legal below.
2181 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2182 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2183 MVT::v16i16, MVT::v8i8})
2185
2186 // Legal vpcompress depends on various AVX512 extensions.
2187 // Legal in AVX512F
2188 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2190
2191 // Legal in AVX512F + AVX512VL
2192 if (Subtarget.hasVLX())
2193 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2194 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2196
2197 // Legal in AVX512F + AVX512VBMI2
2198 if (Subtarget.hasVBMI2())
2199 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2201
2202 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2203 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2204 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2206 }
2207
 2208 // This block controls legalization of v32i1/v64i1, which are available with
 2209 // AVX512BW.
2210 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2211 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2212 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2213
2214 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2225 }
2226
2227 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2229
2230 // Extends from v32i1 masks to 256-bit vectors.
2234
2235 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2236 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2237 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2238 }
2239
2240 // These operations are handled on non-VLX by artificially widening in
2241 // isel patterns.
2242 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2243
2244 if (Subtarget.hasBITALG()) {
2245 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2247 }
2248 }
2249
2250 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2251 auto setGroup = [&] (MVT VT) {
2262
2275
2277
2280
2286
2292
2296 };
2297
2298 // AVX512_FP16 scalar operations
2299 setGroup(MVT::f16);
2315
2318
2319 if (Subtarget.useAVX512Regs()) {
2320 setGroup(MVT::v32f16);
2326 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2333
2338 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2340 MVT::v32i16);
2341 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2343 MVT::v32i16);
2344 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2346 MVT::v32i16);
2347 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2349 MVT::v32i16);
2350
2354
2355 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2356 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2357
2362 }
2363
2364 if (Subtarget.hasVLX()) {
2365 setGroup(MVT::v8f16);
2366 setGroup(MVT::v16f16);
2367
2378
2389
2390 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2393
2397
2398 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2399 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2400 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2401 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2402
2403 // Need to custom widen these to prevent scalarization.
2404 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2405 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2406
2411
2416 }
2417 }
2418
2419 if (!Subtarget.useSoftFloat() &&
2420 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2421 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2422 : &X86::VR128RegClass);
2423 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2424 : &X86::VR256RegClass);
2425 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2426 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2427 // Set the operation action Custom to do the customization later.
2430 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2431 setF16Action(VT, Expand);
2432 if (!Subtarget.hasBF16())
2438 }
2439 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2440 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2441 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2442 }
2443 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2444 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2446 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2447 }
2448
2449 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2450 Subtarget.useAVX512Regs()) {
2451 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2452 setF16Action(MVT::v32bf16, Expand);
2453 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2454 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2455 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2457 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2461 }
2462
2463 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2464 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2476 }
2477 if (Subtarget.hasAVX10_2_512()) {
2478 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2479 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2480 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2481 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2482 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2483 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2484 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2485 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2486 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2489 }
2490 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2493 }
2494 }
2495
2496 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2497 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2498 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2499 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2500 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2501 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2502
2503 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2504 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2505 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2506 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2507 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2508
2509 if (Subtarget.hasBWI()) {
2510 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2511 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2512 }
2513
2514 if (Subtarget.hasFP16()) {
2515 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2524 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2533 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2538 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2543 }
2544 }
2545
2546 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2547 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2548 }
2549
2550 // We want to custom lower some of our intrinsics.
2554 if (!Subtarget.is64Bit()) {
2556 }
2557
2558 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2559 // handle type legalization for these operations here.
2560 //
2561 // FIXME: We really should do custom legalization for addition and
2562 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2563 // than generic legalization for 64-bit multiplication-with-overflow, though.
2564 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2565 if (VT == MVT::i64 && !Subtarget.is64Bit())
2566 continue;
2567 // Add/Sub/Mul with overflow operations are custom lowered.
2574
2575 // Support carry in as value rather than glue.
2581 }
2582
2583 // Combine sin / cos into _sincos_stret if it is available.
2584 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2585 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2588 }
2589
2590 if (Subtarget.isTargetWin64()) {
2591 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2592 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2593 setOperationAction(ISD::SREM, MVT::i128, Custom);
2594 setOperationAction(ISD::UREM, MVT::i128, Custom);
2603 }
2604
2605 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2606 // is. We should promote the value to 64-bits to solve this.
2607 // This is what the CRT headers do - `fmodf` is an inline header
2608 // function casting to f64 and calling `fmod`.
2609 if (Subtarget.is32Bit() &&
2610 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2611 // clang-format off
2612 for (ISD::NodeType Op :
2630 if (isOperationExpand(Op, MVT::f32))
2631 setOperationAction(Op, MVT::f32, Promote);
2632 // clang-format on
2633
2634 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2635 // it, but it's just a wrapper around ldexp.
2636 if (Subtarget.isOSWindows()) {
2638 if (isOperationExpand(Op, MVT::f32))
2639 setOperationAction(Op, MVT::f32, Promote);
2640 }
2641
2642 // We have target-specific dag combine patterns for the following nodes:
2653 ISD::SHL,
2654 ISD::SRA,
2655 ISD::SRL,
2656 ISD::OR,
2657 ISD::AND,
2663 ISD::ADD,
2664 ISD::FADD,
2665 ISD::FSUB,
2666 ISD::FNEG,
2667 ISD::FMA,
2671 ISD::SUB,
2672 ISD::LOAD,
2673 ISD::LRINT,
2675 ISD::MLOAD,
2676 ISD::STORE,
2692 ISD::SETCC,
2693 ISD::MUL,
2694 ISD::XOR,
2705
2707
2708 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2710 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2712 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2714
2715 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2716 // that needs to benchmarked and balanced with the potential use of vector
2717 // load/store types (PR33329, PR33914).
2720
2721 // Default loop alignment, which can be overridden by -align-loops.
2723
2724 // An out-of-order CPU can speculatively execute past a predictable branch,
2725 // but a conditional move could be stalled by an expensive earlier operation.
2726 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2727 EnableExtLdPromotion = true;
2729
2731
2732 // Default to having -disable-strictnode-mutation on
2733 IsStrictFPEnabled = true;
2734}
2735
2736// This has so far only been implemented for 64-bit MachO.
2738 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2739}
2740
2742 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2743 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2744}
2745
2747 const SDLoc &DL) const {
2748 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2749 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2750 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2751 return SDValue(Node, 0);
2752}
2753
2756 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2757 !Subtarget.hasBWI())
2758 return TypeSplitVector;
2759
2760 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2761 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2762 return TypeSplitVector;
2763
2764 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2765 VT.getVectorElementType() != MVT::i1)
2766 return TypeWidenVector;
2767
2769}
2770
2771FastISel *
2773 const TargetLibraryInfo *libInfo) const {
2774 return X86::createFastISel(funcInfo, libInfo);
2775}
2776
2777//===----------------------------------------------------------------------===//
2778// Other Lowering Hooks
2779//===----------------------------------------------------------------------===//
2780
2782 bool AssumeSingleUse) {
2783 if (!AssumeSingleUse && !Op.hasOneUse())
2784 return false;
2785 if (!ISD::isNormalLoad(Op.getNode()))
2786 return false;
2787
2788 // If this is an unaligned vector, make sure the target supports folding it.
2789 auto *Ld = cast<LoadSDNode>(Op.getNode());
2790 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2791 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2792 return false;
2793
2794 // TODO: If this is a non-temporal load and the target has an instruction
2795 // for it, it should not be folded. See "useNonTemporalLoad()".
2796
2797 return true;
2798}
2799
2801 const X86Subtarget &Subtarget,
2802 bool AssumeSingleUse) {
2803 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2804 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2805 return false;
2806
2807 // We can not replace a wide volatile load with a broadcast-from-memory,
2808 // because that would narrow the load, which isn't legal for volatiles.
2809 auto *Ld = cast<LoadSDNode>(Op.getNode());
2810 return !Ld->isVolatile() ||
2811 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2812}
2813
2815 if (!Op.hasOneUse())
2816 return false;
2817 // Peek through (oneuse) bitcast users
2818 SDNode *User = *Op->user_begin();
2819 while (User->getOpcode() == ISD::BITCAST) {
2820 if (!User->hasOneUse())
2821 return false;
2822 User = *User->user_begin();
2823 }
2824 return ISD::isNormalStore(User);
2825}
2826
2828 if (Op.hasOneUse()) {
2829 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2830 return (ISD::ZERO_EXTEND == Opcode);
2831 }
2832 return false;
2833}
2834
2835static bool isLogicOp(unsigned Opcode) {
2836 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2837 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2838}
2839
2840static bool isTargetShuffle(unsigned Opcode) {
2841 switch(Opcode) {
2842 default: return false;
2843 case X86ISD::BLENDI:
2844 case X86ISD::PSHUFB:
2845 case X86ISD::PSHUFD:
2846 case X86ISD::PSHUFHW:
2847 case X86ISD::PSHUFLW:
2848 case X86ISD::SHUFP:
2849 case X86ISD::INSERTPS:
2850 case X86ISD::EXTRQI:
2851 case X86ISD::INSERTQI:
2852 case X86ISD::VALIGN:
2853 case X86ISD::PALIGNR:
2854 case X86ISD::VSHLDQ:
2855 case X86ISD::VSRLDQ:
2856 case X86ISD::MOVLHPS:
2857 case X86ISD::MOVHLPS:
2858 case X86ISD::MOVSHDUP:
2859 case X86ISD::MOVSLDUP:
2860 case X86ISD::MOVDDUP:
2861 case X86ISD::MOVSS:
2862 case X86ISD::MOVSD:
2863 case X86ISD::MOVSH:
2864 case X86ISD::UNPCKL:
2865 case X86ISD::UNPCKH:
2866 case X86ISD::VBROADCAST:
2867 case X86ISD::VPERMILPI:
2868 case X86ISD::VPERMILPV:
2869 case X86ISD::VPERM2X128:
2870 case X86ISD::SHUF128:
2871 case X86ISD::VPERMIL2:
2872 case X86ISD::VPERMI:
2873 case X86ISD::VPPERM:
2874 case X86ISD::VPERMV:
2875 case X86ISD::VPERMV3:
2876 case X86ISD::VZEXT_MOVL:
2877 return true;
2878 }
2879}
2880
2881static bool isTargetShuffleVariableMask(unsigned Opcode) {
2882 switch (Opcode) {
2883 default: return false;
2884 // Target Shuffles.
2885 case X86ISD::PSHUFB:
2886 case X86ISD::VPERMILPV:
2887 case X86ISD::VPERMIL2:
2888 case X86ISD::VPPERM:
2889 case X86ISD::VPERMV:
2890 case X86ISD::VPERMV3:
2891 return true;
2892 // 'Faux' Target Shuffles.
2893 case ISD::OR:
2894 case ISD::AND:
2895 case X86ISD::ANDNP:
2896 return true;
2897 }
2898}
2899
// NOTE(review): the declaration line and the line initializing FuncInfo are
// missing from this view (extraction artifact). Lazily creates a fixed frame
// object for the return-address slot (one slot below the incoming SP), caches
// its index in the function info, and returns it as a frame-index node.
2902 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2904 int ReturnAddrIndex = FuncInfo->getRAIndex();
2905
// Index 0 is the "not yet created" sentinel used by the cache.
2906 if (ReturnAddrIndex == 0) {
2907 // Set up a frame object for the return address.
2908 unsigned SlotSize = RegInfo->getSlotSize();
2909 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2910 -(int64_t)SlotSize,
2911 false);
2912 FuncInfo->setRAIndex(ReturnAddrIndex);
2913 }
2914
2915 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2916}
2917
// NOTE(review): declaration line missing in this view (extraction artifact).
// Decides whether a displacement Offset can be folded into an addressing mode
// under code model CM, given whether a symbolic displacement is also present.
2919 bool HasSymbolicDisplacement) {
2920 // Offset should fit into 32 bit immediate field.
2921 if (!isInt<32>(Offset))
2922 return false;
2923
2924 // If we don't have a symbolic displacement - we don't have any extra
2925 // restrictions.
2926 if (!HasSymbolicDisplacement)
2927 return true;
2928
2929 // We can fold large offsets in the large code model because we always use
2930 // 64-bit offsets.
2931 if (CM == CodeModel::Large)
2932 return true;
2933
2934 // For kernel code model we know that all objects reside in the negative
2935 // half of the 32bit address space. We may not accept negative offsets, since
2936 // they may be just off and we may accept pretty large positive ones.
2937 if (CM == CodeModel::Kernel)
2938 return Offset >= 0;
2939
2940 // For other non-large code models we assume that latest small object is 16MB
2941 // before end of 31 bits boundary. We may also accept pretty large negative
2942 // constants knowing that all objects are in the positive half of address
2943 // space.
2944 return Offset < 16 * 1024 * 1024;
2945}
2946
2947/// Return true if the condition is an signed comparison operation.
2948static bool isX86CCSigned(X86::CondCode X86CC) {
2949 switch (X86CC) {
2950 default:
2951 llvm_unreachable("Invalid integer condition!");
2952 case X86::COND_E:
2953 case X86::COND_NE:
2954 case X86::COND_B:
2955 case X86::COND_A:
2956 case X86::COND_BE:
2957 case X86::COND_AE:
2958 return false;
2959 case X86::COND_G:
2960 case X86::COND_GE:
2961 case X86::COND_L:
2962 case X86::COND_LE:
2963 return true;
2964 }
2965}
2966
// NOTE(review): declaration line missing in this view (extraction artifact).
// One-to-one mapping from an integer ISD::CondCode to the X86::CondCode used
// after a CMP/SUB sets EFLAGS.
2968 switch (SetCCOpcode) {
2969 // clang-format off
2970 default: llvm_unreachable("Invalid integer condition!");
2971 case ISD::SETEQ: return X86::COND_E;
2972 case ISD::SETGT: return X86::COND_G;
2973 case ISD::SETGE: return X86::COND_GE;
2974 case ISD::SETLT: return X86::COND_L;
2975 case ISD::SETLE: return X86::COND_LE;
2976 case ISD::SETNE: return X86::COND_NE;
2977 case ISD::SETULT: return X86::COND_B;
2978 case ISD::SETUGT: return X86::COND_A;
2979 case ISD::SETULE: return X86::COND_BE;
2980 case ISD::SETUGE: return X86::COND_AE;
2981 // clang-format on
2982 }
2983}
2984
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
// NOTE(review): the first line of the declaration is missing in this view
// (extraction artifact); the visible tail shows parameters (isFP, LHS, RHS,
// DAG), with LHS/RHS taken by reference so they can be swapped/rewritten.
2989 bool isFP, SDValue &LHS, SDValue &RHS,
2990 SelectionDAG &DAG) {
2991 if (!isFP) {
// Integer path: first try sign-bit based simplifications against a
// constant RHS, then fall back to the 1:1 integer translation.
2992 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2993 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2994 // X > -1 -> X == 0, jump !sign.
2995 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2996 return X86::COND_NS;
2997 }
2998 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2999 // X < 0 -> X == 0, jump on sign.
3000 return X86::COND_S;
3001 }
3002 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3003 // X >= 0 -> X == 0, jump on !sign.
3004 return X86::COND_NS;
3005 }
3006 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3007 // X < 1 -> X <= 0
3008 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3009 return X86::COND_LE;
3010 }
3011 }
3012
3013 return TranslateIntegerX86CC(SetCCOpcode);
3014 }
3015
3016 // First determine if it is required or is profitable to flip the operands.
3017
3018 // If LHS is a foldable load, but RHS is not, flip the condition.
3019 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3020 !ISD::isNON_EXTLoad(RHS.getNode())) {
3021 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3022 std::swap(LHS, RHS);
3023 }
3024
// These FP predicates are handled by swapping the operands and using the
// "flipped" entries in the table below.
3025 switch (SetCCOpcode) {
3026 default: break;
3027 case ISD::SETOLT:
3028 case ISD::SETOLE:
3029 case ISD::SETUGT:
3030 case ISD::SETUGE:
3031 std::swap(LHS, RHS);
3032 break;
3033 }
3034
3035 // On a floating point condition, the flags are set as follows:
3036 // ZF PF CF op
3037 // 0 | 0 | 0 | X > Y
3038 // 0 | 0 | 1 | X < Y
3039 // 1 | 0 | 0 | X == Y
3040 // 1 | 1 | 1 | unordered
3041 switch (SetCCOpcode) {
3042 // clang-format off
3043 default: llvm_unreachable("Condcode should be pre-legalized away");
3044 case ISD::SETUEQ:
3045 case ISD::SETEQ: return X86::COND_E;
3046 case ISD::SETOLT: // flipped
3047 case ISD::SETOGT:
3048 case ISD::SETGT: return X86::COND_A;
3049 case ISD::SETOLE: // flipped
3050 case ISD::SETOGE:
3051 case ISD::SETGE: return X86::COND_AE;
3052 case ISD::SETUGT: // flipped
3053 case ISD::SETULT:
3054 case ISD::SETLT: return X86::COND_B;
3055 case ISD::SETUGE: // flipped
3056 case ISD::SETULE:
3057 case ISD::SETLE: return X86::COND_BE;
3058 case ISD::SETONE:
3059 case ISD::SETNE: return X86::COND_NE;
3060 case ISD::SETUO: return X86::COND_P;
3061 case ISD::SETO: return X86::COND_NP;
// SETOEQ/SETUNE have no single-flag encoding; callers must handle
// COND_INVALID by expanding into multiple checks.
3062 case ISD::SETOEQ:
3063 case ISD::SETUNE: return X86::COND_INVALID;
3064 // clang-format on
3065 }
3066}
3067
3068/// Is there a floating point cmov for the specific X86 condition code?
3069/// Current x86 isa includes the following FP cmov instructions:
3070/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3071static bool hasFPCMov(unsigned X86CC) {
3072 switch (X86CC) {
3073 default:
3074 return false;
3075 case X86::COND_B:
3076 case X86::COND_BE:
3077 case X86::COND_E:
3078 case X86::COND_P:
3079 case X86::COND_A:
3080 case X86::COND_AE:
3081 case X86::COND_NE:
3082 case X86::COND_NP:
3083 return true;
3084 }
3085}
3086
3087static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3088 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3089 VT.is512BitVector();
3090}
3091
// NOTE(review): this is the memory-intrinsic info hook; its declaration line
// and every `Info.opc = ...` / `Info.flags = ...` assignment line are missing
// from this view (extraction artifact — those tokens were hyperlinks).
// Fills Info (ptrVal/memVT/align and the dropped fields) for X86 intrinsics
// that touch memory, returning true when Info was populated.
3093 const CallInst &I,
3094 MachineFunction &MF,
3095 unsigned Intrinsic) const {
3097 Info.offset = 0;
3098
// Intrinsics not in the "with chain" table are handled by the explicit
// switch below; anything unknown returns false (no memory info).
3099 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
3100 if (!IntrData) {
3101 switch (Intrinsic) {
3102 case Intrinsic::x86_aesenc128kl:
3103 case Intrinsic::x86_aesdec128kl:
3105 Info.ptrVal = I.getArgOperand(1);
3106 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3107 Info.align = Align(1);
3109 return true;
3110 case Intrinsic::x86_aesenc256kl:
3111 case Intrinsic::x86_aesdec256kl:
3113 Info.ptrVal = I.getArgOperand(1);
3114 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3115 Info.align = Align(1);
3117 return true;
3118 case Intrinsic::x86_aesencwide128kl:
3119 case Intrinsic::x86_aesdecwide128kl:
3121 Info.ptrVal = I.getArgOperand(0);
3122 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3123 Info.align = Align(1);
3125 return true;
3126 case Intrinsic::x86_aesencwide256kl:
3127 case Intrinsic::x86_aesdecwide256kl:
3129 Info.ptrVal = I.getArgOperand(0);
3130 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3131 Info.align = Align(1);
3133 return true;
// Atomic read-modify-write style intrinsics: memVT is sized from the
// result (or the value operand for the _rm/cc variants below).
3134 case Intrinsic::x86_cmpccxadd32:
3135 case Intrinsic::x86_cmpccxadd64:
3136 case Intrinsic::x86_atomic_bts:
3137 case Intrinsic::x86_atomic_btc:
3138 case Intrinsic::x86_atomic_btr: {
3140 Info.ptrVal = I.getArgOperand(0);
3141 unsigned Size = I.getType()->getScalarSizeInBits();
3142 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3143 Info.align = Align(Size);
3146 return true;
3147 }
3148 case Intrinsic::x86_atomic_bts_rm:
3149 case Intrinsic::x86_atomic_btc_rm:
3150 case Intrinsic::x86_atomic_btr_rm: {
3152 Info.ptrVal = I.getArgOperand(0);
3153 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3154 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3155 Info.align = Align(Size);
3158 return true;
3159 }
3160 case Intrinsic::x86_aadd32:
3161 case Intrinsic::x86_aadd64:
3162 case Intrinsic::x86_aand32:
3163 case Intrinsic::x86_aand64:
3164 case Intrinsic::x86_aor32:
3165 case Intrinsic::x86_aor64:
3166 case Intrinsic::x86_axor32:
3167 case Intrinsic::x86_axor64:
3168 case Intrinsic::x86_atomic_add_cc:
3169 case Intrinsic::x86_atomic_sub_cc:
3170 case Intrinsic::x86_atomic_or_cc:
3171 case Intrinsic::x86_atomic_and_cc:
3172 case Intrinsic::x86_atomic_xor_cc: {
3174 Info.ptrVal = I.getArgOperand(0);
3175 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3176 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3177 Info.align = Align(Size);
3180 return true;
3181 }
3182 }
3183 return false;
3184 }
3185
// Table-driven cases: truncating stores and gather/scatter.
3186 switch (IntrData->Type) {
3189 case TRUNCATE_TO_MEM_VI32: {
3191 Info.ptrVal = I.getArgOperand(0);
3192 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3194 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3195 ScalarVT = MVT::i8;
3196 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3197 ScalarVT = MVT::i16;
3198 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3199 ScalarVT = MVT::i32;
3200
3201 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3202 Info.align = Align(1);
3204 break;
3205 }
3206 case GATHER:
3207 case GATHER_AVX2: {
// No single base pointer for gathers; element count is limited by both
// the data and index vector widths.
3209 Info.ptrVal = nullptr;
3210 MVT DataVT = MVT::getVT(I.getType());
3211 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3212 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3213 IndexVT.getVectorNumElements());
3214 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3215 Info.align = Align(1);
3217 break;
3218 }
3219 case SCATTER: {
3221 Info.ptrVal = nullptr;
3222 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3223 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3224 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3225 IndexVT.getVectorNumElements());
3226 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3227 Info.align = Align(1);
3229 break;
3230 }
3231 default:
3232 return false;
3233 }
3234
3235 return true;
3236}
3237
3238/// Returns true if the target can instruction select the
3239/// specified FP immediate natively. If false, the legalizer will
3240/// materialize the FP immediate as a load from a constant pool.
// NOTE(review): declaration line missing in this view (extraction artifact).
// Simply checks Imm against the registered list of legal FP immediates
// using bitwise equality (so +0.0 and -0.0 are distinct).
3242 bool ForCodeSize) const {
3243 for (const APFloat &FPImm : LegalFPImmediates)
3244 if (Imm.bitwiseIsEqual(FPImm))
3245 return true;
3246 return false;
3247}
3248
// NOTE(review): declaration line missing in this view (extraction artifact).
// TLI hook deciding whether a load may be narrowed to NewVT; returns false
// to forbid narrowing in the TLS-relocation and AVX extract+store cases.
3250 ISD::LoadExtType ExtTy,
3251 EVT NewVT) const {
3252 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3253
3254 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3255 // relocation target a movq or addq instruction: don't let the load shrink.
3256 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3257 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3258 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3259 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3260
3261 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3262 // those uses are extracted directly into a store, then the extract + store
3263 // can be store-folded. Therefore, it's probably not worth splitting the load.
3264 EVT VT = Load->getValueType(0);
3265 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3266 for (SDUse &Use : Load->uses()) {
3267 // Skip uses of the chain value. Result 0 of the node is the load value.
3268 if (Use.getResNo() != 0)
3269 continue;
3270
3271 SDNode *User = Use.getUser();
3272
3273 // If this use is not an extract + store, it's probably worth splitting.
3274 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
3275 User->user_begin()->getOpcode() != ISD::STORE)
3276 return true;
3277 }
3278 // All non-chain uses are extract + store.
3279 return false;
3280 }
3281
// Default: allow the narrowing.
3282 return true;
3283}
3284
3285/// Returns true if it is beneficial to convert a load of a constant
3286/// to just the constant itself.
// NOTE(review): declaration line missing in this view (extraction artifact).
// Accepts any integer constant that fits in 64 bits.
3288 Type *Ty) const {
3289 assert(Ty->isIntegerTy());
3290
3291 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3292 if (BitSize == 0 || BitSize > 64)
3293 return false;
3294 return true;
3295}
3296
// NOTE(review): declaration line missing in this view (extraction artifact).
// Hook controlling whether a select of FP constant loads should be reduced;
// returns false (keep the select) exactly when the compare operand is FP
// (but not f128) on a 64-bit LP64 target with AVX.
3298 // If we are using XMM registers in the ABI and the condition of the select is
3299 // a floating-point compare and we have blendv or conditional move, then it is
3300 // cheaper to select instead of doing a cross-register move and creating a
3301 // load that depends on the compare result.
3302 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3303 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3304}
3305
// NOTE(review): declaration line missing in this view (extraction artifact).
// Hook allowing select-of-constants to be turned into math, except for
// vectors on AVX512 targets where it conflicts with vector folds.
3307 // TODO: It might be a win to ease or lift this restriction, but the generic
3308 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3309 if (VT.isVector() && Subtarget.hasAVX512())
3310 return false;
3311
3312 return true;
3313}
3314
// NOTE(review): declaration line missing in this view (extraction artifact).
// Hook deciding whether a multiply by splat-constant C should be decomposed
// into shift+add/sub forms; only fires for constant-splat vectors whose
// legalized element type makes a real MUL unattractive.
3316 SDValue C) const {
3317 // TODO: We handle scalars using custom code, but generic combining could make
3318 // that unnecessary.
3319 APInt MulC;
3320 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3321 return false;
3322
3323 // Find the type this will be legalized to. Otherwise we might prematurely
3324 // convert this to shl+add/sub and then still have to type legalize those ops.
3325 // Another choice would be to defer the decision for illegal types until
3326 // after type legalization. But constant splat vectors of i64 can't make it
3327 // through type legalization on 32-bit targets so we would need to special
3328 // case vXi64.
3329 while (getTypeAction(Context, VT) != TypeLegal)
3330 VT = getTypeToTransformTo(Context, VT);
3331
3332 // If vector multiply is legal, assume that's faster than shl + add/sub.
3333 // Multiply is a complex op with higher latency and lower throughput in
3334 // most implementations, sub-vXi32 vector multiplies are always fast,
3335 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3336 // is always going to be slow.
3337 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3338 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3339 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3340 return false;
3341
3342 // shl+add, shl+sub, shl+add+neg
3343 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3344 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3345}
3346
// NOTE(review): declaration line missing in this view (extraction artifact).
// From the body: decides whether extracting a subvector (ResVT from SrcVT at
// Index) is cheap — mask (i1) vectors allow index 0 or the upper half; other
// vectors require a subregister-aligned index.
3348 unsigned Index) const {
3350 return false;
3351
3352 // Mask vectors support all subregister combinations and operations that
3353 // extract half of vector.
3354 if (ResVT.getVectorElementType() == MVT::i1)
3355 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3356 (Index == ResVT.getVectorNumElements()));
3357
// Cheap when the extract starts on a ResVT-sized boundary.
3358 return (Index % ResVT.getVectorNumElements()) == 0;
3359}
3360
// NOTE(review): declaration line missing in this view (extraction artifact).
// Decides whether a binop on an extracted element should be scalarized:
// never for target opcodes, yes if the vector op is unsupported, and
// otherwise only when the scalar op is also supported.
3362 unsigned Opc = VecOp.getOpcode();
3363
3364 // Assume target opcodes can't be scalarized.
3365 // TODO - do we have any exceptions?
3366 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3367 return false;
3368
3369 // If the vector op is not supported, try to convert to scalar.
3370 EVT VecVT = VecOp.getValueType();
3371 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3372 return true;
3373
3374 // If the vector op is supported, but the scalar op is not, the transform may
3375 // not be worthwhile.
3376 EVT ScalarVT = VecVT.getScalarType();
3377 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3378}
3379
// NOTE(review): the declaration line (including this hook's name) is missing
// in this view (extraction artifact). From the body: rejects vectors, and
// otherwise accepts simple VTs or ops that are not Expand for Opcode.
3381 bool) const {
3382 // TODO: Allow vectors?
3383 if (VT.isVector())
3384 return false;
3385 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3386}
3387
// NOTE(review): declaration line missing in this view (extraction artifact);
// from the comment this is the cttz speculation hook.
3389 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3390 // i32/i64 or can rely on BSF passthrough value.
3391 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3392 Subtarget.hasBitScanPassThrough() ||
3393 (!Ty->isVectorTy() &&
3394 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3395}
3396
// NOTE(review): declaration line missing in this view (extraction artifact);
// from the comment this is the ctlz speculation hook.
3398 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3399 // passthrough value.
3400 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3401 Subtarget.hasBitScanPassThrough();
3402}
3403
// NOTE(review): declaration line missing in this view (extraction artifact).
// Hook controlling constant-pool FP shrinking.
3405 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3406 // expensive than a straight movsd. On the other hand, it's important to
3407 // shrink long double fp constant since fldt is very slow.
3408 return !Subtarget.hasSSE2() || VT == MVT::f80;
3409}
3410
// NOTE(review): declaration line missing in this view (extraction artifact).
// True for scalar FP types that live in SSE registers: f64 with SSE2, f32
// with SSE1, and f16 unconditionally.
3412 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3413 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3414}
3415
// NOTE(review): declaration line missing in this view (extraction artifact).
// Hook deciding whether loading as BitcastVT instead of LoadVT is
// beneficial; rejects scalar-to-vXi1 casts without AVX512 and i8->v8i1
// without DQI, accepts legal-vector-to-legal-vector casts, otherwise defers
// to the base implementation.
3417 const SelectionDAG &DAG,
3418 const MachineMemOperand &MMO) const {
3419 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3420 BitcastVT.getVectorElementType() == MVT::i1)
3421 return false;
3422
3423 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3424 return false;
3425
3426 // If both types are legal vectors, it's always ok to convert them.
3427 if (LoadVT.isVector() && BitcastVT.isVector() &&
3428 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3429 return true;
3430
3431 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3432}
3433
// NOTE(review): declaration line missing in this view (extraction artifact).
// Hook bounding merged-store width: integer-register width under
// NoImplicitFloat, otherwise the subtarget's preferred vector width.
3435 const MachineFunction &MF) const {
3436 // Do not merge to float value size (128 bytes) if no implicit
3437 // float attribute is set.
3438 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3439
3440 if (NoFloat) {
3441 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3442 return (MemVT.getSizeInBits() <= MaxIntSize);
3443 }
3444 // Make sure we don't merge greater than our preferred vector
3445 // width.
3446 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3447 return false;
3448
3449 return true;
3450}
3451
// NOTE(review): declaration line missing in this view (extraction artifact).
// Simply forwards the subtarget's fast-LZCNT feature bit.
3453 return Subtarget.hasFastLZCNT();
3454}
3455
// NOTE(review): declaration line missing in this view (extraction artifact);
// takes an Instruction (AndI) and unconditionally returns true.
3457 const Instruction &AndI) const {
3458 return true;
3459}
3460
// NOTE(review): declaration line missing in this view (extraction artifact).
// From the body: scalar-only and-not availability check — requires BMI and
// an i32/i64 type, and rejects non-opaque constants (which fold better as
// plain immediates).
3462 EVT VT = Y.getValueType();
3463
3464 if (VT.isVector())
3465 return false;
3466
3467 if (!Subtarget.hasBMI())
3468 return false;
3469
3470 // There are only 32-bit and 64-bit forms for 'andn'.
3471 if (VT != MVT::i32 && VT != MVT::i64)
3472 return false;
3473
3474 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3475}
3476
// NOTE(review): declaration line missing in this view (extraction artifact).
// Vector-aware and-not check: scalars defer to hasAndNotCompare; vectors
// need SSE and at least 128 bits (v4i32 with SSE1, anything else SSE2).
3478 EVT VT = Y.getValueType();
3479
3480 if (!VT.isVector())
3481 return hasAndNotCompare(Y);
3482
3483 // Vector.
3484
3485 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3486 return false;
3487
3488 if (VT == MVT::v4i32)
3489 return true;
3490
3491 return Subtarget.hasSSE2();
3492}
3493
// NOTE(review): declaration line missing in this view (extraction artifact).
// Scalar integers can use the 'bt' instruction.
3495 return X.getValueType().isScalarInteger(); // 'bt'
3496}
3497
// NOTE(review): the declaration lines (including this hook's name) are
// missing in this view (extraction artifact). From the body: decides whether
// to transform between shift opcodes (OldShiftOpcode -> NewShiftOpcode) when
// combined with a constant mask, after consulting the base-class default.
3501 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3502 SelectionDAG &DAG) const {
3503 // Does baseline recommend not to perform the fold by default?
3505 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)
3506 return false;
3507 // For scalars this transform is always beneficial.
3508 if (X.getValueType().isScalarInteger())
3509 return true;
3510 // If all the shift amounts are identical, then transform is beneficial even
3511 // with rudimentary SSE2 shifts.
3512 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3513 return true;
3514 // If we have AVX2 with it's powerful shift operations, then it's also good.
3515 if (Subtarget.hasAVX2())
3516 return true;
3517 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3518 return NewShiftOpcode == ISD::SHL;
3519}
3520
// NOTE(review): declaration line missing in this view (extraction artifact).
// Chooses the preferred opcode (shift vs rotate, shl vs srl) when splitting
// an equality compare of pieces of an operand, based on rotate availability
// and how efficiently the resulting immediate mask encodes.
3522 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3523 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3524 if (!VT.isInteger())
3525 return ShiftOpc;
3526
3527 bool PreferRotate = false;
3528 if (VT.isVector()) {
3529 // For vectors, if we have rotate instruction support, then it's definitely
3530 // best. Otherwise it's not clear what is best, so just don't make changes.
3531 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3532 VT.getScalarType() == MVT::i64);
3533 } else {
3534 // For scalar, if we have bmi prefer rotate for rorx. Otherwise prefer
3535 // rotate unless we have a zext mask+shr.
3536 PreferRotate = Subtarget.hasBMI2();
3537 if (!PreferRotate) {
3538 unsigned MaskBits =
3539 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
// 8/16/32-bit masks correspond to free zero-extends, so keep the shift.
3540 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3541 }
3542 }
3543
3544 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3545 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3546
3547 if (PreferRotate && MayTransformRotate)
3548 return ISD::ROTL;
3549
3550 // If vector we don't really get much benefit swapping around constants.
3551 // Maybe we could check if the DAG has the flipped node already in the
3552 // future.
3553 if (VT.isVector())
3554 return ShiftOpc;
3555
3556 // See if the beneficial to swap shift type.
3557 if (ShiftOpc == ISD::SHL) {
3558 // If the current setup has imm64 mask, then inverse will have
3559 // at least imm32 mask (or be zext i32 -> i64).
3560 if (VT == MVT::i64)
3561 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3562 : ShiftOpc;
3563
3564 // We can only benefit if req at least 7-bit for the mask. We
3565 // don't want to replace shl of 1,2,3 as they can be implemented
3566 // with lea/add.
3567 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3568 }
3569
3570 if (VT == MVT::i64)
3571 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3572 // extremely efficient.
3573 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3574
3575 // Keep small shifts as shl so we can generate add/lea.
3576 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3577 }
3578
3579 // We prefer rotate for vectors of if we won't get a zext mask with SRL
3580 // (PreferRotate will be set in the latter case).
3581 if (PreferRotate || !MayTransformRotate || VT.isVector())
3582 return ShiftOpc;
3583
3584 // Non-vector type and we have a zext mask with SRL.
3585 return ISD::SRL;
3586}
3587
// NOTE(review): declaration lines missing in this view (extraction artifact).
// Builds the cost/bias triple that controls merging of conditional branches,
// starting from the x86-br-merging-base-cost option and adjusting for CCMP
// and the fast "a == b && a == c" pattern.
3590 const Value *Lhs,
3591 const Value *Rhs) const {
3592 using namespace llvm::PatternMatch;
3593 int BaseCost = BrMergingBaseCostThresh.getValue();
3594 // With CCMP, branches can be merged in a more efficient way.
3595 if (BaseCost >= 0 && Subtarget.hasCCMP())
3596 BaseCost += BrMergingCcmpBias;
3597 // a == b && a == c is a fast pattern on x86.
// NOTE(review): the pattern-match condition lines here were dropped by the
// extraction; only the cost adjustment remains visible.
3598 if (BaseCost >= 0 && Opc == Instruction::And &&
3601 BaseCost += 1;
3602 return {BaseCost, BrMergingLikelyBias.getValue(),
3603 BrMergingUnlikelyBias.getValue()};
3604}
3605
// NOTE(review): declaration line missing in this view (extraction artifact).
// Returns true for every node except FP_EXTEND.
3607 return N->getOpcode() != ISD::FP_EXTEND;
3608}
3609
// NOTE(review): the declaration line and the trailing base-class fallback
// return (line 3626) are missing in this view (extraction artifact).
// Decides whether a shl/srl pair should fold to an AND mask when the
// subtarget has fast shift-mask patterns and both shift amounts match.
3611 const SDNode *N, CombineLevel Level) const {
3612 assert(((N->getOpcode() == ISD::SHL &&
3613 N->getOperand(0).getOpcode() == ISD::SRL) ||
3614 (N->getOpcode() == ISD::SRL &&
3615 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3616 "Expected shift-shift mask");
3617 // TODO: Should we always create i64 masks? Or only folded immediates?
3618 EVT VT = N->getValueType(0);
3619 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3620 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3621 // Only fold if the shift values are equal - so it folds to AND.
3622 // TODO - we should fold if either is a non-uniform vector but we don't do
3623 // the fold for non-splats yet.
3624 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3625 }
3627}
3628
// NOTE(review): declaration line missing in this view (extraction artifact).
// Allows folding a mask into a variable shift pair for scalar types, except
// i64 on 32-bit targets where the expansion bloats code.
3630 EVT VT = Y.getValueType();
3631
3632 // For vectors, we don't have a preference, but we probably want a mask.
3633 if (VT.isVector())
3634 return false;
3635
3636 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3637 if (VT == MVT::i64 && !Subtarget.is64Bit())
3638 return false;
3639
3640 return true;
3641}
3642
// NOTE(review): heavily garbled by the extraction — the declaration and most
// condition/return tokens are gone. The visible shape (DAG, N,
// ExpansionFactor; a condition involving !Subtarget.isOSWindows(); a
// fallthrough call forwarding ExpansionFactor) suggests the wide-shift
// legalization strategy hook deferring to the base class — confirm upstream.
3645 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3647 !Subtarget.isOSWindows())
3650 ExpansionFactor);
3651}
3652
// NOTE(review): declaration line missing in this view (extraction artifact).
3654 // Any legal vector type can be splatted more efficiently than
3655 // loading/spilling from memory.
3656 return isTypeLegal(VT);
3657}
3658
// NOTE(review): the declaration line and the final fallback return (line
// 3676) are missing in this view (extraction artifact). Picks the widest
// type usable for a vector-sized equality compare of NumBits: a legal
// integer type, else v16i8/v32i8 via (V)PMOVMSKB.
3660 MVT VT = MVT::getIntegerVT(NumBits);
3661 if (isTypeLegal(VT))
3662 return VT;
3663
3664 // PMOVMSKB can handle this.
3665 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3666 return MVT::v16i8;
3667
3668 // VPMOVMSKB can handle this.
3669 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3670 return MVT::v32i8;
3671
3672 // TODO: Allow 64-bit type for 32-bit target.
3673 // TODO: 512-bit types should be allowed, but make sure that those
3674 // cases are handled in combineVectorSizedSetCCEquality().
3677}
3678
3679/// Val is the undef sentinel value or equal to the specified value.
3680static bool isUndefOrEqual(int Val, int CmpVal) {
3681 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3682}
3683
3684/// Return true if every element in Mask is the undef sentinel value or equal to
3685/// the specified value.
3686static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3687 return llvm::all_of(Mask, [CmpVal](int M) {
3688 return (M == SM_SentinelUndef) || (M == CmpVal);
3689 });
3690}
3691
3692/// Return true if every element in Mask, beginning from position Pos and ending
3693/// in Pos+Size is the undef sentinel value or equal to the specified value.
3694static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3695 unsigned Size) {
3696 return llvm::all_of(Mask.slice(Pos, Size),
3697 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3698}
3699
3700/// Val is either the undef or zero sentinel value.
3701static bool isUndefOrZero(int Val) {
3702 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3703}
3704
3705/// Return true if every element in Mask, beginning from position Pos and ending
3706/// in Pos+Size is the undef sentinel value.
3707static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3708 return llvm::all_of(Mask.slice(Pos, Size),
3709 [](int M) { return M == SM_SentinelUndef; });
3710}
3711
3712/// Return true if the mask creates a vector whose lower half is undefined.
// NOTE(review): declaration line missing in this view (extraction artifact).
3714 unsigned NumElts = Mask.size();
3715 return isUndefInRange(Mask, 0, NumElts / 2);
3716}
3717
3718/// Return true if the mask creates a vector whose upper half is undefined.
// NOTE(review): declaration line missing in this view (extraction artifact).
3720 unsigned NumElts = Mask.size();
3721 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3722}
3723
/// Return true if Val falls within the half-open range [Low, Hi).
/// (The previous comment said "(L, H]", which contradicted the code: the
/// lower bound is inclusive and the upper bound exclusive.)
static bool isInRange(int Val, int Low, int Hi) {
  return (Val >= Low && Val < Hi);
}
3728
3729/// Return true if the value of any element in Mask falls within the specified
3730/// range (L, H].
3731static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3732 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3733}
3734
3735/// Return true if the value of any element in Mask is the zero sentinel value.
3736static bool isAnyZero(ArrayRef<int> Mask) {
3737 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3738}
3739
3740/// Return true if Val is undef or if its value falls within the
3741/// specified range (L, H].
3742static bool isUndefOrInRange(int Val, int Low, int Hi) {
3743 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3744}
3745
3746/// Return true if every element in Mask is undef or if its value
3747/// falls within the specified range (L, H].
3748static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3749 return llvm::all_of(
3750 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3751}
3752
3753/// Return true if Val is undef, zero or if its value falls within the
3754/// specified range (L, H].
3755static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3756 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3757}
3758
3759/// Return true if every element in Mask is undef, zero or if its value
3760/// falls within the specified range (L, H].
3761static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3762 return llvm::all_of(
3763 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3764}
3765
3766/// Return true if every element in Mask, is an in-place blend/select mask or is
3767/// undef.
// NOTE(review): declaration line missing in this view (extraction artifact).
// Element I may pick lane I from either source (I or I + NumElts) or be
// undef.
3769 unsigned NumElts = Mask.size();
3770 for (auto [I, M] : enumerate(Mask))
3771 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3772 return false;
3773 return true;
3774}
3775
3776/// Return true if every element in Mask, beginning
3777/// from position Pos and ending in Pos + Size, falls within the specified
3778/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3779static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3780 unsigned Size, int Low, int Step = 1) {
3781 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3782 if (!isUndefOrEqual(Mask[i], Low))
3783 return false;
3784 return true;
3785}
3786
3787/// Return true if every element in Mask, beginning
3788/// from position Pos and ending in Pos+Size, falls within the specified
3789/// sequential range (Low, Low+Size], or is undef or is zero.
// NOTE(review): the first declaration line is missing in this view
// (extraction artifact); remaining parameters are Size, Low, Step.
3791 unsigned Size, int Low,
3792 int Step = 1) {
3793 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3794 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3795 return false;
3796 return true;
3797}
3798
3799/// Return true if every element in Mask, beginning
3800/// from position Pos and ending in Pos+Size is undef or is zero.
3801static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3802 unsigned Size) {
3803 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3804}
3805
3806/// Return true if every element of a single input is referenced by the shuffle
3807/// mask. i.e. it just permutes them all.
// NOTE(review): declaration line missing in this view (extraction artifact).
// Collects every in-range mask index into a demanded-bits set and checks the
// set is complete.
3809 unsigned NumElts = Mask.size();
3810 APInt DemandedElts = APInt::getZero(NumElts);
3811 for (int M : Mask)
3812 if (isInRange(M, 0, NumElts))
3813 DemandedElts.setBit(M);
3814 return DemandedElts.isAllOnes();
3815}
3816
3817/// Helper function to test whether a shuffle mask could be
3818/// simplified by widening the elements being shuffled.
3819///
3820/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3821/// leaves it in an unspecified state.
3822///
3823/// NOTE: This must handle normal vector shuffle masks and *target* vector
3824/// shuffle masks. The latter have the special property of a '-2' representing
3825/// a zero-ed lane of a vector.
// NOTE(review): the first declaration line is missing in this view
// (extraction artifact); the visible tail shows the WidenedMask out-param.
3827 SmallVectorImpl<int> &WidenedMask) {
3828 WidenedMask.assign(Mask.size() / 2, 0);
// Examine adjacent element pairs (M0, M1) and try to fold each pair into a
// single wide-element mask value.
3829 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3830 int M0 = Mask[i];
3831 int M1 = Mask[i + 1];
3832
3833 // If both elements are undef, its trivial.
3834 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3835 WidenedMask[i / 2] = SM_SentinelUndef;
3836 continue;
3837 }
3838
3839 // Check for an undef mask and a mask value properly aligned to fit with
3840 // a pair of values. If we find such a case, use the non-undef mask's value.
3841 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3842 WidenedMask[i / 2] = M1 / 2;
3843 continue;
3844 }
3845 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3846 WidenedMask[i / 2] = M0 / 2;
3847 continue;
3848 }
3849
3850 // When zeroing, we need to spread the zeroing across both lanes to widen.
3851 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
// NOTE(review): the second half of this condition (covering M1, line 3853)
// was dropped by the extraction.
3852 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3854 WidenedMask[i / 2] = SM_SentinelZero;
3855 continue;
3856 }
3857 return false;
3858 }
3859
3860 // Finally check if the two mask values are adjacent and aligned with
3861 // a pair.
3862 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3863 WidenedMask[i / 2] = M0 / 2;
3864 continue;
3865 }
3866
3867 // Otherwise we can't safely widen the elements used in this shuffle.
3868 return false;
3869 }
3870 assert(WidenedMask.size() == Mask.size() / 2 &&
3871 "Incorrect size of mask after widening the elements!");
3872
3873 return true;
3874}
3875
3877 const APInt &Zeroable,
3878 bool V2IsZero,
3879 SmallVectorImpl<int> &WidenedMask) {
3880 // Create an alternative mask with info about zeroable elements.
3881 // Here we do not set undef elements as zeroable.
3882 SmallVector<int, 64> ZeroableMask(Mask);
3883 if (V2IsZero) {
3884 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3885 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3886 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3887 ZeroableMask[i] = SM_SentinelZero;
3888 }
3889 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3890}
3891
3893 SmallVector<int, 32> WidenedMask;
3894 return canWidenShuffleElements(Mask, WidenedMask);
3895}
3896
// Attempt to narrow/widen shuffle mask until it matches the target number of
// elements. Returns true and fills ScaledMask on success; on failure
// ScaledMask is left in an unspecified state.
static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
                                 SmallVectorImpl<int> &ScaledMask) {
  unsigned NumSrcElts = Mask.size();
  // The two element counts must be related by an integer factor.
  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
         "Illegal shuffle scale factor");

  // Narrowing is guaranteed to work.
  if (NumDstElts >= NumSrcElts) {
    int Scale = NumDstElts / NumSrcElts;
    llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
    return true;
  }

  // We have to repeat the widening until we reach the target size, but we can
  // split out the first widening as it sets up ScaledMask for us.
  if (canWidenShuffleElements(Mask, ScaledMask)) {
    // Each iteration halves the mask size; any step may fail if adjacent
    // elements can't be paired.
    while (ScaledMask.size() > NumDstElts) {
      SmallVector<int, 16> WidenedMask;
      if (!canWidenShuffleElements(ScaledMask, WidenedMask))
        return false;
      ScaledMask = std::move(WidenedMask);
    }
    return true;
  }

  return false;
}
3926
// Returns true if the mask can be narrowed/widened to NumDstElts elements
// (the scaled mask itself is discarded).
static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
  SmallVector<int, 32> ScaledMask;
  return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
}
3931
3932/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3934 return isNullConstant(Elt) || isNullFPConstant(Elt);
3935}
3936
3937// Build a vector of constants.
3938// Use an UNDEF node if MaskElt == -1.
3939// Split 64-bit constants in the 32-bit mode.
3941 const SDLoc &dl, bool IsMask = false) {
3942
3944 bool Split = false;
3945
3946 MVT ConstVecVT = VT;
3947 unsigned NumElts = VT.getVectorNumElements();
3948 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3949 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3950 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3951 Split = true;
3952 }
3953
3954 MVT EltVT = ConstVecVT.getVectorElementType();
3955 for (unsigned i = 0; i < NumElts; ++i) {
3956 bool IsUndef = Values[i] < 0 && IsMask;
3957 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3958 DAG.getConstant(Values[i], dl, EltVT);
3959 Ops.push_back(OpNode);
3960 if (Split)
3961 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3962 DAG.getConstant(0, dl, EltVT));
3963 }
3964 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3965 if (Split)
3966 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3967 return ConstsNode;
3968}
3969
3970static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3971 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3972 assert(Bits.size() == Undefs.getBitWidth() &&
3973 "Unequal constant and undef arrays");
3975 bool Split = false;
3976
3977 MVT ConstVecVT = VT;
3978 unsigned NumElts = VT.getVectorNumElements();
3979 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3980 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3981 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3982 Split = true;
3983 }
3984
3985 MVT EltVT = ConstVecVT.getVectorElementType();
3986 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3987 if (Undefs[i]) {
3988 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3989 continue;
3990 }
3991 const APInt &V = Bits[i];
3992 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3993 if (Split) {
3994 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3995 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3996 } else if (EltVT == MVT::f32) {
3998 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3999 } else if (EltVT == MVT::f64) {
4001 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4002 } else {
4003 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4004 }
4005 }
4006
4007 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4008 return DAG.getBitcast(VT, ConstsNode);
4009}
4010
4012 SelectionDAG &DAG, const SDLoc &dl) {
4013 APInt Undefs = APInt::getZero(Bits.size());
4014 return getConstVector(Bits, Undefs, VT, DAG, dl);
4015}
4016
4017/// Returns a vector of specified type with all zero elements.
4018static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4019 SelectionDAG &DAG, const SDLoc &dl) {
4020 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4021 VT.getVectorElementType() == MVT::i1) &&
4022 "Unexpected vector type");
4023
4024 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4025 // type. This ensures they get CSE'd. But if the integer type is not
4026 // available, use a floating-point +0.0 instead.
4027 SDValue Vec;
4028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4029 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4030 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4031 } else if (VT.isFloatingPoint() &&
4033 Vec = DAG.getConstantFP(+0.0, dl, VT);
4034 } else if (VT.getVectorElementType() == MVT::i1) {
4035 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4036 "Unexpected vector type");
4037 Vec = DAG.getConstant(0, dl, VT);
4038 } else {
4039 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4040 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4041 }
4042 return DAG.getBitcast(VT, Vec);
4043}
4044
// Helper to determine if the ops are all the extracted subvectors come from a
// single source. If we allow commute they don't have to be in order (Lo/Hi).
// Returns the common source vector, or SDValue() on mismatch.
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
  // Both operands must extract same-typed subvectors from the same source.
  if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      LHS.getValueType() != RHS.getValueType() ||
      LHS.getOperand(0) != RHS.getOperand(0))
    return SDValue();

  // The source must be exactly twice the subvector width (a clean Lo/Hi split).
  SDValue Src = LHS.getOperand(0);
  if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
    return SDValue();

  // Check the extraction indices: (0, NumElts), or the commuted pair.
  unsigned NumElts = LHS.getValueType().getVectorNumElements();
  if ((LHS.getConstantOperandAPInt(1) == 0 &&
       RHS.getConstantOperandAPInt(1) == NumElts) ||
      (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
       LHS.getConstantOperandAPInt(1) == NumElts))
    return Src;

  return SDValue();
}
4067
// Extract a vectorWidth-bit chunk from Vec starting at (an aligned-down copy
// of) element IdxVal, folding BUILD_VECTOR and widening-insert patterns.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits() / vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements() / Factor);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(ResultVT, dl,
                              Vec->ops().slice(IdxVal, ElemsPerChunk));

  // Check if we're extracting the upper undef of a widening pattern.
  if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
      Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
      isNullConstant(Vec.getOperand(2)))
    return DAG.getUNDEF(ResultVT);

  SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
4098
4099/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4100/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4101/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4102/// instructions or a simple subregister reference. Idx is an index in the
4103/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4104/// lowering EXTRACT_VECTOR_ELT operations easier.
4105static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4106 SelectionDAG &DAG, const SDLoc &dl) {
4108 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4109 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4110}
4111
/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
4118
// Insert Vec (a vectorWidth-bit subvector) into Result at (an aligned-down
// copy of) element index IdxVal.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result
  if (Vec.isUndef())
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
4142
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
4154
4155/// Widen a vector to a larger size with the same scalar type, with the new
4156/// elements either zero or undef.
4157static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4158 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4159 const SDLoc &dl) {
4160 EVT VecVT = Vec.getValueType();
4162 VecVT.getScalarType() == VT.getScalarType() &&
4163 "Unsupported vector widening type");
4164 // If the upper 128-bits of a build vector are already undef/zero, then try to
4165 // widen from the lower 128-bits.
4166 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4167 unsigned NumSrcElts = VecVT.getVectorNumElements();
4168 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4169 if (all_of(Hi, [&](SDValue V) {
4170 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4171 }))
4172 Vec = extract128BitVector(Vec, 0, DAG, dl);
4173 }
4174 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4175 : DAG.getUNDEF(VT);
4176 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
4177 DAG.getVectorIdxConstant(0, dl));
4178}
4179
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef. The target width is given in bits.
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl, unsigned WideSizeInBits) {
  assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
         "Unsupported vector widening type");
  // Compute the widened type with the same scalar type, then delegate.
  unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
  MVT SVT = Vec.getSimpleValueType().getScalarType();
  MVT VT = MVT::getVectorVT(SVT, WideNumElts);
  return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
4193
4194/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4195/// and bitcast with integer types.
4196static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4197 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4198 unsigned NumElts = VT.getVectorNumElements();
4199 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4200 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4201 return VT;
4202}
4203
/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
/// bitcast with integer types.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
                               const SDLoc &dl) {
  // Pick the widened i1 type, then reuse the generic subvector widener.
  MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
  return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
4212
4213// Helper function to collect subvector ops that are concatenated together,
4214// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
4215// The subvectors in Ops are guaranteed to be the same type.
4217 SelectionDAG &DAG) {
4218 assert(Ops.empty() && "Expected an empty ops vector");
4219
4220 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4221 Ops.append(N->op_begin(), N->op_end());
4222 return true;
4223 }
4224
4225 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4226 SDValue Src = N->getOperand(0);
4227 SDValue Sub = N->getOperand(1);
4228 const APInt &Idx = N->getConstantOperandAPInt(2);
4229 EVT VT = Src.getValueType();
4230 EVT SubVT = Sub.getValueType();
4231
4232 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4233 // insert_subvector(undef, x, lo)
4234 if (Idx == 0 && Src.isUndef()) {
4235 Ops.push_back(Sub);
4236 Ops.push_back(DAG.getUNDEF(SubVT));
4237 return true;
4238 }
4239 if (Idx == (VT.getVectorNumElements() / 2)) {
4240 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4241 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4242 Src.getOperand(1).getValueType() == SubVT &&
4243 isNullConstant(Src.getOperand(2))) {
4244 // Attempt to recurse into inner (matching) concats.
4245 SDValue Lo = Src.getOperand(1);
4246 SDValue Hi = Sub;
4247 SmallVector<SDValue, 2> LoOps, HiOps;
4248 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4249 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4250 LoOps.size() == HiOps.size()) {
4251 Ops.append(LoOps);
4252 Ops.append(HiOps);
4253 return true;
4254 }
4255 Ops.push_back(Lo);
4256 Ops.push_back(Hi);
4257 return true;
4258 }
4259 // insert_subvector(x, extract_subvector(x, lo), hi)
4260 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4261 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4262 Ops.append(2, Sub);
4263 return true;
4264 }
4265 // insert_subvector(undef, x, hi)
4266 if (Src.isUndef()) {
4267 Ops.push_back(DAG.getUNDEF(SubVT));
4268 Ops.push_back(Sub);
4269 return true;
4270 }
4271 }
4272 }
4273 }
4274
4275 return false;
4276}
4277
4278// Helper to check if \p V can be split into subvectors and the upper subvectors
4279// are all undef. In which case return the lower subvector.
4281 SelectionDAG &DAG) {
4282 SmallVector<SDValue> SubOps;
4283 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4284 return SDValue();
4285
4286 unsigned NumSubOps = SubOps.size();
4287 unsigned HalfNumSubOps = NumSubOps / 2;
4288 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4289
4290 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4291 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4292 return SDValue();
4293
4294 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4295 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4296 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4297}
4298
4299// Helper to check if we can access all the constituent subvectors without any
4300// extract ops.
4303 return collectConcatOps(N, Ops, DAG);
4304}
4305
// Split Op into its lower and upper halves, returned as (Lo, Hi).
static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
                                               const SDLoc &dl) {
  EVT VT = Op.getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
         "Can't split odd sized vector");

  // If this is a splat value (with no-undefs) then use the lower subvector,
  // which should be a free extraction.
  SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
  if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
    return std::make_pair(Lo, Lo);

  SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
  return std::make_pair(Lo, Hi);
}
4323
4324/// Break an operation into 2 half sized ops and then concatenate the results.
4326 unsigned NumOps = Op.getNumOperands();
4327 EVT VT = Op.getValueType();
4328
4329 // Extract the LHS Lo/Hi vectors
4330 SmallVector<SDValue> LoOps(NumOps, SDValue());
4331 SmallVector<SDValue> HiOps(NumOps, SDValue());
4332 for (unsigned I = 0; I != NumOps; ++I) {
4333 SDValue SrcOp = Op.getOperand(I);
4334 if (!SrcOp.getValueType().isVector()) {
4335 LoOps[I] = HiOps[I] = SrcOp;
4336 continue;
4337 }
4338 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4339 }
4340
4341 EVT LoVT, HiVT;
4342 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4343 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4344 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4345 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4346}
4347
4348/// Break an unary integer operation into 2 half sized ops and then
4349/// concatenate the result back.
4351 const SDLoc &dl) {
4352 // Make sure we only try to split 256/512-bit types to avoid creating
4353 // narrow vectors.
4354 [[maybe_unused]] EVT VT = Op.getValueType();
4355 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4356 Op.getOperand(0).getValueType().is512BitVector()) &&
4357 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4358 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4359 VT.getVectorNumElements() &&
4360 "Unexpected VTs!");
4361 return splitVectorOp(Op, DAG, dl);
4362}
4363
4364/// Break a binary integer operation into 2 half sized ops and then
4365/// concatenate the result back.
4367 const SDLoc &dl) {
4368 // Assert that all the types match.
4369 [[maybe_unused]] EVT VT = Op.getValueType();
4370 assert(Op.getOperand(0).getValueType() == VT &&
4371 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4372 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4373 return splitVectorOp(Op, DAG, dl);
4374}
4375
4376// Helper for splitting operands of an operation to legal target size and
4377// apply a function on each part.
4378// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4379// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4380// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4381// The argument Builder is a function that will be applied on each split part:
4382// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4383template <typename F>
4385 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4386 F Builder, bool CheckBWI = true) {
4387 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4388 unsigned NumSubs = 1;
4389 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4390 (!CheckBWI && Subtarget.useAVX512Regs())) {
4391 if (VT.getSizeInBits() > 512) {
4392 NumSubs = VT.getSizeInBits() / 512;
4393 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4394 }
4395 } else if (Subtarget.hasAVX2()) {
4396 if (VT.getSizeInBits() > 256) {
4397 NumSubs = VT.getSizeInBits() / 256;
4398 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4399 }
4400 } else {
4401 if (VT.getSizeInBits() > 128) {
4402 NumSubs = VT.getSizeInBits() / 128;
4403 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4404 }
4405 }
4406
4407 if (NumSubs == 1)
4408 return Builder(DAG, DL, Ops);
4409
4411 for (unsigned i = 0; i != NumSubs; ++i) {
4413 for (SDValue Op : Ops) {
4414 EVT OpVT = Op.getValueType();
4415 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4416 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4417 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4418 }
4419 Subs.push_back(Builder(DAG, DL, SubOps));
4420 }
4421 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4422}
4423
4424// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4425// targets.
4426static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4428 const X86Subtarget &Subtarget) {
4429 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4430 MVT SVT = VT.getScalarType();
4431
4432 // If we have a 32/64 splatted constant, splat it to DstTy to
4433 // encourage a foldable broadcast'd operand.
4434 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4435 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4436 // AVX512 broadcasts 32/64-bit operands.
4437 // TODO: Support float once getAVX512Node is used by fp-ops.
4438 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4440 return SDValue();
4441 // If we're not widening, don't bother if we're not bitcasting.
4442 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4443 return SDValue();
4444 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4445 APInt SplatValue, SplatUndef;
4446 unsigned SplatBitSize;
4447 bool HasAnyUndefs;
4448 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4449 HasAnyUndefs, OpEltSizeInBits) &&
4450 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4451 return DAG.getConstant(SplatValue, DL, DstVT);
4452 }
4453 return SDValue();
4454 };
4455
4456 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4457
4458 MVT DstVT = VT;
4459 if (Widen)
4460 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4461
4462 // Canonicalize src operands.
4463 SmallVector<SDValue> SrcOps(Ops);
4464 for (SDValue &Op : SrcOps) {
4465 MVT OpVT = Op.getSimpleValueType();
4466 // Just pass through scalar operands.
4467 if (!OpVT.isVector())
4468 continue;
4469 assert(OpVT == VT && "Vector type mismatch");
4470
4471 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4472 Op = BroadcastOp;
4473 continue;
4474 }
4475
4476 // Just widen the subvector by inserting into an undef wide vector.
4477 if (Widen)
4478 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4479 }
4480
4481 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4482
4483 // Perform the 512-bit op then extract the bottom subvector.
4484 if (Widen)
4485 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4486 return Res;
4487}
4488
4489/// Insert i1-subvector to i1-vector.
4491 const X86Subtarget &Subtarget) {
4492
4493 SDLoc dl(Op);
4494 SDValue Vec = Op.getOperand(0);
4495 SDValue SubVec = Op.getOperand(1);
4496 SDValue Idx = Op.getOperand(2);
4497 unsigned IdxVal = Op.getConstantOperandVal(2);
4498
4499 // Inserting undef is a nop. We can just return the original vector.
4500 if (SubVec.isUndef())
4501 return Vec;
4502
4503 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4504 return Op;
4505
4506 MVT OpVT = Op.getSimpleValueType();
4507 unsigned NumElems = OpVT.getVectorNumElements();
4508 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4509
4510 // Extend to natively supported kshift.
4511 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4512
4513 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4514 // if necessary.
4515 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4516 // May need to promote to a legal type.
4517 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4518 DAG.getConstant(0, dl, WideOpVT),
4519 SubVec, Idx);
4520 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4521 }
4522
4523 MVT SubVecVT = SubVec.getSimpleValueType();
4524 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4525 assert(IdxVal + SubVecNumElems <= NumElems &&
4526 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4527 "Unexpected index value in INSERT_SUBVECTOR");
4528
4529 SDValue Undef = DAG.getUNDEF(WideOpVT);
4530
4531 if (IdxVal == 0) {
4532 // Zero lower bits of the Vec
4533 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4534 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4535 ZeroIdx);
4536 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4537 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4538 // Merge them together, SubVec should be zero extended.
4539 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4540 DAG.getConstant(0, dl, WideOpVT),
4541 SubVec, ZeroIdx);
4542 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4543 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4544 }
4545
4546 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4547 Undef, SubVec, ZeroIdx);
4548
4549 if (Vec.isUndef()) {
4550 assert(IdxVal != 0 && "Unexpected index");
4551 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4552 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4553 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4554 }
4555
4557 assert(IdxVal != 0 && "Unexpected index");
4558 // If upper elements of Vec are known undef, then just shift into place.
4559 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4560 [](SDValue V) { return V.isUndef(); })) {
4561 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4562 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4563 } else {
4564 NumElems = WideOpVT.getVectorNumElements();
4565 unsigned ShiftLeft = NumElems - SubVecNumElems;
4566 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4567 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4568 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4569 if (ShiftRight != 0)
4570 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4571 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4572 }
4573 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4574 }
4575
4576 // Simple case when we put subvector in the upper part
4577 if (IdxVal + SubVecNumElems == NumElems) {
4578 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4579 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4580 if (SubVecNumElems * 2 == NumElems) {
4581 // Special case, use legal zero extending insert_subvector. This allows
4582 // isel to optimize when bits are known zero.
4583 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4584 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4585 DAG.getConstant(0, dl, WideOpVT),
4586 Vec, ZeroIdx);
4587 } else {
4588 // Otherwise use explicit shifts to zero the bits.
4589 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4590 Undef, Vec, ZeroIdx);
4591 NumElems = WideOpVT.getVectorNumElements();
4592 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4593 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4594 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4595 }
4596 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4597 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4598 }
4599
4600 // Inserting into the middle is more complicated.
4601
4602 NumElems = WideOpVT.getVectorNumElements();
4603
4604 // Widen the vector if needed.
4605 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4606
4607 unsigned ShiftLeft = NumElems - SubVecNumElems;
4608 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4609
4610 // Do an optimization for the most frequently used types.
4611 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4612 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4613 Mask0.flipAllBits();
4614 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4615 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4616 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4617 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4618 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4619 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4620 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4621 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4622
4623 // Reduce to original width if needed.
4624 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4625 }
4626
4627 // Clear the upper bits of the subvector and move it to its insert position.
4628 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4629 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4630 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4631 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4632
4633 // Isolate the bits below the insertion point.
4634 unsigned LowShift = NumElems - IdxVal;
4635 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4636 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4637 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4638 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4639
4640 // Isolate the bits after the last inserted bit.
4641 unsigned HighShift = IdxVal + SubVecNumElems;
4642 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4643 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4644 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4645 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4646
4647 // Now OR all 3 pieces together.
4648 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4649 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4650
4651 // Reduce to original width if needed.
4652 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4653}
4654
4656 const SDLoc &dl) {
4657 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4658 EVT SubVT = V1.getValueType();
4659 EVT SubSVT = SubVT.getScalarType();
4660 unsigned SubNumElts = SubVT.getVectorNumElements();
4661 unsigned SubVectorWidth = SubVT.getSizeInBits();
// The result vector has the same scalar type and twice the element count.
4662 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
// Insert V1 into the low half and V2 into the high half of an UNDEF vector.
4663 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4664 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4665}
4666
4667/// Returns a vector of specified type with all bits set.
4668/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4669/// Then bitcast to their original type, ensuring they get CSE'd.
4670static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4671 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4672 "Expected a 128/256/512-bit vector type");
4673 unsigned NumElts = VT.getSizeInBits() / 32;
4674 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4675 return DAG.getBitcast(VT, Vec);
4676}
4677
4678static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4679 SDValue In, SelectionDAG &DAG) {
4680 EVT InVT = In.getValueType();
4681 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4682 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4683 ISD::ZERO_EXTEND == Opcode) &&
4684 "Unknown extension opcode");
4685
4686 // For 256-bit vectors, we only need the lower (128-bit) input half.
4687 // For 512-bit vectors, we only need the lower input half or quarter.
4688 if (InVT.getSizeInBits() > 128) {
4689 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4690 "Expected VTs to be the same size!");
4691 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4692 In = extractSubVector(In, 0, DAG, DL,
4693 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4694 InVT = In.getValueType();
4695 }
4696
4697 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4698 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4699
4700 return DAG.getNode(Opcode, DL, VT, In);
4701}
4702
4703// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4704static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4705 SDValue Mask, SelectionDAG &DAG) {
4706 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4707 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4708 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4709}
4710
4712 bool Lo, bool Unary) {
4713 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4714 "Illegal vector type to unpack");
4715 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4716 int NumElts = VT.getVectorNumElements();
4717 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
// Build the unpacklo/unpackhi mask one result element at a time, per lane.
4718 for (int i = 0; i < NumElts; ++i) {
4719 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4720 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
// Non-unary: odd result elements are taken from the second operand.
4721 Pos += (Unary ? 0 : NumElts * (i % 2));
// Hi unpacks read from the top half of each 128-bit lane.
4722 Pos += (Lo ? 0 : NumEltsInLane / 2);
4723 Mask.push_back(Pos);
4724 }
4725}
4726
4727/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4728/// imposed by AVX and specific to the unary pattern. Example:
4729/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4730/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4732 bool Lo) {
4733 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4734 int NumElts = VT.getVectorNumElements();
// Result element i reads source element i/2, offset by NumElts/2 for Hi,
// yielding <0,0,1,1,...> (Lo) or <N/2,N/2,N/2+1,...> (Hi) with no lane limit.
4735 for (int i = 0; i < NumElts; ++i) {
4736 int Pos = i / 2;
4737 Pos += (Lo ? 0 : NumElts / 2);
4738 Mask.push_back(Pos);
4739 }
4740}
4741
4742// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4743 static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4744 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4746 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
// Both inputs are constant build vectors (or undef): constant fold by
// gathering the selected scalar operands into a new build vector.
4747 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4748 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4749 int M = Mask[I];
// Sentinel (negative) mask entries leave the element undef.
4750 if (M < 0)
4751 continue;
4752 SDValue V = (M < NumElts) ? V1 : V2;
4753 if (V.isUndef())
4754 continue;
4755 Ops[I] = V.getOperand(M % NumElts);
4756 }
4757 return DAG.getBuildVector(VT, dl, Ops);
4758 }
4759
4760 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4761}
4762
4763/// Returns a vector_shuffle node for an unpackl operation.
4764 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4765 SDValue V1, SDValue V2) {
4767 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
// getVectorShuffle constant folds when both inputs are constant build vectors.
4768 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4769}
4770
4771/// Returns a vector_shuffle node for an unpackh operation.
4772 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4773 SDValue V1, SDValue V2) {
4775 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
// getVectorShuffle constant folds when both inputs are constant build vectors.
4776 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4777}
4778
4779/// Returns a node that packs the LHS + RHS nodes together at half width.
4780/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4781/// TODO: Add subvector splitting if/when we have a need for it.
4782static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4783 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4784 bool PackHiHalf = false) {
4785 MVT OpVT = LHS.getSimpleValueType();
4786 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4787 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4788 assert(OpVT == RHS.getSimpleValueType() &&
4789 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4790 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4791 "Unexpected PACK operand types");
4792 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4793 "Unexpected PACK result type");
4794
4795 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4796 if (EltSizeInBits == 32) {
4797 SmallVector<int> PackMask;
4798 int Offset = PackHiHalf ? 1 : 0;
4799 int NumElts = VT.getVectorNumElements();
4800 for (int I = 0; I != NumElts; I += 4) {
4801 PackMask.push_back(I + Offset);
4802 PackMask.push_back(I + Offset + 2);
4803 PackMask.push_back(I + Offset + NumElts);
4804 PackMask.push_back(I + Offset + NumElts + 2);
4805 }
4806 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4807 DAG.getBitcast(VT, RHS), PackMask);
4808 }
4809
4810 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4811 if (!PackHiHalf) {
4812 if (UsePackUS &&
4813 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4814 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4815 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4816
4817 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4818 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4819 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4820 }
4821
4822 // Fallback to sign/zero extending the requested half and pack.
4823 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4824 if (UsePackUS) {
4825 if (PackHiHalf) {
4826 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4827 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4828 } else {
4829 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4830 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4831 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4832 };
4833 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4834 };
4835
4836 if (!PackHiHalf) {
4837 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4838 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4839 }
4840 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4841 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4842 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4843}
4844
4845/// Return a vector_shuffle of the specified vector of zero or undef vector.
4846/// This produces a shuffle where the low element of V2 is swizzled into the
4847/// zero/undef vector, landing at element Idx.
4848/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4850 bool IsZero,
4851 const X86Subtarget &Subtarget,
4852 SelectionDAG &DAG) {
// V1 supplies every lane except Idx: zeros when IsZero, otherwise undef.
4853 MVT VT = V2.getSimpleValueType();
4854 SDValue V1 = IsZero
4855 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4856 int NumElems = VT.getVectorNumElements();
// Identity mask, except index Idx selects element 0 of V2 (mask value NumElems).
4857 SmallVector<int, 16> MaskVec(NumElems);
4858 for (int i = 0; i != NumElems; ++i)
4859 // If this is the insertion idx, put the low elt of V2 here.
4860 MaskVec[i] = (i == Idx) ? NumElems : i;
4861 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4862}
4863
4865 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4866 Ptr.getOpcode() == X86ISD::WrapperRIP)
4867 Ptr = Ptr.getOperand(0);
// Returns null unless the (unwrapped) pointer is a constant-pool node.
4868 return dyn_cast<ConstantPoolSDNode>(Ptr);
4869}
4870
4871 // TODO: Add support for non-zero offsets.
4874 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4875 return nullptr;
// An IR-level constant-pool entry at offset 0 - return its Constant.
4876 return CNode->getConstVal();
4877}
4878
4880 if (!Load || !ISD::isNormalLoad(Load))
4881 return nullptr;
// Walk the load's base pointer down to a constant-pool constant, if any.
4882 return getTargetConstantFromBasePtr(Load->getBasePtr());
4883}
4884
// Delegate to the LoadSDNode overload (null-safe via dyn_cast).
4887 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4888}
4889
// Returns the constant-pool Constant feeding this load, or null.
4890 const Constant *
4892 assert(LD && "Unexpected null LoadSDNode");
// Delegates to the LoadSDNode-based static helper above.
4893 return getTargetConstantFromNode(LD);
4894}
4895
4896// Extract raw constant bits from constant pools.
4897 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4898 APInt &UndefElts,
4899 SmallVectorImpl<APInt> &EltBits,
4900 bool AllowWholeUndefs = true,
4901 bool AllowPartialUndefs = false) {
// Splits Op's constant bits into EltSizeInBits-wide pieces: on success,
// EltBits holds one APInt per element and UndefElts flags elements whose bits
// are entirely undef. AllowWholeUndefs/AllowPartialUndefs control whether
// fully/partially undef elements are acceptable; returns false otherwise.
4902 assert(EltBits.empty() && "Expected an empty EltBits vector");
4904
4906 EVT VT = Op.getValueType();
4907 unsigned SizeInBits = VT.getSizeInBits();
4908 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4909 unsigned NumElts = SizeInBits / EltSizeInBits;
4910
4911 // Bitcast a source array of element bits to the target size.
4912 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4913 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4914 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4915 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4916 "Constant bit sizes don't match");
4917
4918 // Don't split if we don't allow undef bits.
4919 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4920 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4921 return false;
4922
4923 // If we're already the right size, don't bother bitcasting.
4924 if (NumSrcElts == NumElts) {
4925 UndefElts = UndefSrcElts;
4926 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4927 return true;
4928 }
4929
4930 // Extract all the undef/constant element data and pack into single bitsets.
4931 APInt UndefBits(SizeInBits, 0);
4932 APInt MaskBits(SizeInBits, 0);
4933
4934 for (unsigned i = 0; i != NumSrcElts; ++i) {
4935 unsigned BitOffset = i * SrcEltSizeInBits;
4936 if (UndefSrcElts[i])
4937 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4938 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4939 }
4940
4941 // Split the undef/constant single bitset data into the target elements.
4942 UndefElts = APInt(NumElts, 0);
4943 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4944
4945 for (unsigned i = 0; i != NumElts; ++i) {
4946 unsigned BitOffset = i * EltSizeInBits;
4947 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4948
4949 // Only treat an element as UNDEF if all bits are UNDEF.
4950 if (UndefEltBits.isAllOnes()) {
4951 if (!AllowWholeUndefs)
4952 return false;
4953 UndefElts.setBit(i);
4954 continue;
4955 }
4956
4957 // If only some bits are UNDEF then treat them as zero (or bail if not
4958 // supported).
4959 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4960 return false;
4961
4962 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4963 }
4964 return true;
4965 };
4966
4967 // Collect constant bits and insert into mask/undef bit masks.
4968 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4969 unsigned UndefBitIndex) {
4970 if (!Cst)
4971 return false;
4972 if (isa<UndefValue>(Cst)) {
4973 Undefs.setBit(UndefBitIndex);
4974 return true;
4975 }
4976 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4977 Mask = CInt->getValue();
4978 return true;
4979 }
4980 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4981 Mask = CFP->getValueAPF().bitcastToAPInt();
4982 return true;
4983 }
4984 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4985 Type *Ty = CDS->getType();
4987 Type *EltTy = CDS->getElementType();
4988 bool IsInteger = EltTy->isIntegerTy();
4989 bool IsFP =
4990 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4991 if (!IsInteger && !IsFP)
4992 return false;
4993 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4994 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4995 if (IsInteger)
4996 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4997 else
4998 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4999 I * EltBits);
5000 return true;
5001 }
5002 return false;
5003 };
5004
5005 // Handle UNDEFs.
5006 if (Op.isUndef()) {
5007 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5008 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5009 return CastBitData(UndefSrcElts, SrcEltBits);
5010 }
5011
5012 // Extract scalar constant bits.
5013 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5014 APInt UndefSrcElts = APInt::getZero(1);
5015 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5016 return CastBitData(UndefSrcElts, SrcEltBits);
5017 }
5018 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5019 APInt UndefSrcElts = APInt::getZero(1);
5020 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5021 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5022 return CastBitData(UndefSrcElts, SrcEltBits);
5023 }
5024
5025 // Extract constant bits from build vector.
5026 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5027 BitVector Undefs;
5028 SmallVector<APInt> SrcEltBits;
5029 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5030 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5031 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5032 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5033 if (Undefs[I])
5034 UndefSrcElts.setBit(I);
5035 return CastBitData(UndefSrcElts, SrcEltBits);
5036 }
5037 }
5038
5039 // Extract constant bits from constant pool vector.
5040 if (auto *Cst = getTargetConstantFromNode(Op)) {
5041 Type *CstTy = Cst->getType();
5042 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5043 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5044 return false;
5045
5046 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5047 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5048 if ((SizeInBits % SrcEltSizeInBits) != 0)
5049 return false;
5050
5051 APInt UndefSrcElts(NumSrcElts, 0);
5052 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5053 for (unsigned i = 0; i != NumSrcElts; ++i)
5054 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5055 UndefSrcElts, i))
5056 return false;
5057
5058 return CastBitData(UndefSrcElts, SrcEltBits);
5059 }
5060
5061 // Extract constant bits from a broadcasted constant pool scalar.
5062 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5063 EltSizeInBits <= VT.getScalarSizeInBits()) {
5064 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5065 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5066 return false;
5067
5068 SDValue Ptr = MemIntr->getBasePtr();
5070 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5071 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5072
5073 APInt UndefSrcElts(NumSrcElts, 0);
5074 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
// Splat the broadcast scalar's bits (or its undef state) to every element.
5075 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5076 if (UndefSrcElts[0])
5077 UndefSrcElts.setBits(0, NumSrcElts);
5078 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5079 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5080 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5081 return CastBitData(UndefSrcElts, SrcEltBits);
5082 }
5083 }
5084 }
5085
5086 // Extract constant bits from a subvector broadcast.
5087 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5088 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5089 SDValue Ptr = MemIntr->getBasePtr();
5090 // The source constant may be larger than the subvector broadcast,
5091 // ensure we extract the correct subvector constants.
5092 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5093 Type *CstTy = Cst->getType();
5094 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5095 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5096 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5097 (SizeInBits % SubVecSizeInBits) != 0)
5098 return false;
5099 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5100 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5101 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5102 APInt UndefSubElts(NumSubElts, 0);
5103 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5104 APInt(CstEltSizeInBits, 0));
5105 for (unsigned i = 0; i != NumSubElts; ++i) {
5106 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5107 UndefSubElts, i))
5108 return false;
5109 for (unsigned j = 1; j != NumSubVecs; ++j)
5110 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5111 }
5112 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5113 UndefSubElts);
5114 return CastBitData(UndefSubElts, SubEltBits);
5115 }
5116 }
5117
5118 // Extract a rematerialized scalar constant insertion.
5119 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5120 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5121 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5122 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5123 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5124
5125 APInt UndefSrcElts(NumSrcElts, 0);
5126 SmallVector<APInt, 64> SrcEltBits;
5127 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5128 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5129 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5130 return CastBitData(UndefSrcElts, SrcEltBits);
5131 }
5132
5133 // Insert constant bits from a base and sub vector sources.
5134 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5135 // If bitcasts to larger elements we might lose track of undefs - don't
5136 // allow any to be safe.
5137 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5138 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5139
5140 APInt UndefSrcElts, UndefSubElts;
5141 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5142 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5143 UndefSubElts, EltSubBits,
5144 AllowWholeUndefs && AllowUndefs,
5145 AllowPartialUndefs && AllowUndefs) &&
5146 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5147 UndefSrcElts, EltSrcBits,
5148 AllowWholeUndefs && AllowUndefs,
5149 AllowPartialUndefs && AllowUndefs)) {
5150 unsigned BaseIdx = Op.getConstantOperandVal(2);
5151 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5152 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5153 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5154 return CastBitData(UndefSrcElts, EltSrcBits);
5155 }
5156 }
5157
5158 // Extract constant bits from a subvector's source.
5159 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
5160 // TODO - support extract_subvector through bitcasts.
5161 if (EltSizeInBits != VT.getScalarSizeInBits())
5162 return false;
5163
5164 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5165 UndefElts, EltBits, AllowWholeUndefs,
5166 AllowPartialUndefs)) {
5167 EVT SrcVT = Op.getOperand(0).getValueType();
5168 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5169 unsigned NumSubElts = VT.getVectorNumElements();
5170 unsigned BaseIdx = Op.getConstantOperandVal(1);
5171 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5172 if ((BaseIdx + NumSubElts) != NumSrcElts)
5173 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5174 if (BaseIdx != 0)
5175 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5176 return true;
5177 }
5178 }
5179
5180 // Extract constant bits from shuffle node sources.
5181 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5182 // TODO - support shuffle through bitcasts.
5183 if (EltSizeInBits != VT.getScalarSizeInBits())
5184 return false;
5185
5186 ArrayRef<int> Mask = SVN->getMask();
5187 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5188 llvm::any_of(Mask, [](int M) { return M < 0; }))
5189 return false;
5190
5191 APInt UndefElts0, UndefElts1;
5192 SmallVector<APInt, 32> EltBits0, EltBits1;
5193 if (isAnyInRange(Mask, 0, NumElts) &&
5194 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5195 UndefElts0, EltBits0, AllowWholeUndefs,
5196 AllowPartialUndefs))
5197 return false;
5198 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5199 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5200 UndefElts1, EltBits1, AllowWholeUndefs,
5201 AllowPartialUndefs))
5202 return false;
5203
5204 UndefElts = APInt::getZero(NumElts);
5205 for (int i = 0; i != (int)NumElts; ++i) {
5206 int M = Mask[i];
5207 if (M < 0) {
5208 UndefElts.setBit(i);
5209 EltBits.push_back(APInt::getZero(EltSizeInBits));
5210 } else if (M < (int)NumElts) {
5211 if (UndefElts0[M])
5212 UndefElts.setBit(i);
5213 EltBits.push_back(EltBits0[M]);
5214 } else {
5215 if (UndefElts1[M - NumElts])
5216 UndefElts.setBit(i);
5217 EltBits.push_back(EltBits1[M - NumElts]);
5218 }
5219 }
5220 return true;
5221 }
5222
5223 return false;
5224}
5225
5226namespace llvm {
5227namespace X86 {
5228 bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5229 APInt UndefElts;
5230 SmallVector<APInt, 16> EltBits;
5232 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5233 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
// Find a single value that every non-undef element agrees on.
5234 int SplatIndex = -1;
5235 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5236 if (UndefElts[i])
5237 continue;
5238 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5239 SplatIndex = -1;
5240 break;
5241 }
5242 SplatIndex = i;
5243 }
// At least one defined element was seen and all of them matched.
5244 if (0 <= SplatIndex) {
5245 SplatVal = EltBits[SplatIndex];
5246 return true;
5247 }
5248 }
5249
5250 return false;
5251}
5252} // namespace X86
5253} // namespace llvm
5254
5256 unsigned MaskEltSizeInBits,
5258 APInt &UndefElts) {
5259 // Extract the raw target constant bits.
5260 SmallVector<APInt, 64> EltBits;
5261 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5262 EltBits, /* AllowWholeUndefs */ true,
5263 /* AllowPartialUndefs */ false))
5264 return false;
5265
5266 // Insert the extracted elements into the mask.
5267 for (const APInt &Elt : EltBits)
5268 RawMask.push_back(Elt.getZExtValue());
5269
// Undef elements are reported via UndefElts; their raw mask value is 0.
5270 return true;
5271}
5272
5273static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5274 bool AllowUndefs) {
5275 APInt UndefElts;
5276 SmallVector<APInt, 64> EltBits;
5277 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5278 /*AllowWholeUndefs*/ AllowUndefs,
5279 /*AllowPartialUndefs*/ false))
5280 return false;
5281
5282 bool IsPow2OrUndef = true;
5283 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5284 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5285 return IsPow2OrUndef;
5286}
5287
5288// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5290 // TODO: don't always ignore oneuse constraints.
// On success returns X such that NOT(X) == V; otherwise an empty SDValue.
5291 V = peekThroughBitcasts(V);
5292 EVT VT = V.getValueType();
5293
5294 // Match not(xor X, -1) -> X.
5295 if (V.getOpcode() == ISD::XOR &&
5296 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5297 isAllOnesConstant(V.getOperand(1))))
5298 return V.getOperand(0);
5299
5300 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5301 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5302 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5303 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5304 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5305 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5306 V.getOperand(1));
5307 }
5308 }
5309
5310 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5311 if (V.getOpcode() == X86ISD::PCMPGT &&
5312 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5313 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5314 V.getOperand(0).hasOneUse()) {
5315 APInt UndefElts;
5316 SmallVector<APInt> EltBits;
5317 if (getTargetConstantBitsFromNode(V.getOperand(0),
5318 V.getScalarValueSizeInBits(), UndefElts,
5319 EltBits) &&
5320 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5321 // Don't fold min_signed_value -> (min_signed_value - 1)
5322 bool MinSigned = false;
5323 for (APInt &Elt : EltBits) {
5324 MinSigned |= Elt.isMinSignedValue();
5325 Elt -= 1;
5326 }
5327 if (!MinSigned) {
5328 SDLoc DL(V);
5329 MVT VT = V.getSimpleValueType();
5330 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5331 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5332 }
5333 }
5334 }
5335
5336 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5338 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5339 for (SDValue &CatOp : CatOps) {
5340 SDValue NotCat = IsNOT(CatOp, DAG);
5341 if (!NotCat)
5342 return SDValue();
5343 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5344 }
5345 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5346 }
5347
5348 // Match not(or(not(X),not(Y))) -> and(X, Y).
5349 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5350 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5351 // TODO: Handle cases with single NOT operand -> ANDNP
5352 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5353 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5354 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5355 DAG.getBitcast(VT, Op1));
5356 }
5357
5358 return SDValue();
5359}
5360
5361/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5362/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5363/// Note: This ignores saturation, so inputs must be checked first.
5365 bool Unary, unsigned NumStages = 1) {
5366 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5367 unsigned NumElts = VT.getVectorNumElements();
5368 unsigned NumLanes = VT.getSizeInBits() / 128;
5369 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5370 unsigned Offset = Unary ? 0 : NumElts;
5371 unsigned Repetitions = 1u << (NumStages - 1);
5372 unsigned Increment = 1u << NumStages;
5373 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5374
// Per 128-bit lane: first-operand elements, then second-operand elements
// (Offset selects the second operand unless Unary), stepping by Increment
// to model NumStages of packing.
5375 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5376 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5377 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5378 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5379 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5380 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5381 }
5382 }
5383}
5384
5385// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5386static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5387 APInt &DemandedLHS, APInt &DemandedRHS) {
5388 int NumLanes = VT.getSizeInBits() / 128;
5389 int NumElts = DemandedElts.getBitWidth();
5390 int NumInnerElts = NumElts / 2;
5391 int NumEltsPerLane = NumElts / NumLanes;
5392 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5393
5394 DemandedLHS = APInt::getZero(NumInnerElts);
5395 DemandedRHS = APInt::getZero(NumInnerElts);
5396
5397 // Map DemandedElts to the packed operands.
5398 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5399 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5400 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5401 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5402 if (DemandedElts[OuterIdx])
5403 DemandedLHS.setBit(InnerIdx);
5404 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5405 DemandedRHS.setBit(InnerIdx);
5406 }
5407 }
5408}
5409
5410// Split the demanded elts of a HADD/HSUB node between its operands.
5411 static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5412 APInt &DemandedLHS, APInt &DemandedRHS) {
5414 DemandedLHS, DemandedRHS);
// Each HADD/HSUB output element consumes a pair of adjacent input elements,
// so widen every demanded bit to cover both elements of its pair.
5415 DemandedLHS |= DemandedLHS << 1;
5416 DemandedRHS |= DemandedRHS << 1;
5417}
5418
5419/// Calculates the shuffle mask corresponding to the target-specific opcode.
5420/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5421/// operands in \p Ops, and returns true.
5422/// Sets \p IsUnary to true if only one source is used. Note that this will set
5423/// IsUnary for shuffles which use a single input multiple times, and in those
5424/// cases it will adjust the mask to only have indices within that single input.
5425/// It is an error to call this with non-empty Mask/Ops vectors.
5426static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5428 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5429 if (!isTargetShuffle(N.getOpcode()))
5430 return false;
5431
5432 MVT VT = N.getSimpleValueType();
5433 unsigned NumElems = VT.getVectorNumElements();
5434 unsigned MaskEltSize = VT.getScalarSizeInBits();
5436 APInt RawUndefs;
5437 uint64_t ImmN;
5438
5439 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5440 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5441
5442 IsUnary = false;
5443 bool IsFakeUnary = false;
5444 switch (N.getOpcode()) {
5445 case X86ISD::BLENDI:
5446 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5447 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5448 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5449 DecodeBLENDMask(NumElems, ImmN, Mask);
5450 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5451 break;
5452 case X86ISD::SHUFP:
5453 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5454 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5455 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5456 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5457 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5458 break;
5459 case X86ISD::INSERTPS:
5460 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5461 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5462 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5463 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5464 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5465 break;
5466 case X86ISD::EXTRQI:
5467 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5468 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5469 isa<ConstantSDNode>(N.getOperand(2))) {
5470 int BitLen = N.getConstantOperandVal(1);
5471 int BitIdx = N.getConstantOperandVal(2);
5472 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5473 IsUnary = true;
5474 }
5475 break;
5476 case X86ISD::INSERTQI:
5477 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5478 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5479 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5480 isa<ConstantSDNode>(N.getOperand(3))) {
5481 int BitLen = N.getConstantOperandVal(2);
5482 int BitIdx = N.getConstantOperandVal(3);
5483 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5484 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5485 }
5486 break;
5487 case X86ISD::UNPCKH:
5488 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5489 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5490 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5491 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5492 break;
5493 case X86ISD::UNPCKL:
5494 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5495 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5496 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5497 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5498 break;
5499 case X86ISD::MOVHLPS:
5500 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5501 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5502 DecodeMOVHLPSMask(NumElems, Mask);
5503 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5504 break;
5505 case X86ISD::MOVLHPS:
5506 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5507 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5508 DecodeMOVLHPSMask(NumElems, Mask);
5509 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5510 break;
5511 case X86ISD::VALIGN:
5512 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5513 "Only 32-bit and 64-bit elements are supported!");
5514 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5515 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5516 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5517 DecodeVALIGNMask(NumElems, ImmN, Mask);
5518 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5519 Ops.push_back(N.getOperand(1));
5520 Ops.push_back(N.getOperand(0));
5521 break;
5522 case X86ISD::PALIGNR:
5523 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5524 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5525 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5526 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5527 DecodePALIGNRMask(NumElems, ImmN, Mask);
5528 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5529 Ops.push_back(N.getOperand(1));
5530 Ops.push_back(N.getOperand(0));
5531 break;
5532 case X86ISD::VSHLDQ:
5533 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5534 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5535 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5536 DecodePSLLDQMask(NumElems, ImmN, Mask);
5537 IsUnary = true;
5538 break;
5539 case X86ISD::VSRLDQ:
5540 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5541 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5542 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5543 DecodePSRLDQMask(NumElems, ImmN, Mask);
5544 IsUnary = true;
5545 break;
5546 case X86ISD::PSHUFD:
5547 case X86ISD::VPERMILPI:
5548 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5549 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5550 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5551 IsUnary = true;
5552 break;
5553 case X86ISD::PSHUFHW:
5554 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5555 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5556 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5557 IsUnary = true;
5558 break;
5559 case X86ISD::PSHUFLW:
5560 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5561 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5562 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5563 IsUnary = true;
5564 break;
5565 case X86ISD::VZEXT_MOVL:
5566 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5567 DecodeZeroMoveLowMask(NumElems, Mask);
5568 IsUnary = true;
5569 break;
5570 case X86ISD::VBROADCAST:
5571 // We only decode broadcasts of same-sized vectors, peeking through to
5572 // extracted subvectors is likely to cause hasOneUse issues with
5573 // SimplifyDemandedBits etc.
5574 if (N.getOperand(0).getValueType() == VT) {
5575 DecodeVectorBroadcast(NumElems, Mask);
5576 IsUnary = true;
5577 break;
5578 }
5579 return false;
5580 case X86ISD::VPERMILPV: {
5581 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5582 IsUnary = true;
5583 SDValue MaskNode = N.getOperand(1);
5584 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5585 RawUndefs)) {
5586 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5587 break;
5588 }
5589 return false;
5590 }
5591 case X86ISD::PSHUFB: {
5592 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5593 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5594 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5595 IsUnary = true;
5596 SDValue MaskNode = N.getOperand(1);
5597 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5598 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5599 break;
5600 }
5601 return false;
5602 }
5603 case X86ISD::VPERMI:
5604 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5605 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5606 DecodeVPERMMask(NumElems, ImmN, Mask);
5607 IsUnary = true;
5608 break;
5609 case X86ISD::MOVSS:
5610 case X86ISD::MOVSD:
5611 case X86ISD::MOVSH:
5612 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5613 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5614 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5615 break;
5616 case X86ISD::VPERM2X128:
5617 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5618 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5619 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5620 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5621 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5622 break;
5623 case X86ISD::SHUF128:
5624 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5625 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5626 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5627 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5628 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5629 break;
5630 case X86ISD::MOVSLDUP:
5631 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5632 DecodeMOVSLDUPMask(NumElems, Mask);
5633 IsUnary = true;
5634 break;
5635 case X86ISD::MOVSHDUP:
5636 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5637 DecodeMOVSHDUPMask(NumElems, Mask);
5638 IsUnary = true;
5639 break;
5640 case X86ISD::MOVDDUP:
5641 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5642 DecodeMOVDDUPMask(NumElems, Mask);
5643 IsUnary = true;
5644 break;
5645 case X86ISD::VPERMIL2: {
5646 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5647 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5648 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5649 SDValue MaskNode = N.getOperand(2);
5650 SDValue CtrlNode = N.getOperand(3);
5651 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5652 unsigned CtrlImm = CtrlOp->getZExtValue();
5653 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5654 RawUndefs)) {
5655 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5656 Mask);
5657 break;
5658 }
5659 }
5660 return false;
5661 }
5662 case X86ISD::VPPERM: {
5663 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5664 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5665 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5666 SDValue MaskNode = N.getOperand(2);
5667 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5668 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5669 break;
5670 }
5671 return false;
5672 }
5673 case X86ISD::VPERMV: {
5674 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5675 IsUnary = true;
5676 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5677 Ops.push_back(N.getOperand(1));
5678 SDValue MaskNode = N.getOperand(0);
5679 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5680 RawUndefs)) {
5681 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5682 break;
5683 }
5684 return false;
5685 }
5686 case X86ISD::VPERMV3: {
5687 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5688 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5689 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5690 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5691 Ops.push_back(N.getOperand(0));
5692 Ops.push_back(N.getOperand(2));
5693 SDValue MaskNode = N.getOperand(1);
5694 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5695 RawUndefs)) {
5696 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5697 break;
5698 }
5699 return false;
5700 }
5701 default:
5702 llvm_unreachable("unknown target shuffle node");
5703 }
5704
5705 // Empty mask indicates the decode failed.
5706 if (Mask.empty())
5707 return false;
5708
5709 // Check if we're getting a shuffle mask with zero'd elements.
5710 if (!AllowSentinelZero && isAnyZero(Mask))
5711 return false;
5712
5713 // If we have a fake unary shuffle, the shuffle mask is spread across two
5714 // inputs that are actually the same node. Re-map the mask to always point
5715 // into the first input.
5716 if (IsFakeUnary)
5717 for (int &M : Mask)
5718 if (M >= (int)Mask.size())
5719 M -= Mask.size();
5720
5721 // If we didn't already add operands in the opcode-specific code, default to
5722 // adding 1 or 2 operands starting at 0.
5723 if (Ops.empty()) {
5724 Ops.push_back(N.getOperand(0));
5725 if (!IsUnary || IsFakeUnary)
5726 Ops.push_back(N.getOperand(1));
5727 }
5728
5729 return true;
5730}
5731
// Wrapper for getTargetShuffleMask for callers that only need the decoded
// mask and operands - decodes the shuffle and discards the IsUnary result.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
                                 SmallVectorImpl<int> &Mask) {
  bool IsUnary;
  return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
}
5739
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
                                           SDValue V1, SDValue V2,
                                           APInt &KnownUndef, APInt &KnownZero) {
  int Size = Mask.size();
  KnownUndef = KnownZero = APInt::getZero(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Size;
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases: sentinel (undef) mask elements, and elements
    // that reference an all-zeros build vector input.
    if (M < 0) {
      KnownUndef.setBit(i);
      continue;
    }
    if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements than the mask (each source
    // element is wider), then test the bitcasted portion of the (larger)
    // source element for UNDEF/ZERO: either the whole wide element is
    // undef/zero, or the specific ScalarSizeInBits slice of its constant
    // value is zero.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef())
        KnownUndef.setBit(i);
      if (X86::isZeroNode(Op))
        KnownZero.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        // Extract the slice of the wide integer constant that this mask
        // element refers to and test it for zero.
        APInt Val = Cst->getAPIntValue();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        // Same for FP constants, via their bit pattern.
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements than the mask (each source
    // element is narrower), then all the (smaller) source elements covered
    // by this mask element must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllUndef = true;
      bool AllZero = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllUndef &= Op.isUndef();
        AllZero &= X86::isZeroNode(Op);
      }
      if (AllUndef)
        KnownUndef.setBit(i);
      if (AllZero)
        KnownZero.setBit(i);
      continue;
    }
  }
}
5826
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
/// FIXME: Merge this with computeZeroableShuffleElements?
                                         APInt &KnownUndef, APInt &KnownZero) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  // Decode the shuffle, permitting SM_SentinelZero elements in the mask.
  if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
    return false;

  int Size = Mask.size();
  SDValue V1 = Ops[0];
  // For unary shuffles both mask halves reference the same input.
  SDValue V2 = IsUnary ? V1 : Ops[1];
  KnownUndef = KnownZero = APInt::getZero(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Size) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Size;

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], /*AllowWholeUndefs*/ true,
                                    /*AllowPartialUndefs*/ false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], /*AllowWholeUndefs*/ true,
                                    /*AllowPartialUndefs*/ false)};

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0) {
      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
      if (SM_SentinelUndef == M)
        KnownUndef.setBit(i);
      if (SM_SentinelZero == M)
        KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      KnownUndef.setBit(i);
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        KnownUndef.setBit(i);
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        KnownZero.setBit(i);
      continue;
    }

    // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
    // base vectors.
    if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
      SDValue Vec = V.getOperand(0);
      int NumVecElts = Vec.getValueType().getVectorNumElements();
      if (Vec.isUndef() && Size == NumVecElts) {
        int Idx = V.getConstantOperandVal(2);
        int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
        // Elements outside the inserted subvector come from the UNDEF base.
        if (M < Idx || (Idx + NumSubElts) <= M)
          KnownUndef.setBit(i);
      }
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        KnownUndef.setBit(i);
      else if (SrcEltBits[SrcIdx][M] == 0)
        KnownZero.setBit(i);
    }
  }

  assert(VT.getVectorNumElements() == (unsigned)Size &&
         "Different mask size from vector size!");
  return true;
}
5931
// Replace target shuffle mask elements with known undef/zero sentinels.
// Undef takes priority over zero; zero resolution can be suppressed via
// ResolveKnownZeros (e.g. to keep zero elements addressable by index).
                                              const APInt &KnownUndef,
                                              const APInt &KnownZero,
                                              bool ResolveKnownZeros= true) {
  unsigned NumElts = Mask.size();
  assert(KnownUndef.getBitWidth() == NumElts &&
         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");

  for (unsigned i = 0; i != NumElts; ++i) {
    if (KnownUndef[i])
      Mask[i] = SM_SentinelUndef;
    else if (ResolveKnownZeros && KnownZero[i])
      Mask[i] = SM_SentinelZero;
  }
}
5948
// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
// Inverse of resolveTargetShuffleFromZeroables: each SM_SentinelUndef /
// SM_SentinelZero mask element sets the corresponding bit.
                                              APInt &KnownUndef,
                                              APInt &KnownZero) {
  unsigned NumElts = Mask.size();
  KnownUndef = KnownZero = APInt::getZero(NumElts);

  for (unsigned i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (SM_SentinelUndef == M)
      KnownUndef.setBit(i);
    if (SM_SentinelZero == M)
      KnownZero.setBit(i);
  }
}
5964
// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
// Requires the condition to be a constant vector: element i of the result
// selects from the 1st operand by default, or from the 2nd operand when the
// condition element is zero (VSELECT) / has a clear sign bit (BLENDV), or is
// undef (arbitrary choice - see below).
                                         SDValue Cond, bool IsBLENDV = false) {
  EVT CondVT = Cond.getValueType();
  unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
  unsigned NumElts = CondVT.getVectorNumElements();

  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
  if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
                                     /*AllowWholeUndefs*/ true,
                                     /*AllowPartialUndefs*/ false))
    return false;

  Mask.resize(NumElts, SM_SentinelUndef);

  for (int i = 0; i != (int)NumElts; ++i) {
    Mask[i] = i;
    // Arbitrarily choose from the 2nd operand if the select condition element
    // is undef.
    // TODO: Can we do better by matching patterns such as even/odd?
    if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
        (IsBLENDV && EltBits[i].isNonNegative()))
      Mask[i] += NumElts;
  }

  return true;
}
5993
5994// Forward declaration (for getFauxShuffleMask recursive check).
5995static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5998 const SelectionDAG &DAG, unsigned Depth,
5999 bool ResolveKnownElts);
6000
6001// Attempt to decode ops that could be represented as a shuffle mask.
6002// The decoded shuffle mask may contain a different number of elements to the
6003// destination value type.
6004// TODO: Merge into getTargetShuffleInputs()
6005static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6008 const SelectionDAG &DAG, unsigned Depth,
6009 bool ResolveKnownElts) {
6010 Mask.clear();
6011 Ops.clear();
6012
6013 MVT VT = N.getSimpleValueType();
6014 unsigned NumElts = VT.getVectorNumElements();
6015 unsigned NumSizeInBits = VT.getSizeInBits();
6016 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6017 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6018 return false;
6019 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6020 unsigned NumSizeInBytes = NumSizeInBits / 8;
6021 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6022
6023 unsigned Opcode = N.getOpcode();
6024 switch (Opcode) {
6025 case ISD::VECTOR_SHUFFLE: {
6026 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6027 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6028 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6029 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6030 Ops.push_back(N.getOperand(0));
6031 Ops.push_back(N.getOperand(1));
6032 return true;
6033 }
6034 return false;
6035 }
6036 case ISD::AND:
6037 case X86ISD::ANDNP: {
6038 // Attempt to decode as a per-byte mask.
6039 APInt UndefElts;
6040 SmallVector<APInt, 32> EltBits;
6041 SDValue N0 = N.getOperand(0);
6042 SDValue N1 = N.getOperand(1);
6043 bool IsAndN = (X86ISD::ANDNP == Opcode);
6044 uint64_t ZeroMask = IsAndN ? 255 : 0;
6045 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6046 /*AllowWholeUndefs*/ false,
6047 /*AllowPartialUndefs*/ false))
6048 return false;
6049 // We can't assume an undef src element gives an undef dst - the other src
6050 // might be zero.
6051 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6052 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6053 const APInt &ByteBits = EltBits[i];
6054 if (ByteBits != 0 && ByteBits != 255)
6055 return false;
6056 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6057 }
6058 Ops.push_back(IsAndN ? N1 : N0);
6059 return true;
6060 }
6061 case ISD::OR: {
6062 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6063 // is a valid shuffle index.
6064 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6065 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6066 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6067 return false;
6068
6069 SmallVector<int, 64> SrcMask0, SrcMask1;
6070 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6073 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6074 Depth + 1, true) ||
6075 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6076 Depth + 1, true))
6077 return false;
6078
6079 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6080 SmallVector<int, 64> Mask0, Mask1;
6081 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6082 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6083 for (int i = 0; i != (int)MaskSize; ++i) {
6084 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6085 // loops converting between OR and BLEND shuffles due to
6086 // canWidenShuffleElements merging away undef elements, meaning we
6087 // fail to recognise the OR as the undef element isn't known zero.
6088 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6089 Mask.push_back(SM_SentinelZero);
6090 else if (Mask1[i] == SM_SentinelZero)
6091 Mask.push_back(i);
6092 else if (Mask0[i] == SM_SentinelZero)
6093 Mask.push_back(i + MaskSize);
6094 else
6095 return false;
6096 }
6097 Ops.push_back(N0);
6098 Ops.push_back(N1);
6099 return true;
6100 }
6101 case ISD::INSERT_SUBVECTOR: {
6102 SDValue Src = N.getOperand(0);
6103 SDValue Sub = N.getOperand(1);
6104 EVT SubVT = Sub.getValueType();
6105 unsigned NumSubElts = SubVT.getVectorNumElements();
6106 if (!N->isOnlyUserOf(Sub.getNode()))
6107 return false;
6108 SDValue SubBC = peekThroughBitcasts(Sub);
6109 uint64_t InsertIdx = N.getConstantOperandVal(2);
6110 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6111 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6112 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6113 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
6114 SDValue SubBCSrc = SubBC.getOperand(0);
6115 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
6116 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
6117 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
6118 "Subvector valuetype mismatch");
6119 InsertIdx *= (MaxElts / NumElts);
6120 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
6121 NumSubElts *= (MaxElts / NumElts);
6122 bool SrcIsUndef = Src.isUndef();
6123 for (int i = 0; i != (int)MaxElts; ++i)
6124 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6125 for (int i = 0; i != (int)NumSubElts; ++i)
6126 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6127 if (!SrcIsUndef)
6128 Ops.push_back(Src);
6129 Ops.push_back(SubBCSrc);
6130 return true;
6131 }
6132 // Handle CONCAT(SUB0, SUB1).
6133 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
6134 // cross lane shuffles.
6135 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6136 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
6137 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6138 Src.getOperand(0).isUndef() &&
6139 Src.getOperand(1).getValueType() == SubVT &&
6140 Src.getConstantOperandVal(2) == 0) {
6141 for (int i = 0; i != (int)NumSubElts; ++i)
6142 Mask.push_back(i);
6143 for (int i = 0; i != (int)NumSubElts; ++i)
6144 Mask.push_back(i + NumElts);
6145 Ops.push_back(Src.getOperand(1));
6146 Ops.push_back(Sub);
6147 return true;
6148 }
6149 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6150 SmallVector<int, 64> SubMask;
6151 SmallVector<SDValue, 2> SubInputs;
6152 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6153 EVT SubSrcVT = SubSrc.getValueType();
6154 if (!SubSrcVT.isVector())
6155 return false;
6156
6157 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6158 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6159 Depth + 1, ResolveKnownElts))
6160 return false;
6161
6162 // Subvector shuffle inputs must not be larger than the subvector.
6163 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6164 return SubVT.getFixedSizeInBits() <
6165 SubInput.getValueSizeInBits().getFixedValue();
6166 }))
6167 return false;
6168
6169 if (SubMask.size() != NumSubElts) {
6170 assert(((SubMask.size() % NumSubElts) == 0 ||
6171 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
6172 if ((NumSubElts % SubMask.size()) == 0) {
6173 int Scale = NumSubElts / SubMask.size();
6174 SmallVector<int,64> ScaledSubMask;
6175 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6176 SubMask = ScaledSubMask;
6177 } else {
6178 int Scale = SubMask.size() / NumSubElts;
6179 NumSubElts = SubMask.size();
6180 NumElts *= Scale;
6181 InsertIdx *= Scale;
6182 }
6183 }
6184 Ops.push_back(Src);
6185 Ops.append(SubInputs.begin(), SubInputs.end());
6186 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6187 Mask.append(NumElts, SM_SentinelZero);
6188 else
6189 for (int i = 0; i != (int)NumElts; ++i)
6190 Mask.push_back(i);
6191 for (int i = 0; i != (int)NumSubElts; ++i) {
6192 int M = SubMask[i];
6193 if (0 <= M) {
6194 int InputIdx = M / NumSubElts;
6195 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6196 }
6197 Mask[i + InsertIdx] = M;
6198 }
6199 return true;
6200 }
6201 case X86ISD::PINSRB:
6202 case X86ISD::PINSRW:
6205 // Match against a insert_vector_elt/scalar_to_vector of an extract from a
6206 // vector, for matching src/dst vector types.
6207 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6208
6209 unsigned DstIdx = 0;
6210 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6211 // Check we have an in-range constant insertion index.
6212 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6213 N.getConstantOperandAPInt(2).uge(NumElts))
6214 return false;
6215 DstIdx = N.getConstantOperandVal(2);
6216
6217 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6218 if (X86::isZeroNode(Scl)) {
6219 Ops.push_back(N.getOperand(0));
6220 for (unsigned i = 0; i != NumElts; ++i)
6221 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6222 return true;
6223 }
6224 }
6225
6226 // Peek through trunc/aext/zext/bitcast.
6227 // TODO: aext shouldn't require SM_SentinelZero padding.
6228 // TODO: handle shift of scalars.
6229 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6230 while (Scl.getOpcode() == ISD::TRUNCATE ||
6231 Scl.getOpcode() == ISD::ANY_EXTEND ||
6232 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6233 (Scl.getOpcode() == ISD::BITCAST &&
6236 Scl = Scl.getOperand(0);
6237 MinBitsPerElt =
6238 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6239 }
6240 if ((MinBitsPerElt % 8) != 0)
6241 return false;
6242
6243 // Attempt to find the source vector the scalar was extracted from.
6244 SDValue SrcExtract;
6245 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6246 Scl.getOpcode() == X86ISD::PEXTRW ||
6247 Scl.getOpcode() == X86ISD::PEXTRB) &&
6248 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6249 SrcExtract = Scl;
6250 }
6251 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6252 return false;
6253
6254 SDValue SrcVec = SrcExtract.getOperand(0);
6255 EVT SrcVT = SrcVec.getValueType();
6256 if (!SrcVT.getScalarType().isByteSized())
6257 return false;
6258 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6259 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6260 unsigned DstByte = DstIdx * NumBytesPerElt;
6261 MinBitsPerElt =
6262 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6263
6264 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6265 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6266 Ops.push_back(SrcVec);
6267 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6268 } else {
6269 Ops.push_back(SrcVec);
6270 Ops.push_back(N.getOperand(0));
6271 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6272 Mask.push_back(NumSizeInBytes + i);
6273 }
6274
6275 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6276 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6277 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6278 Mask[DstByte + i] = SrcByte + i;
6279 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6280 Mask[DstByte + i] = SM_SentinelZero;
6281 return true;
6282 }
6283 case X86ISD::PACKSS:
6284 case X86ISD::PACKUS: {
6285 SDValue N0 = N.getOperand(0);
6286 SDValue N1 = N.getOperand(1);
6287 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6288 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6289 "Unexpected input value type");
6290
6291 APInt EltsLHS, EltsRHS;
6292 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6293
6294 // If we know input saturation won't happen (or we don't care for particular
6295 // lanes), we can treat this as a truncation shuffle.
6296 bool Offset0 = false, Offset1 = false;
6297 if (Opcode == X86ISD::PACKSS) {
6298 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6299 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6300 (!(N1.isUndef() || EltsRHS.isZero()) &&
6301 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6302 return false;
6303 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6304 // PACKSS then it was likely being used for sign-extension for a
6305 // truncation, so just peek through and adjust the mask accordingly.
6306 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6307 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6308 Offset0 = true;
6309 N0 = N0.getOperand(0);
6310 }
6311 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6312 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6313 Offset1 = true;
6314 N1 = N1.getOperand(0);
6315 }
6316 } else {
6317 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6318 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6319 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6320 (!(N1.isUndef() || EltsRHS.isZero()) &&
6321 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6322 return false;
6323 }
6324
6325 bool IsUnary = (N0 == N1);
6326
6327 Ops.push_back(N0);
6328 if (!IsUnary)
6329 Ops.push_back(N1);
6330
6331 createPackShuffleMask(VT, Mask, IsUnary);
6332
6333 if (Offset0 || Offset1) {
6334 for (int &M : Mask)
6335 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6336 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6337 ++M;
6338 }
6339 return true;
6340 }
6341 case ISD::VSELECT:
6342 case X86ISD::BLENDV: {
6343 SDValue Cond = N.getOperand(0);
6344 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6345 Ops.push_back(N.getOperand(1));
6346 Ops.push_back(N.getOperand(2));
6347 return true;
6348 }
6349 return false;
6350 }
6351 case X86ISD::VTRUNC: {
6352 SDValue Src = N.getOperand(0);
6353 EVT SrcVT = Src.getValueType();
6354 // Truncated source must be a simple vector.
6355 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6356 (SrcVT.getScalarSizeInBits() % 8) != 0)
6357 return false;
6358 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6359 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6360 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6361 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6362 for (unsigned i = 0; i != NumSrcElts; ++i)
6363 Mask.push_back(i * Scale);
6364 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6365 Ops.push_back(Src);
6366 return true;
6367 }
6368 case ISD::SHL:
6369 case ISD::SRL: {
6370 // We can only decode 'whole byte' bit shifts as shuffles.
6371 std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
6372 if (!Amt || (*Amt % 8) != 0)
6373 return false;
6374
6375 uint64_t ByteShift = *Amt / 8;
6376 Ops.push_back(N.getOperand(0));
6377
6378 // Clear mask to all zeros and insert the shifted byte indices.
6379 Mask.append(NumSizeInBytes, SM_SentinelZero);
6380
6381 if (ISD::SHL == Opcode) {
6382 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6383 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6384 Mask[i + j] = i + j - ByteShift;
6385 } else {
6386 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6387 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6388 Mask[i + j - ByteShift] = i + j;
6389 }
6390 return true;
6391 }
6392 case X86ISD::VSHLI:
6393 case X86ISD::VSRLI: {
6394 uint64_t ShiftVal = N.getConstantOperandVal(1);
6395 // Out of range bit shifts are guaranteed to be zero.
6396 if (NumBitsPerElt <= ShiftVal) {
6397 Mask.append(NumElts, SM_SentinelZero);
6398 return true;
6399 }
6400
6401 // We can only decode 'whole byte' bit shifts as shuffles.
6402 if ((ShiftVal % 8) != 0)
6403 break;
6404
6405 uint64_t ByteShift = ShiftVal / 8;
6406 Ops.push_back(N.getOperand(0));
6407
6408 // Clear mask to all zeros and insert the shifted byte indices.
6409 Mask.append(NumSizeInBytes, SM_SentinelZero);
6410
6411 if (X86ISD::VSHLI == Opcode) {
6412 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6413 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6414 Mask[i + j] = i + j - ByteShift;
6415 } else {
6416 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6417 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6418 Mask[i + j - ByteShift] = i + j;
6419 }
6420 return true;
6421 }
6422 case X86ISD::VROTLI:
6423 case X86ISD::VROTRI: {
6424 // We can only decode 'whole byte' bit rotates as shuffles.
6425 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6426 if ((RotateVal % 8) != 0)
6427 return false;
6428 Ops.push_back(N.getOperand(0));
6429 int Offset = RotateVal / 8;
6430 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6431 for (int i = 0; i != (int)NumElts; ++i) {
6432 int BaseIdx = i * NumBytesPerElt;
6433 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6434 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6435 }
6436 }
6437 return true;
6438 }
6439 case X86ISD::VBROADCAST: {
6440 SDValue Src = N.getOperand(0);
6441 if (!Src.getSimpleValueType().isVector()) {
6442 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6443 !isNullConstant(Src.getOperand(1)) ||
6444 Src.getOperand(0).getValueType().getScalarType() !=
6445 VT.getScalarType())
6446 return false;
6447 Src = Src.getOperand(0);
6448 }
6449 Ops.push_back(Src);
6450 Mask.append(NumElts, 0);
6451 return true;
6452 }
6454 SDValue Src = N.getOperand(0);
6455 EVT SrcVT = Src.getValueType();
6456 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6457
6458 // Extended source must be a simple vector.
6459 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6460 (NumBitsPerSrcElt % 8) != 0)
6461 return false;
6462
6463 // We can only handle all-signbits extensions.
6464 APInt DemandedSrcElts =
6465 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6466 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6467 return false;
6468
6469 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6470 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6471 for (unsigned I = 0; I != NumElts; ++I)
6472 Mask.append(Scale, I);
6473 Ops.push_back(Src);
6474 return true;
6475 }
6476 case ISD::ZERO_EXTEND:
6477 case ISD::ANY_EXTEND:
6480 SDValue Src = N.getOperand(0);
6481 EVT SrcVT = Src.getValueType();
6482
6483 // Extended source must be a simple vector.
6484 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6485 (SrcVT.getScalarSizeInBits() % 8) != 0)
6486 return false;
6487
6488 bool IsAnyExtend =
6489 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6490 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6491 IsAnyExtend, Mask);
6492 Ops.push_back(Src);
6493 return true;
6494 }
6495 }
6496
6497 return false;
6498}
6499
6500/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
/// The mask addresses input operands in consecutive MaskWidth-sized index
/// ranges (one range per entry of Inputs); dropping or merging an input
/// shifts every later range down by MaskWidth.
6502 SmallVectorImpl<int> &Mask) {
6503 int MaskWidth = Mask.size();
6504 SmallVector<SDValue, 16> UsedInputs;
6505 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6506 int lo = UsedInputs.size() * MaskWidth;
6507 int hi = lo + MaskWidth;
6508
6509 // Strip UNDEF input usage.
6510 if (Inputs[i].isUndef())
6511 for (int &M : Mask)
6512 if ((lo <= M) && (M < hi))
6513 M = SM_SentinelUndef;
6514
6515 // Check for unused inputs.
6516 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6517 for (int &M : Mask)
6518 if (lo <= M)
6519 M -= MaskWidth;
6520 continue;
6521 }
6522
6523 // Check for repeated inputs.
6524 bool IsRepeat = false;
6525 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6526 if (UsedInputs[j] != Inputs[i])
6527 continue;
 // Redirect this input's mask range onto its first occurrence (j) and
 // shift all later ranges down by one input width.
6528 for (int &M : Mask)
6529 if (lo <= M)
6530 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6531 IsRepeat = true;
6532 break;
6533 }
6534 if (IsRepeat)
6535 continue;
6536
6537 UsedInputs.push_back(Inputs[i]);
6538 }
6539 Inputs = UsedInputs;
6540}
6541
6542/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6543/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6544/// Returns true if the target shuffle mask was decoded.
/// Falls back to getFauxShuffleMask for nodes that are not native target
/// shuffles but can still be modelled as a shuffle of their operands.
6545static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6548 APInt &KnownUndef, APInt &KnownZero,
6549 const SelectionDAG &DAG, unsigned Depth,
6550 bool ResolveKnownElts) {
6552 return false; // Limit search depth.
6553
6554 EVT VT = Op.getValueType();
6555 if (!VT.isSimple() || !VT.isVector())
6556 return false;
6557
6558 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6559 if (ResolveKnownElts)
6560 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6561 return true;
6562 }
6563 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6564 ResolveKnownElts)) {
 // Faux shuffles already encode zero/undef in the mask; derive Known* bits.
6565 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6566 return true;
6567 }
6568 return false;
6569}
6570
/// Overload that discards the KnownUndef/KnownZero results.
6571static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6574 const SelectionDAG &DAG, unsigned Depth,
6575 bool ResolveKnownElts) {
6576 APInt KnownUndef, KnownZero;
6577 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6578 KnownZero, DAG, Depth, ResolveKnownElts);
6579}
6580
6583 const SelectionDAG &DAG, unsigned Depth = 0,
6584 bool ResolveKnownElts = true) {
 // Convenience overload: demand all elements of Op and forward to the
 // DemandedElts-based implementation.
6585 EVT VT = Op.getValueType();
6586 if (!VT.isSimple() || !VT.isVector())
6587 return false;
6588
6589 unsigned NumElts = Op.getValueType().getVectorNumElements();
6590 APInt DemandedElts = APInt::getAllOnes(NumElts);
6591 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6592 ResolveKnownElts);
6593}
6594
6595// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
// Returns SDValue() if the memory operation cannot safely be rebuilt as a
// broadcast load (non-simple, non-temporal or non-read accesses).
6596static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6597 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6598 SelectionDAG &DAG) {
6599 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6600 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6601 "Unknown broadcast load type");
6602
6603 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6604 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6605 return SDValue();
6606
 // Build the broadcast memory intrinsic on the original chain, reusing the
 // original memory operand at the given byte Offset so alias info survives.
6609 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6610 SDValue Ops[] = {Mem->getChain(), Ptr};
6611 SDValue BcstLd = DAG.getMemIntrinsicNode(
6612 Opcode, DL, Tys, Ops, MemVT,
6614 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
 // Keep users of the original load correctly ordered after the new load.
6615 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6616 return BcstLd;
6617}
6618
6619/// Returns the scalar element that will make up the i'th
6620/// element of the result of the vector shuffle.
/// Returns SDValue() when the element cannot be determined statically.
6621static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6622 SelectionDAG &DAG, unsigned Depth) {
6624 return SDValue(); // Limit search depth.
6625
6626 EVT VT = Op.getValueType();
6627 unsigned Opcode = Op.getOpcode();
6628 unsigned NumElems = VT.getVectorNumElements();
6629
6630 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6631 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6632 int Elt = SV->getMaskElt(Index);
6633
6634 if (Elt < 0)
6635 return DAG.getUNDEF(VT.getVectorElementType());
6636
 // Mask entries >= NumElems select from the second shuffle operand.
6637 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6638 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6639 }
6640
6641 // Recurse into target specific vector shuffles to find scalars.
6642 if (isTargetShuffle(Opcode)) {
6643 MVT ShufVT = VT.getSimpleVT();
6644 MVT ShufSVT = ShufVT.getVectorElementType();
6645 int NumElems = (int)ShufVT.getVectorNumElements();
6646 SmallVector<int, 16> ShuffleMask;
6648 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6649 return SDValue();
6650
6651 int Elt = ShuffleMask[Index];
6652 if (Elt == SM_SentinelZero)
6653 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6654 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6655 if (Elt == SM_SentinelUndef)
6656 return DAG.getUNDEF(ShufSVT);
6657
6658 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6659 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6660 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6661 }
6662
6663 // Recurse into insert_subvector base/sub vector to find scalars.
6664 if (Opcode == ISD::INSERT_SUBVECTOR) {
6665 SDValue Vec = Op.getOperand(0);
6666 SDValue Sub = Op.getOperand(1);
6667 uint64_t SubIdx = Op.getConstantOperandVal(2);
6668 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6669
6670 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6671 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6672 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6673 }
6674
6675 // Recurse into concat_vectors sub vector to find scalars.
6676 if (Opcode == ISD::CONCAT_VECTORS) {
6677 EVT SubVT = Op.getOperand(0).getValueType();
6678 unsigned NumSubElts = SubVT.getVectorNumElements();
6679 uint64_t SubIdx = Index / NumSubElts;
6680 uint64_t SubElt = Index % NumSubElts;
6681 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6682 }
6683
6684 // Recurse into extract_subvector src vector to find scalars.
6685 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6686 SDValue Src = Op.getOperand(0);
6687 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6688 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6689 }
6690
6691 // We only peek through bitcasts of the same vector width.
6692 if (Opcode == ISD::BITCAST) {
6693 SDValue Src = Op.getOperand(0);
6694 EVT SrcVT = Src.getValueType();
6695 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6696 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6697 return SDValue();
6698 }
6699
6700 // Actual nodes that may contain scalar elements
6701
6702 // For insert_vector_elt - either return the index matching scalar or recurse
6703 // into the base vector.
6704 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6705 isa<ConstantSDNode>(Op.getOperand(2))) {
6706 if (Op.getConstantOperandAPInt(2) == Index)
6707 return Op.getOperand(1);
6708 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6709 }
6710
 // scalar_to_vector defines lane 0 only; every other lane is undef.
6711 if (Opcode == ISD::SCALAR_TO_VECTOR)
6712 return (Index == 0) ? Op.getOperand(0)
6713 : DAG.getUNDEF(VT.getVectorElementType());
6714
 // build_vector provides the requested element directly.
6715 if (Opcode == ISD::BUILD_VECTOR)
6716 return Op.getOperand(Index);
6717
6718 return SDValue();
6719}
6720
6721// Use PINSRB/PINSRW/PINSRD to create a build vector.
// Inserts each non-zero element in turn; the first insertion seeds either a
// zero vector (when zeros are present or the first non-zero element is not at
// index 0 — this also breaks any false register dependency) or a plain
// SCALAR_TO_VECTOR.
6723 const APInt &NonZeroMask,
6724 unsigned NumNonZero, unsigned NumZero,
6725 SelectionDAG &DAG,
6726 const X86Subtarget &Subtarget) {
6727 MVT VT = Op.getSimpleValueType();
6728 unsigned NumElts = VT.getVectorNumElements();
6729 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6730 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6731 "Illegal vector insertion");
6732
6733 SDValue V;
6734 bool First = true;
6735
6736 for (unsigned i = 0; i < NumElts; ++i) {
6737 bool IsNonZero = NonZeroMask[i];
6738 if (!IsNonZero)
6739 continue;
6740
6741 // If the build vector contains zeros or our first insertion is not the
6742 // first index then insert into zero vector to break any register
6743 // dependency else use SCALAR_TO_VECTOR.
6744 if (First) {
6745 First = false;
6746 if (NumZero || 0 != i)
6747 V = getZeroVector(VT, Subtarget, DAG, DL);
6748 else {
6749 assert(0 == i && "Expected insertion into zero-index");
6750 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6751 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6752 V = DAG.getBitcast(VT, V);
6753 continue;
6754 }
6755 }
6756 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6757 DAG.getVectorIdxConstant(i, DL));
6758 }
6759
6760 return V;
6761}
6762
6763/// Custom lower build_vector of v16i8.
/// With SSE4.1 each byte is inserted directly with PINSRB; otherwise adjacent
/// byte pairs are merged into 16-bit values and inserted with PINSRW, with a
/// MOVD fast-path when both of the low two byte pairs are non-zero.
6765 const APInt &NonZeroMask,
6766 unsigned NumNonZero, unsigned NumZero,
6767 SelectionDAG &DAG,
6768 const X86Subtarget &Subtarget) {
6769 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6770 return SDValue();
6771
6772 // SSE4.1 - use PINSRB to insert each byte directly.
6773 if (Subtarget.hasSSE41())
6774 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6775 DAG, Subtarget);
6776
6777 SDValue V;
6778
6779 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6780 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6781 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6782 !NonZeroMask.extractBits(2, 2).isZero()) {
 // OR the low four bytes together into a single i32 scalar.
6783 for (unsigned I = 0; I != 4; ++I) {
6784 if (!NonZeroMask[I])
6785 continue;
6786 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6787 if (I != 0)
6788 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6789 DAG.getConstant(I * 8, DL, MVT::i8));
6790 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6791 }
6792 assert(V && "Failed to fold v16i8 vector to zero");
6793 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6794 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6795 V = DAG.getBitcast(MVT::v8i16, V);
6796 }
 // Insert the remaining bytes pairwise as 16-bit words (skip the first two
 // pairs if the MOVD fast-path above already materialized them).
6797 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6798 bool ThisIsNonZero = NonZeroMask[i];
6799 bool NextIsNonZero = NonZeroMask[i + 1];
6800 if (!ThisIsNonZero && !NextIsNonZero)
6801 continue;
6802
6803 SDValue Elt;
6804 if (ThisIsNonZero) {
 // Zero-extend when the upper byte of the pair must be well-defined.
6805 if (NumZero || NextIsNonZero)
6806 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6807 else
6808 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6809 }
6810
6811 if (NextIsNonZero) {
6812 SDValue NextElt = Op.getOperand(i + 1);
6813 if (i == 0 && NumZero)
6814 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6815 else
6816 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6817 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6818 DAG.getConstant(8, DL, MVT::i8));
6819 if (ThisIsNonZero)
6820 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6821 else
6822 Elt = NextElt;
6823 }
6824
6825 // If our first insertion is not the first index or zeros are needed, then
6826 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6827 // elements undefined).
6828 if (!V) {
6829 if (i != 0 || NumZero)
6830 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL)
6831 else {
6832 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6833 V = DAG.getBitcast(MVT::v8i16, V);
6834 continue;
6835 }
6836 }
6837 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6838 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6839 DAG.getVectorIdxConstant(i / 2, DL));
6840 }
6841
6842 return DAG.getBitcast(MVT::v16i8, V);
6843}
6844
6845/// Custom lower build_vector of v8i16.
/// Fails (returns SDValue()) when more than 4 non-zero elements exist
/// without SSE4.1, where the insertion sequence is not profitable.
6847 const APInt &NonZeroMask,
6848 unsigned NumNonZero, unsigned NumZero,
6849 SelectionDAG &DAG,
6850 const X86Subtarget &Subtarget) {
6851 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6852 return SDValue();
6853
6854 // Use PINSRW to insert each word directly.
6855 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6856 Subtarget);
6857}
6858
6859/// Custom lower build_vector of v4i32 or v4f32.
/// Tries, in order: a MOVDDUP of a repeated element pair, a blend-with-zero
/// shuffle when all elements extract in-place from one source, and finally a
/// single INSERTPS (SSE4.1) combining two sources with a zeroing mask.
6861 SelectionDAG &DAG,
6862 const X86Subtarget &Subtarget) {
6863 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6864 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6865 // Because we're creating a less complicated build vector here, we may enable
6866 // further folding of the MOVDDUP via shuffle transforms.
6867 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6868 Op.getOperand(0) == Op.getOperand(2) &&
6869 Op.getOperand(1) == Op.getOperand(3) &&
6870 Op.getOperand(0) != Op.getOperand(1)) {
6871 MVT VT = Op.getSimpleValueType();
6872 MVT EltVT = VT.getVectorElementType();
6873 // Create a new build vector with the first 2 elements followed by undef
6874 // padding, bitcast to v2f64, duplicate, and bitcast back.
6875 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6876 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6877 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6878 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6879 return DAG.getBitcast(VT, Dup);
6880 }
6881
6882 // Find all zeroable elements.
6883 std::bitset<4> Zeroable, Undefs;
6884 for (int i = 0; i < 4; ++i) {
6885 SDValue Elt = Op.getOperand(i);
6886 Undefs[i] = Elt.isUndef();
6887 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6888 }
6889 assert(Zeroable.size() - Zeroable.count() > 1 &&
6890 "We expect at least two non-zero elements!");
6891
6892 // We only know how to deal with build_vector nodes where elements are either
6893 // zeroable or extract_vector_elt with constant index.
6894 SDValue FirstNonZero;
6895 unsigned FirstNonZeroIdx;
6896 for (unsigned i = 0; i < 4; ++i) {
6897 if (Zeroable[i])
6898 continue;
6899 SDValue Elt = Op.getOperand(i);
6900 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6901 !isa<ConstantSDNode>(Elt.getOperand(1)))
6902 return SDValue();
6903 // Make sure that this node is extracting from a 128-bit vector.
6904 MVT VT = Elt.getOperand(0).getSimpleValueType();
6905 if (!VT.is128BitVector())
6906 return SDValue();
6907 if (!FirstNonZero.getNode()) {
6908 FirstNonZero = Elt;
6909 FirstNonZeroIdx = i;
6910 }
6911 }
6912
6913 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6914 SDValue V1 = FirstNonZero.getOperand(0);
6915 MVT VT = V1.getSimpleValueType();
6916
6917 // See if this build_vector can be lowered as a blend with zero.
6918 SDValue Elt;
6919 unsigned EltMaskIdx, EltIdx;
6920 int Mask[4];
6921 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6922 if (Zeroable[EltIdx]) {
6923 // The zero vector will be on the right hand side.
6924 Mask[EltIdx] = EltIdx+4;
6925 continue;
6926 }
6927
6928 Elt = Op->getOperand(EltIdx);
6929 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6930 EltMaskIdx = Elt.getConstantOperandVal(1);
 // Stop at the first element not extracted in-place from V1; that element
 // becomes the INSERTPS candidate below.
6931 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6932 break;
6933 Mask[EltIdx] = EltIdx;
6934 }
6935
6936 if (EltIdx == 4) {
6937 // Let the shuffle legalizer deal with blend operations.
6938 SDValue VZeroOrUndef = (Zeroable == Undefs)
6939 ? DAG.getUNDEF(VT)
6940 : getZeroVector(VT, Subtarget, DAG, DL);
6941 if (V1.getSimpleValueType() != VT)
6942 V1 = DAG.getBitcast(VT, V1);
6943 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6944 }
6945
6946 // See if we can lower this build_vector to a INSERTPS.
6947 if (!Subtarget.hasSSE41())
6948 return SDValue();
6949
6950 SDValue V2 = Elt.getOperand(0);
6951 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6952 V1 = SDValue();
6953
 // All remaining non-zero elements must extract in-place from a single V1.
6954 bool CanFold = true;
6955 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6956 if (Zeroable[i])
6957 continue;
6958
6959 SDValue Current = Op->getOperand(i);
6960 SDValue SrcVector = Current->getOperand(0);
6961 if (!V1.getNode())
6962 V1 = SrcVector;
6963 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6964 }
6965
6966 if (!CanFold)
6967 return SDValue();
6968
6969 assert(V1.getNode() && "Expected at least two non-zero elements!");
6970 if (V1.getSimpleValueType() != MVT::v4f32)
6971 V1 = DAG.getBitcast(MVT::v4f32, V1);
6972 if (V2.getSimpleValueType() != MVT::v4f32)
6973 V2 = DAG.getBitcast(MVT::v4f32, V2);
6974
6975 // Ok, we can emit an INSERTPS instruction.
6976 unsigned ZMask = Zeroable.to_ulong();
6977
 // INSERTPS imm8: bits[7:6] = source lane, bits[5:4] = dest lane,
 // bits[3:0] = zero mask.
6978 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6979 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6980 SDValue Result =
6981 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6982 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
6983 return DAG.getBitcast(VT, Result);
6984}
6985
6986/// Return a vector logical shift node.
6987static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6988 SelectionDAG &DAG, const TargetLowering &TLI,
6989 const SDLoc &dl) {
6990 assert(VT.is128BitVector() && "Unknown type for VShift");
6991 MVT ShVT = MVT::v16i8;
6992 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6993 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6994 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6995 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6996 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6997}
6998
7000 SelectionDAG &DAG) {
7001
7002 // Check if the scalar load can be widened into a vector load. And if
7003 // the address is "base + cst" see if the cst can be "absorbed" into
7004 // the shuffle mask.
7005 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7006 SDValue Ptr = LD->getBasePtr();
7007 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7008 return SDValue();
7009 EVT PVT = LD->getValueType(0);
7010 if (PVT != MVT::i32 && PVT != MVT::f32)
7011 return SDValue();
7012
 // Only frame-index bases (optionally plus a constant offset) are handled,
 // because their alignment can be raised below if needed.
7013 int FI = -1;
7014 int64_t Offset = 0;
7015 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7016 FI = FINode->getIndex();
7017 Offset = 0;
7018 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7019 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7020 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7021 Offset = Ptr.getConstantOperandVal(1);
7022 Ptr = Ptr.getOperand(0);
7023 } else {
7024 return SDValue();
7025 }
7026
7027 // FIXME: 256-bit vector instructions don't require a strict alignment,
7028 // improve this code to support it better.
7029 Align RequiredAlign(VT.getSizeInBits() / 8);
7030 SDValue Chain = LD->getChain();
7031 // Make sure the stack object alignment is at least 16 or 32.
7033 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7034 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7035 if (MFI.isFixedObjectIndex(FI)) {
7036 // Can't change the alignment. FIXME: It's possible to compute
7037 // the exact stack offset and reference FI + adjust offset instead.
7038 // If someone *really* cares about this. That's the way to implement it.
7039 return SDValue();
7040 } else {
7041 MFI.setObjectAlignment(FI, RequiredAlign);
7042 }
7043 }
7044
7045 // (Offset % 16 or 32) must be multiple of 4. The address is then
7046 // Ptr + (Offset & ~15).
7047 if (Offset < 0)
7048 return SDValue();
7049 if ((Offset % RequiredAlign.value()) & 3)
7050 return SDValue();
7051 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7052 if (StartOffset) {
7053 SDLoc DL(Ptr);
7054 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7055 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7056 }
7057
 // Index of the originally-loaded 32-bit element within the widened load.
7058 int EltNo = (Offset - StartOffset) >> 2;
7059 unsigned NumElems = VT.getVectorNumElements();
7060
7061 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7062 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7063 LD->getPointerInfo().getWithOffset(StartOffset));
7064
 // Splat the selected element across all lanes.
7065 SmallVector<int, 8> Mask(NumElems, EltNo);
7066
7067 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7068 }
7069
7070 return SDValue();
7071}
7072
7073// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
// Returns true if Elt's value can be traced to a byte-aligned slice of a
// simple non-extending load; Ld and ByteOffset receive the result.
7074static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
 // Base case: a plain simple load at offset 0.
7075 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7076 auto *BaseLd = cast<LoadSDNode>(Elt);
7077 if (!BaseLd->isSimple())
7078 return false;
7079 Ld = BaseLd;
7080 ByteOffset = 0;
7081 return true;
7082 }
7083
7084 switch (Elt.getOpcode()) {
7085 case ISD::BITCAST:
7086 case ISD::TRUNCATE:
7088 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
 // Whole-byte logical right shifts fold into the accumulated byte offset.
7089 case ISD::SRL:
7090 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7091 uint64_t Amt = AmtC->getZExtValue();
7092 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7093 ByteOffset += Amt / 8;
7094 return true;
7095 }
7096 }
7097 break;
 // Constant-index extraction: fold the element's byte position within the
 // source into the accumulated offset.
7099 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7100 SDValue Src = Elt.getOperand(0);
7101 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7102 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7103 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7104 findEltLoadSrc(Src, Ld, ByteOffset)) {
7105 uint64_t Idx = IdxC->getZExtValue();
7106 ByteOffset += Idx * (SrcSizeInBits / 8);
7107 return true;
7108 }
7109 }
7110 break;
7111 }
7112
7113 return false;
7114}
7115
7116/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7117/// elements can be replaced by a single large load which has the same value as
7118/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7119///
7120/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7122 const SDLoc &DL, SelectionDAG &DAG,
7123 const X86Subtarget &Subtarget,
7124 bool IsAfterLegalize) {
7125 if ((VT.getScalarSizeInBits() % 8) != 0)
7126 return SDValue();
7127
7128 unsigned NumElems = Elts.size();
7129
7130 int LastLoadedElt = -1;
7131 APInt LoadMask = APInt::getZero(NumElems);
7132 APInt ZeroMask = APInt::getZero(NumElems);
7133 APInt UndefMask = APInt::getZero(NumElems);
7134
7135 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7136 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7137
7138 // For each element in the initializer, see if we've found a load, zero or an
7139 // undef.
7140 for (unsigned i = 0; i < NumElems; ++i) {
7141 SDValue Elt = peekThroughBitcasts(Elts[i]);
7142 if (!Elt.getNode())
7143 return SDValue();
7144 if (Elt.isUndef()) {
7145 UndefMask.setBit(i);
7146 continue;
7147 }
7149 ZeroMask.setBit(i);
7150 continue;
7151 }
7152
7153 // Each loaded element must be the correct fractional portion of the
7154 // requested vector load.
7155 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7156 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7157 return SDValue();
7158
7159 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7160 return SDValue();
7161 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7162 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7163 return SDValue();
7164
7165 LoadMask.setBit(i);
7166 LastLoadedElt = i;
7167 }
7168 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7169 NumElems &&
7170 "Incomplete element masks");
7171
7172 // Handle Special Cases - all undef or undef/zero.
7173 if (UndefMask.popcount() == NumElems)
7174 return DAG.getUNDEF(VT);
7175 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7176 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7177 : DAG.getConstantFP(0.0, DL, VT);
7178
7179 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7180 int FirstLoadedElt = LoadMask.countr_zero();
7181 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7182 EVT EltBaseVT = EltBase.getValueType();
7183 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7184 "Register/Memory size mismatch");
7185 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7186 assert(LDBase && "Did not find base load for merging consecutive loads");
7187 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7188 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7189 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7190 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7191 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7192
7193 // TODO: Support offsetting the base load.
7194 if (ByteOffsets[FirstLoadedElt] != 0)
7195 return SDValue();
7196
7197 // Check to see if the element's load is consecutive to the base load
7198 // or offset from a previous (already checked) load.
7199 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7200 LoadSDNode *Ld = Loads[EltIdx];
7201 int64_t ByteOffset = ByteOffsets[EltIdx];
7202 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7203 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7204 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7205 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7206 }
7207 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7208 EltIdx - FirstLoadedElt);
7209 };
7210
7211 // Consecutive loads can contain UNDEFs but not ZERO elements.
7212 // Consecutive loads with UNDEF and ZERO elements require an
7213 // additional shuffle stage to clear the ZERO elements.
7214 bool IsConsecutiveLoad = true;
7215 bool IsConsecutiveLoadWithZeros = true;
7216 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7217 if (LoadMask[i]) {
7218 if (!CheckConsecutiveLoad(LDBase, i)) {
7219 IsConsecutiveLoad = false;
7220 IsConsecutiveLoadWithZeros = false;
7221 break;
7222 }
7223 } else if (ZeroMask[i]) {
7224 IsConsecutiveLoad = false;
7225 }
7226 }
7227
7228 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7229 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7230 assert(LDBase->isSimple() &&
7231 "Cannot merge volatile or atomic loads.");
7232 SDValue NewLd =
7233 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7234 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7235 MMOFlags);
7236 for (auto *LD : Loads)
7237 if (LD)
7238 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7239 return NewLd;
7240 };
7241
7242 // Check if the base load is entirely dereferenceable.
7243 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7244 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7245
7246 // LOAD - all consecutive load/undefs (must start/end with a load or be
7247 // entirely dereferenceable). If we have found an entire vector of loads and
7248 // undefs, then return a large load of the entire vector width starting at the
7249 // base pointer. If the vector contains zeros, then attempt to shuffle those
7250 // elements.
7251 if (FirstLoadedElt == 0 &&
7252 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7253 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7254 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7255 return SDValue();
7256
7257 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7258 // will lower to regular temporal loads and use the cache.
7259 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7260 VT.is256BitVector() && !Subtarget.hasInt256())
7261 return SDValue();
7262
7263 if (NumElems == 1)
7264 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7265
7266 if (!ZeroMask)
7267 return CreateLoad(VT, LDBase);
7268
7269 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7270 // vector and a zero vector to clear out the zero elements.
7271 if (!IsAfterLegalize && VT.isVector()) {
7272 unsigned NumMaskElts = VT.getVectorNumElements();
7273 if ((NumMaskElts % NumElems) == 0) {
7274 unsigned Scale = NumMaskElts / NumElems;
7275 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7276 for (unsigned i = 0; i < NumElems; ++i) {
7277 if (UndefMask[i])
7278 continue;
7279 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7280 for (unsigned j = 0; j != Scale; ++j)
7281 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7282 }
7283 SDValue V = CreateLoad(VT, LDBase);
7284 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7285 : DAG.getConstantFP(0.0, DL, VT);
7286 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7287 }
7288 }
7289 }
7290
7291 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7292 if (VT.is256BitVector() || VT.is512BitVector()) {
7293 unsigned HalfNumElems = NumElems / 2;
7294 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7295 EVT HalfVT =
7296 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7297 SDValue HalfLD =
7298 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7299 DAG, Subtarget, IsAfterLegalize);
7300 if (HalfLD)
7301 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7302 HalfLD, DAG.getVectorIdxConstant(0, DL));
7303 }
7304 }
7305
7306 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7307 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7308 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7309 LoadSizeInBits == 64) &&
7310 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7311 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7312 : MVT::getIntegerVT(LoadSizeInBits);
7313 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7314 // Allow v4f32 on SSE1 only targets.
7315 // FIXME: Add more isel patterns so we can just use VT directly.
7316 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7317 VecVT = MVT::v4f32;
7318 if (TLI.isTypeLegal(VecVT)) {
7319 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7320 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7321 SDValue ResNode = DAG.getMemIntrinsicNode(
7322 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7324 for (auto *LD : Loads)
7325 if (LD)
7326 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7327 return DAG.getBitcast(VT, ResNode);
7328 }
7329 }
7330
7331 // BROADCAST - match the smallest possible repetition pattern, load that
7332 // scalar/subvector element and then broadcast to the entire vector.
7333 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7334 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7335 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7336 unsigned RepeatSize = SubElems * BaseSizeInBits;
7337 unsigned ScalarSize = std::min(RepeatSize, 64u);
7338 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7339 continue;
7340
7341 // Don't attempt a 1:N subvector broadcast - it should be caught by
7342 // combineConcatVectorOps, else will cause infinite loops.
7343 if (RepeatSize > ScalarSize && SubElems == 1)
7344 continue;
7345
7346 bool Match = true;
7347 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7348 for (unsigned i = 0; i != NumElems && Match; ++i) {
7349 if (!LoadMask[i])
7350 continue;
7351 SDValue Elt = peekThroughBitcasts(Elts[i]);
7352 if (RepeatedLoads[i % SubElems].isUndef())
7353 RepeatedLoads[i % SubElems] = Elt;
7354 else
7355 Match &= (RepeatedLoads[i % SubElems] == Elt);
7356 }
7357
7358 // We must have loads at both ends of the repetition.
7359 Match &= !RepeatedLoads.front().isUndef();
7360 Match &= !RepeatedLoads.back().isUndef();
7361 if (!Match)
7362 continue;
7363
7364 EVT RepeatVT =
7365 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7366 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7367 : EVT::getFloatingPointVT(ScalarSize);
7368 if (RepeatSize > ScalarSize)
7369 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7370 RepeatSize / ScalarSize);
7371 EVT BroadcastVT =
7372 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7373 VT.getSizeInBits() / ScalarSize);
7374 if (TLI.isTypeLegal(BroadcastVT)) {
7375 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7376 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7377 SDValue Broadcast = RepeatLoad;
7378 if (RepeatSize > ScalarSize) {
7379 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7380 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7381 } else {
7382 if (!Subtarget.hasAVX2() &&
7384 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7385 Subtarget,
7386 /*AssumeSingleUse=*/true))
7387 return SDValue();
7388 Broadcast =
7389 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7390 }
7391 return DAG.getBitcast(VT, Broadcast);
7392 }
7393 }
7394 }
7395 }
7396
7397 return SDValue();
7398}
7399
7400// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7401// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7402// are consecutive, non-overlapping, and in the right order.
// NOTE(review): the opening declaration line (7403) was dropped by this
// extraction -- presumably `static SDValue combineToConsecutiveLoads(...)`;
// verify the full signature against upstream X86ISelLowering.cpp.
7404 SelectionDAG &DAG,
7405 const X86Subtarget &Subtarget,
7406 bool IsAfterLegalize) {
// NOTE(review): line 7407 missing here -- presumably the declaration of the
// local `Elts` vector used below; verify upstream.
// Resolve every output lane to the scalar that feeds it; if any lane cannot
// be traced back to a scalar, this combine does not apply.
7408 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7409 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7410 Elts.push_back(Elt);
7411 continue;
7412 }
7413 return SDValue();
7414 }
7415 assert(Elts.size() == VT.getVectorNumElements());
// All lanes resolved: try to merge the per-lane scalars into a single wide
// load (returns SDValue() if they do not form consecutive loads).
7416 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7417 IsAfterLegalize);
7418}
7419
// NOTE(review): the first line of this declaration (7420) was dropped by the
// extraction; judging by the body this is the ArrayRef-of-bit-patterns
// overload of getConstantVector -- verify the signature upstream.
7421 const APInt &Undefs, LLVMContext &C) {
// Builds an IR constant vector: element I is undef when Undefs[I] is set,
// otherwise a scalar materialized from the raw bit pattern Bits[I].
7422 unsigned ScalarSize = VT.getScalarSizeInBits();
7423 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7424
// Turn a raw APInt bit pattern into a scalar Constant of the element type;
// floating-point elements are rebuilt via APFloat from the same bits.
7425 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7426 if (VT.isFloatingPoint()) {
7427 if (ScalarSize == 16)
7428 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7429 if (ScalarSize == 32)
7430 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7431 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7432 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7433 }
7434 return Constant::getIntegerValue(Ty, Val);
7435 };
7436
7437 SmallVector<Constant *, 32> ConstantVec;
7438 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7439 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7440 : getConstantScalar(Bits[I]));
7441
7442 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7443}
7444
7445static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7446 unsigned SplatBitSize, LLVMContext &C) {
7447 unsigned ScalarSize = VT.getScalarSizeInBits();
7448
7449 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7450 if (VT.isFloatingPoint()) {
7451 if (ScalarSize == 16)
7452 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7453 if (ScalarSize == 32)
7454 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7455 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7456 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7457 }
7458 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7459 };
7460
7461 if (ScalarSize == SplatBitSize)
7462 return getConstantScalar(SplatValue);
7463
7464 unsigned NumElm = SplatBitSize / ScalarSize;
7465 SmallVector<Constant *, 32> ConstantVec;
7466 for (unsigned I = 0; I != NumElm; ++I) {
7467 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7468 ConstantVec.push_back(getConstantScalar(Val));
7469 }
7470 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7471}
7472
// NOTE(review): the declaration line (7473) was dropped by the extraction;
// judging by the recursive call below this is `isFoldableUseOfShuffle(SDNode
// *N)` -- verify against upstream.
// Scans the users of N to decide whether replacing N would interfere with a
// target shuffle that could otherwise fold it.
7474 for (auto *U : N->users()) {
7475 unsigned Opc = U->getOpcode();
7476 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7477 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7478 return false;
7479 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7480 return false;
// Any other target shuffle user counts as a foldable use.
7481 if (Opc == ISD::BITCAST) // Ignore bitcasts
7482 return true;
7483 if (Opc == ISD::BITCAST) // Ignore bitcasts
7484 return isFoldableUseOfShuffle(U);
7485 if (N->hasOneUse()) {
7486 // TODO, there may be some general way to know if a SDNode can
7487 // be folded. We now only know whether an MI is foldable.
7488 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7489 return false;
7490 return true;
7491 }
7492 }
7493 return false;
7494}
7495
7496/// Attempt to use the vbroadcast instruction to generate a splat value
7497/// from a splat BUILD_VECTOR which uses:
7498/// a. A single scalar load, or a constant.
7499/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7500///
7501/// The VBROADCAST node is returned when a pattern is found,
7502/// or SDValue() otherwise.
// NOTE(review): the declaration line (7503) was dropped by the extraction --
// presumably `static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode
// *BVOp, ...)` given the uses of BVOp below; verify upstream.
7504 const SDLoc &dl,
7505 const X86Subtarget &Subtarget,
7506 SelectionDAG &DAG) {
7507 // VBROADCAST requires AVX.
7508 // TODO: Splats could be generated for non-AVX CPUs using SSE
7509 // instructions, but there's less potential gain for only 128-bit vectors.
7510 if (!Subtarget.hasAVX())
7511 return SDValue();
7512
7513 MVT VT = BVOp->getSimpleValueType(0);
7514 unsigned NumElts = VT.getVectorNumElements();
7515 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7516 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7517 "Unsupported vector type for broadcast.");
7518
7519 // See if the build vector is a repeating sequence of scalars (inc. splat).
7520 SDValue Ld;
7521 BitVector UndefElements;
7522 SmallVector<SDValue, 16> Sequence;
7523 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7524 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7525 if (Sequence.size() == 1)
7526 Ld = Sequence[0];
7527 }
7528
7529 // Attempt to use VBROADCASTM
7530 // From this pattern:
7531 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7532 // b. t1 = (build_vector t0 t0)
7533 //
7534 // Create (VBROADCASTM v2i1 X)
7535 if (!Sequence.empty() && Subtarget.hasCDI()) {
7536 // If not a splat, are the upper sequence values zeroable?
7537 unsigned SeqLen = Sequence.size();
7538 bool UpperZeroOrUndef =
7539 SeqLen == 1 ||
7540 llvm::all_of(ArrayRef(Sequence).drop_front(),
7541 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7542 SDValue Op0 = Sequence[0];
7543 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7544 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7545 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7546 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7547 ? Op0.getOperand(0)
7548 : Op0.getOperand(0).getOperand(0);
7549 MVT MaskVT = BOperand.getSimpleValueType();
7550 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7551 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7552 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7553 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
// Without VLX the broadcast must be done at 512 bits and then the
// low subvector extracted back out.
7554 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7555 unsigned Scale = 512 / VT.getSizeInBits();
7556 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7557 }
7558 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7559 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7560 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7561 return DAG.getBitcast(VT, Bcst);
7562 }
7563 }
7564 }
7565
7566 unsigned NumUndefElts = UndefElements.count();
7567 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7568 APInt SplatValue, Undef;
7569 unsigned SplatBitSize;
7570 bool HasUndef;
7571 // Check if this is a repeated constant pattern suitable for broadcasting.
7572 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7573 SplatBitSize > VT.getScalarSizeInBits() &&
7574 SplatBitSize < VT.getSizeInBits()) {
7575 // Avoid replacing with broadcast when it's a use of a shuffle
7576 // instruction to preserve the present custom lowering of shuffles.
7577 if (isFoldableUseOfShuffle(BVOp))
7578 return SDValue();
7579 // replace BUILD_VECTOR with broadcast of the repeated constants.
7580 LLVMContext *Ctx = DAG.getContext();
7581 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7582 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7583 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7584 // Load the constant scalar/subvector and broadcast it.
7585 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7586 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7587 SDValue CP = DAG.getConstantPool(C, PVT);
7588 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7589
7590 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7591 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7592 SDValue Ops[] = {DAG.getEntryNode(), CP};
7593 MachinePointerInfo MPI =
// NOTE(review): source line 7594 (the MachinePointerInfo initializer,
// presumably MachinePointerInfo::getConstantPool) is missing from this
// extraction -- verify upstream.
7595 SDValue Brdcst =
7596 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7597 MPI, Alignment, MachineMemOperand::MOLoad);
7598 return DAG.getBitcast(VT, Brdcst);
7599 }
7600 if (SplatBitSize > 64) {
7601 // Load the vector of constants and broadcast it.
7602 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7603 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7604 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7605 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7606 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7607 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7608 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7609 MachinePointerInfo MPI =
// NOTE(review): source lines 7610-7611 and 7613 are missing from this
// extraction (the MPI initializer and the subvector-broadcast-load node
// creation/return) -- verify upstream.
7612 Ops, VVT, MPI, Alignment,
7614 }
7615 }
7616
7617 // If we are moving a scalar into a vector (Ld must be set and all elements
7618 // but 1 are undef) and that operation is not obviously supported by
7619 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7620 // That's better than general shuffling and may eliminate a load to GPR and
7621 // move from scalar to vector register.
7622 if (!Ld || NumElts - NumUndefElts != 1)
7623 return SDValue();
7624 unsigned ScalarSize = Ld.getValueSizeInBits();
7625 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7626 return SDValue();
7627 }
7628
7629 bool ConstSplatVal =
7630 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7631 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7632
7633 // TODO: Handle broadcasts of non-constant sequences.
7634
7635 // Make sure that all of the users of a non-constant load are from the
7636 // BUILD_VECTOR node.
7637 // FIXME: Is the use count needed for non-constant, non-load case?
7638 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7639 return SDValue();
7640
7641 unsigned ScalarSize = Ld.getValueSizeInBits();
7642 bool IsGE256 = (VT.getSizeInBits() >= 256);
7643
7644 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7645 // instruction to save 8 or more bytes of constant pool data.
7646 // TODO: If multiple splats are generated to load the same constant,
7647 // it may be detrimental to overall size. There needs to be a way to detect
7648 // that condition to know if this is truly a size win.
7649 bool OptForSize = DAG.shouldOptForSize();
7650
7651 // Handle broadcasting a single constant scalar from the constant pool
7652 // into a vector.
7653 // On Sandybridge (no AVX2), it is still better to load a constant vector
7654 // from the constant pool and not to broadcast it from a scalar.
7655 // But override that restriction when optimizing for size.
7656 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7657 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7658 EVT CVT = Ld.getValueType();
7659 assert(!CVT.isVector() && "Must not broadcast a vector type");
7660
7661 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7662 // For size optimization, also splat v2f64 and v2i64, and for size opt
7663 // with AVX2, also splat i8 and i16.
7664 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7665 if (ScalarSize == 32 ||
7666 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7667 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7668 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7669 const Constant *C = nullptr;
7670 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7671 C = CI->getConstantIntValue();
7672 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7673 C = CF->getConstantFPValue();
7674
7675 assert(C && "Invalid constant type");
7676
7677 SDValue CP =
// NOTE(review): source line 7678 (the constant-pool initializer for CP) is
// missing from this extraction -- verify upstream.
7679 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7680
7681 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7682 SDValue Ops[] = {DAG.getEntryNode(), CP};
7683 MachinePointerInfo MPI =
// NOTE(review): source line 7684 (the MPI initializer) is missing from this
// extraction -- verify upstream.
7685 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7686 MPI, Alignment, MachineMemOperand::MOLoad);
7687 }
7688 }
7689
7690 // Handle AVX2 in-register broadcasts.
7691 if (!IsLoad && Subtarget.hasInt256() &&
7692 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7693 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7694
7695 // The scalar source must be a normal load.
7696 if (!IsLoad)
7697 return SDValue();
7698
7699 // Make sure the non-chain result is only used by this build vector.
7700 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7701 return SDValue();
7702
7703 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7704 (Subtarget.hasVLX() && ScalarSize == 64)) {
7705 auto *LN = cast<LoadSDNode>(Ld);
7706 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7707 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7708 SDValue BCast =
// NOTE(review): source line 7709 (the broadcast-load node creation) is
// missing from this extraction -- verify upstream.
7710 LN->getMemoryVT(), LN->getMemOperand());
// Splice the new load's chain in place of the original load's chain result.
7711 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7712 return BCast;
7713 }
7714
7715 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
7716 // double since there is no vbroadcastsd xmm
7717 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7718 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7719 auto *LN = cast<LoadSDNode>(Ld);
7720 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7721 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7722 SDValue BCast =
// NOTE(review): source line 7723 (the broadcast-load node creation) is
// missing from this extraction -- verify upstream.
7724 LN->getMemoryVT(), LN->getMemOperand());
7725 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7726 return BCast;
7727 }
7728
7729 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7730 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7731
7732 // Unsupported broadcast.
7733 return SDValue();
7734}
7735
7736/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7737/// underlying vector and index.
7738///
7739/// Modifies \p ExtractedFromVec to the real vector and returns the real
7740/// index.
7741static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7742 SDValue ExtIdx) {
7743 int Idx = ExtIdx->getAsZExtVal();
7744 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7745 return Idx;
7746
7747 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7748 // lowered this:
7749 // (extract_vector_elt (v8f32 %1), Constant<6>)
7750 // to:
7751 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7752 // (extract_subvector (v8f32 %0), Constant<4>),
7753 // undef)
7754 // Constant<0>)
7755 // In this case the vector is the extract_subvector expression and the index
7756 // is 2, as specified by the shuffle.
7757 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7758 SDValue ShuffleVec = SVOp->getOperand(0);
7759 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7760 assert(ShuffleVecVT.getVectorElementType() ==
7761 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7762
7763 int ShuffleIdx = SVOp->getMaskElt(Idx);
7764 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7765 ExtractedFromVec = ShuffleVec;
7766 return ShuffleIdx;
7767 }
7768 return Idx;
7769}
7770
// NOTE(review): the declaration line (7771) was dropped by the extraction --
// presumably a static helper taking (SDValue Op, const SDLoc &DL, ...) that
// rebuilds a BUILD_VECTOR as a shuffle plus a few inserts; verify upstream.
7772 SelectionDAG &DAG) {
7773 MVT VT = Op.getSimpleValueType();
7774
7775 // Skip if insert_vec_elt is not supported.
7776 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): source line 7777 (the legality condition guarding this early
// return, presumably a TLI.isOperationLegalOrCustom check) is missing from
// this extraction -- verify upstream.
7778 return SDValue();
7780 unsigned NumElems = Op.getNumOperands();
// Track up to two source vectors the elements are extracted from, plus the
// indices of elements that must be inserted individually afterwards.
7781 SDValue VecIn1;
7782 SDValue VecIn2;
7783 SmallVector<unsigned, 4> InsertIndices;
7784 SmallVector<int, 8> Mask(NumElems, -1);
7785
7786 for (unsigned i = 0; i != NumElems; ++i) {
7787 unsigned Opc = Op.getOperand(i).getOpcode();
7788
7789 if (Opc == ISD::UNDEF)
7790 continue;
7791
7792 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7793 // Quit if more than 1 elements need inserting.
7794 if (InsertIndices.size() > 1)
7795 return SDValue();
7796
7797 InsertIndices.push_back(i);
7798 continue;
7799 }
7800
7801 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7802 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7803
7804 // Quit if non-constant index.
7805 if (!isa<ConstantSDNode>(ExtIdx))
7806 return SDValue();
7807 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7808
7809 // Quit if extracted from vector of different type.
7810 if (ExtractedFromVec.getValueType() != VT)
7811 return SDValue();
7812
7813 if (!VecIn1.getNode())
7814 VecIn1 = ExtractedFromVec;
7815 else if (VecIn1 != ExtractedFromVec) {
7816 if (!VecIn2.getNode())
7817 VecIn2 = ExtractedFromVec;
7818 else if (VecIn2 != ExtractedFromVec)
7819 // Quit if more than 2 vectors to shuffle
7820 return SDValue();
7821 }
7822
// Second input's lanes are addressed at offset NumElems in the shuffle mask.
7823 if (ExtractedFromVec == VecIn1)
7824 Mask[i] = Idx;
7825 else if (ExtractedFromVec == VecIn2)
7826 Mask[i] = Idx + NumElems;
7827 }
7828
7829 if (!VecIn1.getNode())
7830 return SDValue();
7831
7832 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7833 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7834
// Re-insert the (at most one) non-extract element on top of the shuffle.
7835 for (unsigned Idx : InsertIndices)
7836 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
// NOTE(review): source line 7837 (the index operand of this insert) is
// missing from this extraction -- verify upstream.
7838
7839 return NV;
7840}
7841
7842// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
// NOTE(review): the declaration line (7843) was dropped by the extraction --
// verify the full signature against upstream X86ISelLowering.cpp.
7844 const X86Subtarget &Subtarget) {
7845 MVT VT = Op.getSimpleValueType();
// Reinterpret the bf16 elements as f16 (when FP16 is available) or i16 and
// build the vector in that type, bitcasting back to bf16 at the end.
7846 MVT IVT =
7847 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
// NOTE(review): source line 7848 (presumably the declaration of `NewOps`) is
// missing from this extraction -- verify upstream.
7849 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7850 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7851 Op.getOperand(I)));
7852 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7853 return DAG.getBitcast(VT, Res);
7854}
7855
7856// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
// NOTE(review): the declaration line (7857) was dropped by the extraction --
// the assert below names this LowerBUILD_VECTORvXi1; verify the full
// signature against upstream X86ISelLowering.cpp.
7858 SelectionDAG &DAG,
7859 const X86Subtarget &Subtarget) {
7860
7861 MVT VT = Op.getSimpleValueType();
7862 assert((VT.getVectorElementType() == MVT::i1) &&
7863 "Unexpected type in LowerBUILD_VECTORvXi1!");
// All-zeros/all-ones mask vectors already have dedicated lowerings.
7864 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7865 ISD::isBuildVectorAllOnes(Op.getNode()))
7866 return Op;
7867
// Partition the operands: constant bits are accumulated into `Immediate`,
// non-constant element positions are remembered for per-element insertion.
7868 uint64_t Immediate = 0;
7869 SmallVector<unsigned, 16> NonConstIdx;
7870 bool IsSplat = true;
7871 bool HasConstElts = false;
7872 int SplatIdx = -1;
7873 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7874 SDValue In = Op.getOperand(idx);
7875 if (In.isUndef())
7876 continue;
7877 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7878 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7879 HasConstElts = true;
7880 } else {
7881 NonConstIdx.push_back(idx);
7882 }
7883 if (SplatIdx < 0)
7884 SplatIdx = idx;
7885 else if (In != Op.getOperand(SplatIdx))
7886 IsSplat = false;
7887 }
7888
7889 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7890 if (IsSplat) {
7891 // The build_vector allows the scalar element to be larger than the vector
7892 // element type. We need to mask it to use as a condition unless we know
7893 // the upper bits are zero.
7894 // FIXME: Use computeKnownBits instead of checking specific opcode?
7895 SDValue Cond = Op.getOperand(SplatIdx);
7896 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7897 if (Cond.getOpcode() != ISD::SETCC)
7898 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7899 DAG.getConstant(1, dl, MVT::i8));
7900
7901 // Perform the select in the scalar domain so we can use cmov.
7902 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
// 32-bit targets have no 64-bit scalar select: build two v32i1 halves.
7903 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7904 DAG.getAllOnesConstant(dl, MVT::i32),
7905 DAG.getConstant(0, dl, MVT::i32));
7906 Select = DAG.getBitcast(MVT::v32i1, Select);
7907 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7908 } else {
7909 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7910 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7911 DAG.getAllOnesConstant(dl, ImmVT),
7912 DAG.getConstant(0, dl, ImmVT));
7913 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7914 Select = DAG.getBitcast(VecVT, Select);
7915 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7916 DAG.getVectorIdxConstant(0, dl));
7917 }
7918 }
7919
7920 // insert elements one by one
7921 SDValue DstVec;
7922 if (HasConstElts) {
7923 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
// Materialize the 64-bit constant mask as two 32-bit halves on 32-bit.
7924 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7925 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7926 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7927 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7928 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7929 } else {
7930 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7931 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7932 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7933 DstVec = DAG.getBitcast(VecVT, Imm);
7934 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7935 DAG.getVectorIdxConstant(0, dl));
7936 }
7937 } else
7938 DstVec = DAG.getUNDEF(VT);
7939
// Patch the non-constant elements into the constant (or undef) base vector.
7940 for (unsigned InsertIdx : NonConstIdx) {
7941 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7942 Op.getOperand(InsertIdx),
7943 DAG.getVectorIdxConstant(InsertIdx, dl));
7944 }
7945 return DstVec;
7946}
7947
7948LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7949 switch (Opcode) {
7950 case X86ISD::PACKSS:
7951 case X86ISD::PACKUS:
7952 case X86ISD::FHADD:
7953 case X86ISD::FHSUB:
7954 case X86ISD::HADD:
7955 case X86ISD::HSUB:
7956 return true;
7957 }
7958 return false;
7959}
7960
7961/// This is a helper function of LowerToHorizontalOp().
7962/// This function checks that the build_vector \p N in input implements a
7963/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7964/// may not match the layout of an x86 256-bit horizontal instruction.
7965/// In other words, if this returns true, then some extraction/insertion will
7966/// be required to produce a valid horizontal instruction.
7967///
7968/// Parameter \p Opcode defines the kind of horizontal operation to match.
7969/// For example, if \p Opcode is equal to ISD::ADD, then this function
7970/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7971/// is equal to ISD::SUB, then this function checks if this is a horizontal
7972/// arithmetic sub.
7973///
7974/// This function only analyzes elements of \p N whose indices are
7975/// in range [BaseIdx, LastIdx).
7976///
7977/// TODO: This function was originally used to match both real and fake partial
7978/// horizontal operations, but the index-matching logic is incorrect for that.
7979/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7980/// code because it is only used for partial h-op matching now?
7981static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7982 const SDLoc &DL, SelectionDAG &DAG,
7983 unsigned BaseIdx, unsigned LastIdx,
7984 SDValue &V0, SDValue &V1) {
7985 EVT VT = N->getValueType(0);
7986 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7987 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7988 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7989 "Invalid Vector in input!");
7990
// For commutative opcodes the two extracts may appear in either order.
7991 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7992 bool CanFold = true;
7993 unsigned ExpectedVExtractIdx = BaseIdx;
7994 unsigned NumElts = LastIdx - BaseIdx;
7995 V0 = DAG.getUNDEF(VT);
7996 V1 = DAG.getUNDEF(VT);
7997
7998 // Check if N implements a horizontal binop.
7999 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8000 SDValue Op = N->getOperand(i + BaseIdx);
8001
8002 // Skip UNDEFs.
8003 if (Op->isUndef()) {
8004 // Update the expected vector extract index.
8005 if (i * 2 == NumElts)
8006 ExpectedVExtractIdx = BaseIdx;
8007 ExpectedVExtractIdx += 2;
8008 continue;
8009 }
8010
8011 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8012
8013 if (!CanFold)
8014 break;
8015
8016 SDValue Op0 = Op.getOperand(0);
8017 SDValue Op1 = Op.getOperand(1);
8018
8019 // Try to match the following pattern:
8020 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8021 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
// NOTE(review): source line 8022 (presumably the matching
// `Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&` conjunct) is missing from
// this extraction -- verify upstream.
8023 Op0.getOperand(0) == Op1.getOperand(0) &&
8024 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8025 isa<ConstantSDNode>(Op1.getOperand(1)));
8026 if (!CanFold)
8027 break;
8028
8029 unsigned I0 = Op0.getConstantOperandVal(1);
8030 unsigned I1 = Op1.getConstantOperandVal(1);
8031
// The first half of the elements binds V0, the second half binds V1.
8032 if (i * 2 < NumElts) {
8033 if (V0.isUndef()) {
8034 V0 = Op0.getOperand(0);
8035 if (V0.getValueType() != VT)
8036 return false;
8037 }
8038 } else {
8039 if (V1.isUndef()) {
8040 V1 = Op0.getOperand(0);
8041 if (V1.getValueType() != VT)
8042 return false;
8043 }
8044 if (i * 2 == NumElts)
8045 ExpectedVExtractIdx = BaseIdx;
8046 }
8047
8048 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8049 if (I0 == ExpectedVExtractIdx)
8050 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8051 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8052 // Try to match the following dag sequence:
8053 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8054 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8055 } else
8056 CanFold = false;
8057
// Each output element consumes two adjacent input lanes.
8058 ExpectedVExtractIdx += 2;
8059 }
8060
8061 return CanFold;
8062}
8063
8064/// Emit a sequence of two 128-bit horizontal add/sub followed by
8065/// a concat_vector.
8066///
8067/// This is a helper function of LowerToHorizontalOp().
8068/// This function expects two 256-bit vectors called V0 and V1.
8069/// At first, each vector is split into two separate 128-bit vectors.
8070/// Then, the resulting 128-bit vectors are used to implement two
8071/// horizontal binary operations.
8072///
8073/// The kind of horizontal binary operation is defined by \p X86Opcode.
8074///
8075/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8076/// the two new horizontal binop.
8077/// When Mode is set, the first horizontal binop dag node would take as input
8078/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8079/// horizontal binop dag node would take as input the lower 128-bit of V1
8080/// and the upper 128-bit of V1.
8081/// Example:
8082/// HADD V0_LO, V0_HI
8083/// HADD V1_LO, V1_HI
8084///
8085/// Otherwise, the first horizontal binop dag node takes as input the lower
8086/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8087/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8088/// Example:
8089/// HADD V0_LO, V1_LO
8090/// HADD V0_HI, V1_HI
8091///
8092/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8093/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8094/// the upper 128-bits of the result.
8095static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8096 const SDLoc &DL, SelectionDAG &DAG,
8097 unsigned X86Opcode, bool Mode,
8098 bool isUndefLO, bool isUndefHI) {
8099 MVT VT = V0.getSimpleValueType();
8100 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8101 "Invalid nodes in input!");
8102
8103 unsigned NumElts = VT.getVectorNumElements();
8104 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8105 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8106 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8107 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8108 MVT NewVT = V0_LO.getSimpleValueType();
8109
8110 SDValue LO = DAG.getUNDEF(NewVT);
8111 SDValue HI = DAG.getUNDEF(NewVT);
8112
8113 if (Mode) {
8114 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8115 if (!isUndefLO && !V0->isUndef())
8116 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8117 if (!isUndefHI && !V1->isUndef())
8118 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8119 } else {
8120 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8121 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8122 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8123
8124 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8125 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8126 }
8127
8128 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8129}
8130
8131/// Returns true iff \p BV builds a vector with the result equivalent to
8132/// the result of ADDSUB/SUBADD operation.
8133/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8134/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8135/// \p Opnd0 and \p Opnd1.
8137 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8138 SDValue &Opnd0, SDValue &Opnd1,
8139 unsigned &NumExtracts,
8140 bool &IsSubAdd) {
8141
8142 MVT VT = BV->getSimpleValueType(0);
8143 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8144 return false;
8145
8146 unsigned NumElts = VT.getVectorNumElements();
8147 SDValue InVec0 = DAG.getUNDEF(VT);
8148 SDValue InVec1 = DAG.getUNDEF(VT);
8149
8150 NumExtracts = 0;
8151
8152 // Odd-numbered elements in the input build vector are obtained from
8153 // adding/subtracting two integer/float elements.
8154 // Even-numbered elements in the input build vector are obtained from
8155 // subtracting/adding two integer/float elements.
8156 unsigned Opc[2] = {0, 0};
8157 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8158 SDValue Op = BV->getOperand(i);
8159
8160 // Skip 'undef' values.
8161 unsigned Opcode = Op.getOpcode();
8162 if (Opcode == ISD::UNDEF)
8163 continue;
8164
8165 // Early exit if we found an unexpected opcode.
8166 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8167 return false;
8168
8169 SDValue Op0 = Op.getOperand(0);
8170 SDValue Op1 = Op.getOperand(1);
8171
8172 // Try to match the following pattern:
8173 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8174 // Early exit if we cannot match that sequence.
8175 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8177 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8178 Op0.getOperand(1) != Op1.getOperand(1))
8179 return false;
8180
8181 unsigned I0 = Op0.getConstantOperandVal(1);
8182 if (I0 != i)
8183 return false;
8184
8185 // We found a valid add/sub node, make sure its the same opcode as previous
8186 // elements for this parity.
8187 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8188 return false;
8189 Opc[i % 2] = Opcode;
8190
8191 // Update InVec0 and InVec1.
8192 if (InVec0.isUndef()) {
8193 InVec0 = Op0.getOperand(0);
8194 if (InVec0.getSimpleValueType() != VT)
8195 return false;
8196 }
8197 if (InVec1.isUndef()) {
8198 InVec1 = Op1.getOperand(0);
8199 if (InVec1.getSimpleValueType() != VT)
8200 return false;
8201 }
8202
8203 // Make sure that operands in input to each add/sub node always
8204 // come from a same pair of vectors.
8205 if (InVec0 != Op0.getOperand(0)) {
8206 if (Opcode == ISD::FSUB)
8207 return false;
8208
8209 // FADD is commutable. Try to commute the operands
8210 // and then test again.
8211 std::swap(Op0, Op1);
8212 if (InVec0 != Op0.getOperand(0))
8213 return false;
8214 }
8215
8216 if (InVec1 != Op1.getOperand(0))
8217 return false;
8218
8219 // Increment the number of extractions done.
8220 ++NumExtracts;
8221 }
8222
8223 // Ensure we have found an opcode for both parities and that they are
8224 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8225 // inputs are undef.
8226 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8227 InVec0.isUndef() || InVec1.isUndef())
8228 return false;
8229
8230 IsSubAdd = Opc[0] == ISD::FADD;
8231
8232 Opnd0 = InVec0;
8233 Opnd1 = InVec1;
8234 return true;
8235}
8236
8237/// Returns true if is possible to fold MUL and an idiom that has already been
8238/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8239/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8240/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8241///
8242/// Prior to calling this function it should be known that there is some
8243/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8244/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8245/// before replacement of such SDNode with ADDSUB operation. Thus the number
8246/// of \p Opnd0 uses is expected to be equal to 2.
8247/// For example, this function may be called for the following IR:
8248/// %AB = fmul fast <2 x double> %A, %B
8249/// %Sub = fsub fast <2 x double> %AB, %C
8250/// %Add = fadd fast <2 x double> %AB, %C
8251/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8252/// <2 x i32> <i32 0, i32 3>
8253/// There is a def for %Addsub here, which potentially can be replaced by
8254/// X86ISD::ADDSUB operation:
8255/// %Addsub = X86ISD::ADDSUB %AB, %C
8256/// and such ADDSUB can further be replaced with FMADDSUB:
8257/// %Addsub = FMADDSUB %A, %B, %C.
8258///
8259/// The main reason why this method is called before the replacement of the
8260/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8261/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8262/// FMADDSUB is.
8263static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8264 SelectionDAG &DAG,
8265 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8266 unsigned ExpectedUses) {
8267 if (Opnd0.getOpcode() != ISD::FMUL ||
8268 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8269 return false;
8270
8271 // FIXME: These checks must match the similar ones in
8272 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8273 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8274 // or MUL + ADDSUB to FMADDSUB.
8275 const TargetOptions &Options = DAG.getTarget().Options;
8276 bool AllowFusion =
8277 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8278 if (!AllowFusion)
8279 return false;
8280
8281 Opnd2 = Opnd1;
8282 Opnd1 = Opnd0.getOperand(1);
8283 Opnd0 = Opnd0.getOperand(0);
8284
8285 return true;
8286}
8287
8288/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8289/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8290/// X86ISD::FMSUBADD node.
8292 const SDLoc &DL,
8293 const X86Subtarget &Subtarget,
8294 SelectionDAG &DAG) {
8295 SDValue Opnd0, Opnd1;
8296 unsigned NumExtracts;
8297 bool IsSubAdd;
8298 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8299 IsSubAdd))
8300 return SDValue();
8301
8302 MVT VT = BV->getSimpleValueType(0);
8303
8304 // Try to generate X86ISD::FMADDSUB node here.
8305 SDValue Opnd2;
8306 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8307 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8308 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8309 }
8310
8311 // We only support ADDSUB.
8312 if (IsSubAdd)
8313 return SDValue();
8314
8315 // There are no known X86 targets with 512-bit ADDSUB instructions!
8316 // Convert to blend(fsub,fadd).
8317 if (VT.is512BitVector()) {
8318 SmallVector<int> Mask;
8319 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8320 Mask.push_back(I);
8321 Mask.push_back(I + E + 1);
8322 }
8323 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8324 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8325 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8326 }
8327
8328 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8329}
8330
8332 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8333 // Initialize outputs to known values.
8334 MVT VT = BV->getSimpleValueType(0);
8335 HOpcode = ISD::DELETED_NODE;
8336 V0 = DAG.getUNDEF(VT);
8337 V1 = DAG.getUNDEF(VT);
8338
8339 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8340 // half of the result is calculated independently from the 128-bit halves of
8341 // the inputs, so that makes the index-checking logic below more complicated.
8342 unsigned NumElts = VT.getVectorNumElements();
8343 unsigned GenericOpcode = ISD::DELETED_NODE;
8344 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8345 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8346 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8347 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8348 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8349 // Ignore undef elements.
8350 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8351 if (Op.isUndef())
8352 continue;
8353
8354 // If there's an opcode mismatch, we're done.
8355 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8356 return false;
8357
8358 // Initialize horizontal opcode.
8359 if (HOpcode == ISD::DELETED_NODE) {
8360 GenericOpcode = Op.getOpcode();
8361 switch (GenericOpcode) {
8362 // clang-format off
8363 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8364 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8365 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8366 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8367 default: return false;
8368 // clang-format on
8369 }
8370 }
8371
8372 SDValue Op0 = Op.getOperand(0);
8373 SDValue Op1 = Op.getOperand(1);
8374 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8376 Op0.getOperand(0) != Op1.getOperand(0) ||
8377 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8378 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8379 return false;
8380
8381 // The source vector is chosen based on which 64-bit half of the
8382 // destination vector is being calculated.
8383 if (j < NumEltsIn64Bits) {
8384 if (V0.isUndef())
8385 V0 = Op0.getOperand(0);
8386 } else {
8387 if (V1.isUndef())
8388 V1 = Op0.getOperand(0);
8389 }
8390
8391 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8392 if (SourceVec != Op0.getOperand(0))
8393 return false;
8394
8395 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8396 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8397 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8398 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8399 (j % NumEltsIn64Bits) * 2;
8400 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8401 continue;
8402
8403 // If this is not a commutative op, this does not match.
8404 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8405 return false;
8406
8407 // Addition is commutative, so try swapping the extract indexes.
8408 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8409 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8410 continue;
8411
8412 // Extract indexes do not match horizontal requirement.
8413 return false;
8414 }
8415 }
8416 // We matched. Opcode and operands are returned by reference as arguments.
8417 return true;
8418}
8419
8421 const SDLoc &DL, SelectionDAG &DAG,
8422 unsigned HOpcode, SDValue V0, SDValue V1) {
8423 // If either input vector is not the same size as the build vector,
8424 // extract/insert the low bits to the correct size.
8425 // This is free (examples: zmm --> xmm, xmm --> ymm).
8426 MVT VT = BV->getSimpleValueType(0);
8427 unsigned Width = VT.getSizeInBits();
8428 if (V0.getValueSizeInBits() > Width)
8429 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8430 else if (V0.getValueSizeInBits() < Width)
8431 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8432
8433 if (V1.getValueSizeInBits() > Width)
8434 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8435 else if (V1.getValueSizeInBits() < Width)
8436 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8437
8438 unsigned NumElts = VT.getVectorNumElements();
8439 APInt DemandedElts = APInt::getAllOnes(NumElts);
8440 for (unsigned i = 0; i != NumElts; ++i)
8441 if (BV->getOperand(i).isUndef())
8442 DemandedElts.clearBit(i);
8443
8444 // If we don't need the upper xmm, then perform as a xmm hop.
8445 unsigned HalfNumElts = NumElts / 2;
8446 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8447 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8448 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8449 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8450 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8451 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8452 }
8453
8454 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8455}
8456
8457/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8459 const X86Subtarget &Subtarget,
8460 SelectionDAG &DAG) {
8461 // We need at least 2 non-undef elements to make this worthwhile by default.
8462 unsigned NumNonUndefs =
8463 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8464 if (NumNonUndefs < 2)
8465 return SDValue();
8466
8467 // There are 4 sets of horizontal math operations distinguished by type:
8468 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8469 // subtarget feature. Try to match those "native" patterns first.
8470 MVT VT = BV->getSimpleValueType(0);
8471 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8472 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8473 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8474 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8475 unsigned HOpcode;
8476 SDValue V0, V1;
8477 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8478 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8479 }
8480
8481 // Try harder to match 256-bit ops by using extract/concat.
8482 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8483 return SDValue();
8484
8485 // Count the number of UNDEF operands in the build_vector in input.
8486 unsigned NumElts = VT.getVectorNumElements();
8487 unsigned Half = NumElts / 2;
8488 unsigned NumUndefsLO = 0;
8489 unsigned NumUndefsHI = 0;
8490 for (unsigned i = 0, e = Half; i != e; ++i)
8491 if (BV->getOperand(i)->isUndef())
8492 NumUndefsLO++;
8493
8494 for (unsigned i = Half, e = NumElts; i != e; ++i)
8495 if (BV->getOperand(i)->isUndef())
8496 NumUndefsHI++;
8497
8498 SDValue InVec0, InVec1;
8499 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8500 SDValue InVec2, InVec3;
8501 unsigned X86Opcode;
8502 bool CanFold = true;
8503
8504 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8505 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8506 InVec3) &&
8507 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8508 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8509 X86Opcode = X86ISD::HADD;
8510 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8511 InVec1) &&
8512 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8513 InVec3) &&
8514 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8515 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8516 X86Opcode = X86ISD::HSUB;
8517 else
8518 CanFold = false;
8519
8520 if (CanFold) {
8521 // Do not try to expand this build_vector into a pair of horizontal
8522 // add/sub if we can emit a pair of scalar add/sub.
8523 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8524 return SDValue();
8525
8526 // Convert this build_vector into a pair of horizontal binops followed by
8527 // a concat vector. We must adjust the outputs from the partial horizontal
8528 // matching calls above to account for undefined vector halves.
8529 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8530 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8531 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8532 bool isUndefLO = NumUndefsLO == Half;
8533 bool isUndefHI = NumUndefsHI == Half;
8534 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8535 isUndefHI);
8536 }
8537 }
8538
8539 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8540 VT == MVT::v16i16) {
8541 unsigned X86Opcode;
8542 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8543 InVec1))
8544 X86Opcode = X86ISD::HADD;
8545 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8546 InVec1))
8547 X86Opcode = X86ISD::HSUB;
8548 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8549 InVec1))
8550 X86Opcode = X86ISD::FHADD;
8551 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8552 InVec1))
8553 X86Opcode = X86ISD::FHSUB;
8554 else
8555 return SDValue();
8556
8557 // Don't try to expand this build_vector into a pair of horizontal add/sub
8558 // if we can simply emit a pair of scalar add/sub.
8559 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8560 return SDValue();
8561
8562 // Convert this build_vector into two horizontal add/sub followed by
8563 // a concat vector.
8564 bool isUndefLO = NumUndefsLO == Half;
8565 bool isUndefHI = NumUndefsHI == Half;
8566 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8567 isUndefLO, isUndefHI);
8568 }
8569
8570 return SDValue();
8571}
8572
8573static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8574 SelectionDAG &DAG);
8575
8576/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8577/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8578/// just apply the bit to the vectors.
8579/// NOTE: Its not in our interest to start make a general purpose vectorizer
8580/// from this, but enough scalar bit operations are created from the later
8581/// legalization + scalarization stages to need basic support.
8583 const X86Subtarget &Subtarget,
8584 SelectionDAG &DAG) {
8585 MVT VT = Op->getSimpleValueType(0);
8586 unsigned NumElems = VT.getVectorNumElements();
8587 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8588
8589 // Check that all elements have the same opcode.
8590 // TODO: Should we allow UNDEFS and if so how many?
8591 unsigned Opcode = Op->getOperand(0).getOpcode();
8592 for (unsigned i = 1; i < NumElems; ++i)
8593 if (Opcode != Op->getOperand(i).getOpcode())
8594 return SDValue();
8595
8596 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8597 bool IsShift = false;
8598 switch (Opcode) {
8599 default:
8600 return SDValue();
8601 case ISD::SHL:
8602 case ISD::SRL:
8603 case ISD::SRA:
8604 IsShift = true;
8605 break;
8606 case ISD::AND:
8607 case ISD::XOR:
8608 case ISD::OR:
8609 // Don't do this if the buildvector is a splat - we'd replace one
8610 // constant with an entire vector.
8611 if (Op->getSplatValue())
8612 return SDValue();
8613 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8614 return SDValue();
8615 break;
8616 }
8617
8618 SmallVector<SDValue, 4> LHSElts, RHSElts;
8619 for (SDValue Elt : Op->ops()) {
8620 SDValue LHS = Elt.getOperand(0);
8621 SDValue RHS = Elt.getOperand(1);
8622
8623 // We expect the canonicalized RHS operand to be the constant.
8624 if (!isa<ConstantSDNode>(RHS))
8625 return SDValue();
8626
8627 // Extend shift amounts.
8628 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8629 if (!IsShift)
8630 return SDValue();
8631 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8632 }
8633
8634 LHSElts.push_back(LHS);
8635 RHSElts.push_back(RHS);
8636 }
8637
8638 // Limit to shifts by uniform immediates.
8639 // TODO: Only accept vXi8/vXi64 special cases?
8640 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8641 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8642 return SDValue();
8643
8644 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8645 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8646 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8647
8648 if (!IsShift)
8649 return Res;
8650
8651 // Immediately lower the shift to ensure the constant build vector doesn't
8652 // get converted to a constant pool before the shift is lowered.
8653 return LowerShift(Res, Subtarget, DAG);
8654}
8655
8656/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8657/// functionality to do this, so it's all zeros, all ones, or some derivation
8658/// that is cheap to calculate.
8660 SelectionDAG &DAG,
8661 const X86Subtarget &Subtarget) {
8662 MVT VT = Op.getSimpleValueType();
8663
8664 // Vectors containing all zeros can be matched by pxor and xorps.
8665 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8666 return Op;
8667
8668 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8669 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8670 // vpcmpeqd on 256-bit vectors.
8671 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8672 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8673 return Op;
8674
8675 return getOnesVector(VT, DAG, DL);
8676 }
8677
8678 return SDValue();
8679}
8680
8681/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8682/// from a vector of source values and a vector of extraction indices.
8683/// The vectors might be manipulated to match the type of the permute op.
8684static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8685 const SDLoc &DL, SelectionDAG &DAG,
8686 const X86Subtarget &Subtarget) {
8687 MVT ShuffleVT = VT;
8688 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8689 unsigned NumElts = VT.getVectorNumElements();
8690 unsigned SizeInBits = VT.getSizeInBits();
8691
8692 // Adjust IndicesVec to match VT size.
8693 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8694 "Illegal variable permute mask size");
8695 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8696 // Narrow/widen the indices vector to the correct size.
8697 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8698 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8699 NumElts * VT.getScalarSizeInBits());
8700 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8701 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8702 SDLoc(IndicesVec), SizeInBits);
8703 // Zero-extend the index elements within the vector.
8704 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8705 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8706 IndicesVT, IndicesVec);
8707 }
8708 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8709
8710 // Handle SrcVec that don't match VT type.
8711 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8712 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8713 // Handle larger SrcVec by treating it as a larger permute.
8714 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8715 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8716 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8717 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8718 Subtarget, DAG, SDLoc(IndicesVec));
8719 SDValue NewSrcVec =
8720 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8721 if (NewSrcVec)
8722 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8723 return SDValue();
8724 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8725 // Widen smaller SrcVec to match VT.
8726 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8727 } else
8728 return SDValue();
8729 }
8730
8731 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8732 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8733 EVT SrcVT = Idx.getValueType();
8734 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8735 uint64_t IndexScale = 0;
8736 uint64_t IndexOffset = 0;
8737
8738 // If we're scaling a smaller permute op, then we need to repeat the
8739 // indices, scaling and offsetting them as well.
8740 // e.g. v4i32 -> v16i8 (Scale = 4)
8741 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8742 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8743 for (uint64_t i = 0; i != Scale; ++i) {
8744 IndexScale |= Scale << (i * NumDstBits);
8745 IndexOffset |= i << (i * NumDstBits);
8746 }
8747
8748 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8749 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8750 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8751 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8752 return Idx;
8753 };
8754
8755 unsigned Opcode = 0;
8756 switch (VT.SimpleTy) {
8757 default:
8758 break;
8759 case MVT::v16i8:
8760 if (Subtarget.hasSSSE3())
8761 Opcode = X86ISD::PSHUFB;
8762 break;
8763 case MVT::v8i16:
8764 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8765 Opcode = X86ISD::VPERMV;
8766 else if (Subtarget.hasSSSE3()) {
8767 Opcode = X86ISD::PSHUFB;
8768 ShuffleVT = MVT::v16i8;
8769 }
8770 break;
8771 case MVT::v4f32:
8772 case MVT::v4i32:
8773 if (Subtarget.hasAVX()) {
8774 Opcode = X86ISD::VPERMILPV;
8775 ShuffleVT = MVT::v4f32;
8776 } else if (Subtarget.hasSSSE3()) {
8777 Opcode = X86ISD::PSHUFB;
8778 ShuffleVT = MVT::v16i8;
8779 }
8780 break;
8781 case MVT::v2f64:
8782 case MVT::v2i64:
8783 if (Subtarget.hasAVX()) {
8784 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8785 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8786 Opcode = X86ISD::VPERMILPV;
8787 ShuffleVT = MVT::v2f64;
8788 } else if (Subtarget.hasSSE41()) {
8789 // SSE41 can compare v2i64 - select between indices 0 and 1.
8790 return DAG.getSelectCC(
8791 DL, IndicesVec,
8792 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8793 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8794 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8796 }
8797 break;
8798 case MVT::v32i8:
8799 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8800 Opcode = X86ISD::VPERMV;
8801 else if (Subtarget.hasXOP()) {
8802 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8803 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8804 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8805 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8806 return DAG.getNode(
8808 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8809 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8810 } else if (Subtarget.hasAVX()) {
8811 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8812 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8813 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8814 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8815 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8816 ArrayRef<SDValue> Ops) {
8817 // Permute Lo and Hi and then select based on index range.
8818 // This works as SHUFB uses bits[3:0] to permute elements and we don't
8819 // care about the bit[7] as its just an index vector.
8820 SDValue Idx = Ops[2];
8821 EVT VT = Idx.getValueType();
8822 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8823 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8824 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8826 };
8827 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8828 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8829 PSHUFBBuilder);
8830 }
8831 break;
8832 case MVT::v16i16:
8833 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8834 Opcode = X86ISD::VPERMV;
8835 else if (Subtarget.hasAVX()) {
8836 // Scale to v32i8 and perform as v32i8.
8837 IndicesVec = ScaleIndices(IndicesVec, 2);
8838 return DAG.getBitcast(
8840 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8841 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8842 }
8843 break;
8844 case MVT::v8f32:
8845 case MVT::v8i32:
8846 if (Subtarget.hasAVX2())
8847 Opcode = X86ISD::VPERMV;
8848 else if (Subtarget.hasAVX()) {
8849 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8850 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8851 {0, 1, 2, 3, 0, 1, 2, 3});
8852 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8853 {4, 5, 6, 7, 4, 5, 6, 7});
8854 if (Subtarget.hasXOP())
8855 return DAG.getBitcast(
8856 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8857 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8858 // Permute Lo and Hi and then select based on index range.
8859 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8860 SDValue Res = DAG.getSelectCC(
8861 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8862 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8863 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8865 return DAG.getBitcast(VT, Res);
8866 }
8867 break;
8868 case MVT::v4i64:
8869 case MVT::v4f64:
8870 if (Subtarget.hasAVX512()) {
8871 if (!Subtarget.hasVLX()) {
8872 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8873 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8874 SDLoc(SrcVec));
8875 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8876 DAG, SDLoc(IndicesVec));
8877 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8878 DAG, Subtarget);
8879 return extract256BitVector(Res, 0, DAG, DL);
8880 }
8881 Opcode = X86ISD::VPERMV;
8882 } else if (Subtarget.hasAVX()) {
8883 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8884 SDValue LoLo =
8885 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8886 SDValue HiHi =
8887 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8888 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8889 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8890 if (Subtarget.hasXOP())
8891 return DAG.getBitcast(
8892 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8893 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8894 // Permute Lo and Hi and then select based on index range.
8895 // This works as VPERMILPD only uses index bit[1] to permute elements.
8896 SDValue Res = DAG.getSelectCC(
8897 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8898 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8899 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8901 return DAG.getBitcast(VT, Res);
8902 }
8903 break;
8904 case MVT::v64i8:
8905 if (Subtarget.hasVBMI())
8906 Opcode = X86ISD::VPERMV;
8907 break;
8908 case MVT::v32i16:
8909 if (Subtarget.hasBWI())
8910 Opcode = X86ISD::VPERMV;
8911 break;
8912 case MVT::v16f32:
8913 case MVT::v16i32:
8914 case MVT::v8f64:
8915 case MVT::v8i64:
8916 if (Subtarget.hasAVX512())
8917 Opcode = X86ISD::VPERMV;
8918 break;
8919 }
8920 if (!Opcode)
8921 return SDValue();
8922
8923 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8924 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8925 "Illegal variable permute shuffle type");
8926
8927 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8928 if (Scale > 1)
8929 IndicesVec = ScaleIndices(IndicesVec, Scale);
8930
8931 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8932 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8933
8934 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8935 SDValue Res = Opcode == X86ISD::VPERMV
8936 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8937 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8938 return DAG.getBitcast(VT, Res);
8939}
8940
8941// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8942// reasoned to be a permutation of a vector by indices in a non-constant vector.
8943// (build_vector (extract_elt V, (extract_elt I, 0)),
8944// (extract_elt V, (extract_elt I, 1)),
8945// ...
8946// ->
8947// (vpermv I, V)
8948//
8949// Returns SDValue() if any operand fails to match the pattern above; otherwise
8950// forwards to createVariablePermute to select a target permute node.
8951//
8952// TODO: Handle undefs
8953// TODO: Utilize pshufb and zero mask blending to support more efficient
8954// construction of vectors with constant-0 elements.
8952static SDValue
8954 SelectionDAG &DAG,
8955 const X86Subtarget &Subtarget) {
8956 SDValue SrcVec, IndicesVec;
8957 // Check for a match of the permute source vector and permute index elements.
8958 // This is done by checking that the i-th build_vector operand is of the form:
8959 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8960 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8961 SDValue Op = V.getOperand(Idx);
8962 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8963 return SDValue();
8964
8965 // If this is the first extract encountered in V, set the source vector,
8966 // otherwise verify the extract is from the previously defined source
8967 // vector.
8968 if (!SrcVec)
8969 SrcVec = Op.getOperand(0);
8970 else if (SrcVec != Op.getOperand(0))
8971 return SDValue();
8972 SDValue ExtractedIndex = Op->getOperand(1);
8973 // Peek through extends.
8974 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8975 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8976 ExtractedIndex = ExtractedIndex.getOperand(0);
8977 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8978 return SDValue();
8979
8980 // If this is the first extract from the index vector candidate, set the
8981 // indices vector, otherwise verify the extract is from the previously
8982 // defined indices vector.
8983 if (!IndicesVec)
8984 IndicesVec = ExtractedIndex.getOperand(0);
8985 else if (IndicesVec != ExtractedIndex.getOperand(0))
8986 return SDValue();
8987
8988 // The i-th element must be extracted using index i of IndicesVec.
8989 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8990 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8991 return SDValue();
8992 }
8993
8994 MVT VT = V.getSimpleValueType();
8995 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8996}
8996
// Lower a BUILD_VECTOR node, trying (in order): predicate/bf16 special cases,
// constant materialization, freeze(undef) blending, narrower build + widen,
// add/sub & horizontal-op forms, broadcasts, bit-ops, single-element inserts,
// variable permutes, consecutive loads, and finally generic shuffle expansion.
8997SDValue
8998X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8999 SDLoc dl(Op);
9000
9001 MVT VT = Op.getSimpleValueType();
9002 MVT EltVT = VT.getVectorElementType();
9003 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9004 unsigned NumElems = Op.getNumOperands();
9005
9006 // Generate vectors for predicate vectors.
9007 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9008 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9009
9010 if (VT.getVectorElementType() == MVT::bf16 &&
9011 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9012 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9013
9014 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9015 return VectorCst;
9016
9017 // Classify every operand: undef, freeze(undef), zero or general non-zero,
9018 // and track whether the whole vector is constant.
9017 unsigned EVTBits = EltVT.getSizeInBits();
9018 APInt UndefMask = APInt::getZero(NumElems);
9019 APInt FrozenUndefMask = APInt::getZero(NumElems);
9020 APInt ZeroMask = APInt::getZero(NumElems);
9021 APInt NonZeroMask = APInt::getZero(NumElems);
9022 bool IsAllConstants = true;
9023 bool OneUseFrozenUndefs = true;
9024 SmallSet<SDValue, 8> Values;
9025 unsigned NumConstants = NumElems;
9026 for (unsigned i = 0; i < NumElems; ++i) {
9027 SDValue Elt = Op.getOperand(i);
9028 if (Elt.isUndef()) {
9029 UndefMask.setBit(i);
9030 continue;
9031 }
9032 if (ISD::isFreezeUndef(Elt.getNode())) {
9033 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9034 FrozenUndefMask.setBit(i);
9035 continue;
9036 }
9037 Values.insert(Elt);
9038 if (!isIntOrFPConstant(Elt)) {
9039 IsAllConstants = false;
9040 NumConstants--;
9041 }
9042 if (X86::isZeroNode(Elt)) {
9043 ZeroMask.setBit(i);
9044 } else {
9045 NonZeroMask.setBit(i);
9046 }
9047 }
9048
9049 // All undef vector. Return an UNDEF.
9050 if (UndefMask.isAllOnes())
9051 return DAG.getUNDEF(VT);
9052
9053 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9054 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9055 return DAG.getFreeze(DAG.getUNDEF(VT));
9056
9057 // All undef/freeze(undef)/zero vector. Return a zero vector.
9058 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9059 return getZeroVector(VT, Subtarget, DAG, dl);
9060
9061 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9062 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9063 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9064 // and blend the FREEZE-UNDEF operands back in.
9065 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9066 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9067 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9068 SmallVector<int, 16> BlendMask(NumElems, -1);
9069 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9070 for (unsigned i = 0; i < NumElems; ++i) {
9071 if (UndefMask[i]) {
9072 BlendMask[i] = -1;
9073 continue;
9074 }
9075 BlendMask[i] = i;
9076 if (!FrozenUndefMask[i])
9077 Elts[i] = Op.getOperand(i);
9078 else
9079 BlendMask[i] += NumElems;
9080 }
9081 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9082 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9083 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9084 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9085 }
9086
9087 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9088
9089 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9090 // be better off lowering to a smaller build vector and padding with
9091 // undef/zero.
9092 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9094 unsigned UpperElems = NumElems / 2;
9095 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9096 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9097 if (NumUpperUndefsOrZeros >= UpperElems) {
9098 if (VT.is512BitVector() &&
9099 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9100 UpperElems = NumElems - (NumElems / 4);
9101 // If freeze(undef) is in any upper elements, force to zero.
9102 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9103 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9104 SDValue NewBV =
9105 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9106 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9107 }
9108 }
9109
9110 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9111 return AddSub;
9112 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9113 return HorizontalOp;
9114 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9115 return Broadcast;
9116 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9117 return BitOp;
9118
9119 unsigned NumZero = ZeroMask.popcount();
9120 unsigned NumNonZero = NonZeroMask.popcount();
9121
9122 // If we are inserting one variable into a vector of non-zero constants, try
9123 // to avoid loading each constant element as a scalar. Load the constants as a
9124 // vector and then insert the variable scalar element. If insertion is not
9125 // supported, fall back to a shuffle to get the scalar blended with the
9126 // constants. Insertion into a zero vector is handled as a special-case
9127 // somewhere below here.
9128 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9129 FrozenUndefMask.isZero() &&
9132 // Create an all-constant vector. The variable element in the old
9133 // build vector is replaced by undef in the constant vector. Save the
9134 // variable scalar element and its index for use in the insertelement.
9135 LLVMContext &Context = *DAG.getContext();
9136 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9137 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9138 SDValue VarElt;
9139 SDValue InsIndex;
9140 for (unsigned i = 0; i != NumElems; ++i) {
9141 SDValue Elt = Op.getOperand(i);
9142 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9143 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9144 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9145 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9146 else if (!Elt.isUndef()) {
9147 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9148 "Expected one variable element in this vector");
9149 VarElt = Elt;
9150 InsIndex = DAG.getVectorIdxConstant(i, dl);
9151 }
9152 }
9153 Constant *CV = ConstantVector::get(ConstVecOps);
9154 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9155
9156 // The constants we just created may not be legal (eg, floating point). We
9157 // must lower the vector right here because we can not guarantee that we'll
9158 // legalize it before loading it. This is also why we could not just create
9159 // a new build vector here. If the build vector contains illegal constants,
9160 // it could get split back up into a series of insert elements.
9161 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9162 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9165 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9166 unsigned InsertC = InsIndex->getAsZExtVal();
9167 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9168 if (InsertC < NumEltsInLow128Bits)
9169 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9170
9171 // There's no good way to insert into the high elements of a >128-bit
9172 // vector, so use shuffles to avoid an extract/insert sequence.
9173 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9174 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9175 SmallVector<int, 8> ShuffleMask;
9176 unsigned NumElts = VT.getVectorNumElements();
9177 for (unsigned i = 0; i != NumElts; ++i)
9178 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9179 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9180 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9181 }
9182
9183 // Special case for single non-zero, non-undef, element.
9184 if (NumNonZero == 1) {
9185 unsigned Idx = NonZeroMask.countr_zero();
9186 SDValue Item = Op.getOperand(Idx);
9187
9188 // If we have a constant or non-constant insertion into the low element of
9189 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9190 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9191 // depending on what the source datatype is.
9192 if (Idx == 0) {
9193 if (NumZero == 0)
9194 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9195
9196 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9197 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9198 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9199 assert((VT.is128BitVector() || VT.is256BitVector() ||
9200 VT.is512BitVector()) &&
9201 "Expected an SSE value type!");
9202 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9203 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9204 // zero vector.
9205 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9206 }
9207
9208 // We can't directly insert an i8 or i16 into a vector, so zero extend
9209 // it to i32 first.
9210 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9211 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9212 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9213 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9214 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9215 return DAG.getBitcast(VT, Item);
9216 }
9217 }
9218
9219 // Is it a vector logical left shift?
9220 if (NumElems == 2 && Idx == 1 &&
9221 X86::isZeroNode(Op.getOperand(0)) &&
9222 !X86::isZeroNode(Op.getOperand(1))) {
9223 unsigned NumBits = VT.getSizeInBits();
9224 return getVShift(true, VT,
9226 VT, Op.getOperand(1)),
9227 NumBits/2, DAG, *this, dl);
9228 }
9229
9230 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9231 return SDValue();
9232
9233 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9234 // is a non-constant being inserted into an element other than the low one,
9235 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9236 // movd/movss) to move this into the low element, then shuffle it into
9237 // place.
9238 if (EVTBits == 32) {
9239 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9240 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9241 }
9242 }
9243
9244 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9245 if (Values.size() == 1) {
9246 if (EVTBits == 32) {
9247 // Instead of a shuffle like this:
9248 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9249 // Check if it's possible to issue this instead.
9250 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9251 unsigned Idx = NonZeroMask.countr_zero();
9252 SDValue Item = Op.getOperand(Idx);
9253 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9254 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9255 }
9256 return SDValue();
9257 }
9258
9259 // A vector full of immediates; various special cases are already
9260 // handled, so this is best done with a single constant-pool load.
9261 if (IsAllConstants)
9262 return SDValue();
9263
9264 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9265 return V;
9266
9267 // See if we can use a vector load to get all of the elements.
9268 {
9269 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9270 if (SDValue LD =
9271 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9272 return LD;
9273 }
9274
9275 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9276 // build_vector and broadcast it.
9277 // TODO: We could probably generalize this more.
9278 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9279 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9280 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9281 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9282 // Make sure all the even/odd operands match.
9283 for (unsigned i = 2; i != NumElems; ++i)
9284 if (Ops[i % 2] != Op.getOperand(i))
9285 return false;
9286 return true;
9287 };
9288 if (CanSplat(Op, NumElems, Ops)) {
9289 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9290 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9291 // Create a new build vector and cast to v2i64/v2f64.
9292 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9293 DAG.getBuildVector(NarrowVT, dl, Ops));
9294 // Broadcast from v2i64/v2f64 and cast to final VT.
9295 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9296 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9297 NewBV));
9298 }
9299 }
9300
9301 // For AVX-length vectors, build the individual 128-bit pieces and use
9302 // shuffles to put them in place.
9303 if (VT.getSizeInBits() > 128) {
9304 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9305
9306 // Build both the lower and upper subvector.
9307 SDValue Lower =
9308 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9310 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9311
9312 // Recreate the wider vector with the lower and upper part.
9313 return concatSubVectors(Lower, Upper, DAG, dl);
9314 }
9315
9316 // Let legalizer expand 2-wide build_vectors.
9317 if (EVTBits == 64) {
9318 if (NumNonZero == 1) {
9319 // One half is zero or undef.
9320 unsigned Idx = NonZeroMask.countr_zero();
9322 Op.getOperand(Idx));
9323 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9324 }
9325 return SDValue();
9326 }
9327
9328 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9329 if (EVTBits == 8 && NumElems == 16)
9330 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9331 NumZero, DAG, Subtarget))
9332 return V;
9333
9334 if (EltVT == MVT::i16 && NumElems == 8)
9335 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9336 NumZero, DAG, Subtarget))
9337 return V;
9338
9339 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9340 if (EVTBits == 32 && NumElems == 4)
9341 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9342 return V;
9343
9344 // If element VT is == 32 bits, turn it into a number of shuffles.
9345 if (NumElems == 4 && NumZero > 0) {
9346 SmallVector<SDValue, 8> Ops(NumElems);
9347 for (unsigned i = 0; i < 4; ++i) {
9348 bool isZero = !NonZeroMask[i];
9349 if (isZero)
9350 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9351 else
9352 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9353 }
9354
9355 for (unsigned i = 0; i < 2; ++i) {
9356 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9357 default: llvm_unreachable("Unexpected NonZero count");
9358 case 0:
9359 Ops[i] = Ops[i*2]; // Must be a zero vector.
9360 break;
9361 case 1:
9362 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9363 break;
9364 case 2:
9365 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9366 break;
9367 case 3:
9368 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9369 break;
9370 }
9371 }
9372
9373 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9374 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9375 int MaskVec[] = {
9376 Reverse1 ? 1 : 0,
9377 Reverse1 ? 0 : 1,
9378 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9379 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9380 };
9381 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9382 }
9383
9384 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9385
9386 // Check for a build vector from mostly shuffle plus few inserting.
9387 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9388 return Sh;
9389
9390 // For SSE 4.1, use insertps to put the high elements into the low element.
9391 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9393 if (!Op.getOperand(0).isUndef())
9394 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9395 else
9396 Result = DAG.getUNDEF(VT);
9397
9398 for (unsigned i = 1; i < NumElems; ++i) {
9399 if (Op.getOperand(i).isUndef()) continue;
9400 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9401 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9402 }
9403 return Result;
9404 }
9405
9406 // Otherwise, expand into a number of unpckl*, start by extending each of
9407 // our (non-undef) elements to the full vector width with the element in the
9408 // bottom slot of the vector (which generates no code for SSE).
9409 SmallVector<SDValue, 8> Ops(NumElems);
9410 for (unsigned i = 0; i < NumElems; ++i) {
9411 if (!Op.getOperand(i).isUndef())
9412 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9413 else
9414 Ops[i] = DAG.getUNDEF(VT);
9415 }
9416
9417 // Next, we iteratively mix elements, e.g. for v4f32:
9418 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9419 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9420 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9421 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9422 // Generate scaled UNPCKL shuffle mask.
9424 for(unsigned i = 0; i != Scale; ++i)
9425 Mask.push_back(i);
9426 for (unsigned i = 0; i != Scale; ++i)
9427 Mask.push_back(NumElems+i);
9428 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9429
9430 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9431 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9432 }
9433 return Ops[0];
9434}
9435
9436// 256-bit AVX can use the vinsertf128 instruction
9437// to create 256-bit vectors from two other 128-bit ones.
9438// Classifies each subvector operand as undef/freeze(undef)/zero/non-zero, then
9439// either recurses on the two halves (>2 non-zeros) or builds the result with
9440// insert_subvector over an undef/zero/frozen base.
9441// TODO: Detect subvector broadcast here instead of DAG combine?
9440 const X86Subtarget &Subtarget) {
9441 SDLoc dl(Op);
9442 MVT ResVT = Op.getSimpleValueType();
9443
9444 assert((ResVT.is256BitVector() ||
9445 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9446
9447 unsigned NumOperands = Op.getNumOperands();
9448 unsigned NumFreezeUndef = 0;
9449 unsigned NumZero = 0;
9450 unsigned NumNonZero = 0;
9451 unsigned NonZeros = 0;
9452 for (unsigned i = 0; i != NumOperands; ++i) {
9453 SDValue SubVec = Op.getOperand(i);
9454 if (SubVec.isUndef())
9455 continue;
9456 if (ISD::isFreezeUndef(SubVec.getNode())) {
9457 // If the freeze(undef) has multiple uses then we must fold to zero.
9458 if (SubVec.hasOneUse())
9459 ++NumFreezeUndef;
9460 else
9461 ++NumZero;
9462 }
9463 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9464 ++NumZero;
9465 else {
9466 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9467 NonZeros |= 1 << i;
9468 ++NumNonZero;
9469 }
9470 }
9471
9472 // If we have more than 2 non-zeros, build each half separately.
9473 if (NumNonZero > 2) {
9474 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9475 ArrayRef<SDUse> Ops = Op->ops();
9476 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9477 Ops.slice(0, NumOperands/2));
9478 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9479 Ops.slice(NumOperands/2));
9480 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9481 }
9482
9483 // Otherwise, build it up through insert_subvectors.
9484 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9485 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9486 : DAG.getUNDEF(ResVT));
9487
9488 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9489 unsigned NumSubElems = SubVT.getVectorNumElements();
9490 for (unsigned i = 0; i != NumOperands; ++i) {
9491 if ((NonZeros & (1 << i)) == 0)
9492 continue;
9493
9494 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9495 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9496 }
9497
9498 return Vec;
9499}
9500
9501// Returns true if the given node is a type promotion (by concatenating i1
9502// zeros) of the result of a node that already zeros all upper bits of
9503// k-register.
9504// Lowers CONCAT_VECTORS of vXi1 mask vectors: single non-zero subvectors use
9505// insert_subvector (or KSHIFTL when zeros sit below undef MSBs), more than two
9506// operands recurse on the halves, and exactly two non-zeros use KUNPCK when
9507// the result is wide enough.
9508// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9506 const X86Subtarget &Subtarget,
9507 SelectionDAG & DAG) {
9508 SDLoc dl(Op);
9509 MVT ResVT = Op.getSimpleValueType();
9510 unsigned NumOperands = Op.getNumOperands();
9511
9512 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9513 "Unexpected number of operands in CONCAT_VECTORS");
9514
9515 uint64_t Zeros = 0;
9516 uint64_t NonZeros = 0;
9517 for (unsigned i = 0; i != NumOperands; ++i) {
9518 SDValue SubVec = Op.getOperand(i);
9519 if (SubVec.isUndef())
9520 continue;
9521 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9522 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9523 Zeros |= (uint64_t)1 << i;
9524 else
9525 NonZeros |= (uint64_t)1 << i;
9526 }
9527
9528 unsigned NumElems = ResVT.getVectorNumElements();
9529
9530 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9531 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9532 // insert_subvector will give us two kshifts.
9533 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9534 Log2_64(NonZeros) != NumOperands - 1) {
9535 unsigned Idx = Log2_64(NonZeros);
9536 SDValue SubVec = Op.getOperand(Idx);
9537 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9538 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9539 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9540 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9541 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9542 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9543 DAG.getVectorIdxConstant(0, dl));
9544 }
9545
9546 // If there are zero or one non-zeros we can handle this very simply.
9547 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9548 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9549 if (!NonZeros)
9550 return Vec;
9551 unsigned Idx = Log2_64(NonZeros);
9552 SDValue SubVec = Op.getOperand(Idx);
9553 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9554 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9555 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9556 }
9557
9558 if (NumOperands > 2) {
9559 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9560 ArrayRef<SDUse> Ops = Op->ops();
9561 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9562 Ops.slice(0, NumOperands / 2));
9563 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9564 Ops.slice(NumOperands / 2));
9565 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9566 }
9567
9568 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9569
9570 if (ResVT.getVectorNumElements() >= 16)
9571 return Op; // The operation is legal with KUNPCK
9572
9573 SDValue Vec =
9574 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9575 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9576 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9577 DAG.getVectorIdxConstant(NumElems / 2, dl));
}
9579
// Top-level CONCAT_VECTORS lowering: dispatches vXi1 mask concatenations to
// LowerCONCAT_VECTORSvXi1 and everything else to LowerAVXCONCAT_VECTORS.
9581 const X86Subtarget &Subtarget,
9582 SelectionDAG &DAG) {
9583 MVT VT = Op.getSimpleValueType();
9584 if (VT.getVectorElementType() == MVT::i1)
9585 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9586
9587 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9588 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9589 Op.getNumOperands() == 4)));
9590
9591 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9592 // from two other 128-bit ones.
9593
9594 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9595 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9596}
9597
9598//===----------------------------------------------------------------------===//
9599// Vector shuffle lowering
9600//
9601// This is an experimental code path for lowering vector shuffles on x86. It is
9602// designed to handle arbitrary vector shuffles and blends, gracefully
9603// degrading performance as necessary. It works hard to recognize idiomatic
9604// shuffles and lower them to optimal instruction patterns without leaving
9605// a framework that allows reasonably efficient handling of all vector shuffle
9606// patterns.
9607//===----------------------------------------------------------------------===//
9608
9609/// Tiny helper function to identify a no-op mask.
9610///
9611/// This is a somewhat boring predicate function. It checks whether the mask
9612/// array input, which is assumed to be a single-input shuffle mask of the kind
9613/// used by the X86 shuffle instructions (not a fully general
9614/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9615/// in-place shuffle are 'no-op's.
9616///
9617/// \returns true iff every mask element is either undef (-1) or the identity
9618/// index for its position.
9617 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9618 assert(Mask[i] >= -1 && "Out of bound mask element!");
9619 if (Mask[i] >= 0 && Mask[i] != i)
9620 return false;
9621 }
9622 return true;
9623}
9624
9625/// Test whether there are elements crossing LaneSizeInBits lanes in this
9626/// shuffle mask.
9627///
9628/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9629/// and we routinely test for these.
9630static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9631 unsigned ScalarSizeInBits,
9632 ArrayRef<int> Mask) {
9633 assert(LaneSizeInBits && ScalarSizeInBits &&
9634 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9635 "Illegal shuffle lane size");
9636 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9637 int Size = Mask.size();
9638 for (int i = 0; i < Size; ++i)
9639 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9640 return true;
9641 return false;
9642}
9643
9644/// Test whether there are elements crossing 128-bit lanes in this
9645/// shuffle mask.
9646/// Convenience wrapper that derives the scalar size from \p VT and delegates
9647/// to isLaneCrossingShuffleMask with a fixed 128-bit lane width.
9647 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9648}
9649
9650/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9651/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9652/// better support 'repeated mask + lane permute' style shuffles.
9653static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9654 unsigned ScalarSizeInBits,
9655 ArrayRef<int> Mask) {
9656 assert(LaneSizeInBits && ScalarSizeInBits &&
9657 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9658 "Illegal shuffle lane size");
9659 int NumElts = Mask.size();
9660 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9661 int NumLanes = NumElts / NumEltsPerLane;
9662 if (NumLanes > 1) {
9663 for (int i = 0; i != NumLanes; ++i) {
9664 int SrcLane = -1;
9665 for (int j = 0; j != NumEltsPerLane; ++j) {
9666 int M = Mask[(i * NumEltsPerLane) + j];
9667 if (M < 0)
9668 continue;
9669 int Lane = (M % NumElts) / NumEltsPerLane;
9670 if (SrcLane >= 0 && SrcLane != Lane)
9671 return true;
9672 SrcLane = Lane;
9673 }
9674 }
9675 }
9676 return false;
9677}
9678
9679/// Test whether a shuffle mask is equivalent within each sub-lane.
9680///
9681/// This checks a shuffle mask to see if it is performing the same
9682/// lane-relative shuffle in each sub-lane. This trivially implies
9683/// that it is also not lane-crossing. It may however involve a blend from the
9684/// same lane of a second vector.
9685///
9686/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9687/// non-trivial to compute in the face of undef lanes. The representation is
9688/// suitable for use with existing 128-bit shuffles as entries from the second
9689/// vector have been remapped to [LaneSize, 2*LaneSize).
9690static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9691 ArrayRef<int> Mask,
9692 SmallVectorImpl<int> &RepeatedMask) {
9693 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9694 RepeatedMask.assign(LaneSize, -1);
9695 int Size = Mask.size();
9696 for (int i = 0; i < Size; ++i) {
9697 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9698 if (Mask[i] < 0)
9699 continue;
9700 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9701 // This entry crosses lanes, so there is no way to model this shuffle.
9702 return false;
9703
9704 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9705 // Adjust second vector indices to start at LaneSize instead of Size.
9706 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9707 : Mask[i] % LaneSize + LaneSize;
9708 if (RepeatedMask[i % LaneSize] < 0)
9709 // This is the first non-undef entry in this slot of a 128-bit lane.
9710 RepeatedMask[i % LaneSize] = LocalM;
9711 else if (RepeatedMask[i % LaneSize] != LocalM)
9712 // Found a mismatch with the repeated mask.
9713 return false;
9714 }
9715 return true;
9716}
9717
9718/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9719static bool
9721 SmallVectorImpl<int> &RepeatedMask) {
9722 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9723}
9724
9725static bool
9727 SmallVector<int, 32> RepeatedMask;
9728 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9729}
9730
9731/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9732static bool
9734 SmallVectorImpl<int> &RepeatedMask) {
9735 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9736}
9737
9738/// Test whether a target shuffle mask is equivalent within each sub-lane.
9739/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9740static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9741 unsigned EltSizeInBits,
9742 ArrayRef<int> Mask,
9743 SmallVectorImpl<int> &RepeatedMask) {
9744 int LaneSize = LaneSizeInBits / EltSizeInBits;
9745 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9746 int Size = Mask.size();
9747 for (int i = 0; i < Size; ++i) {
9748 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9749 if (Mask[i] == SM_SentinelUndef)
9750 continue;
9751 if (Mask[i] == SM_SentinelZero) {
9752 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9753 return false;
9754 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9755 continue;
9756 }
9757 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9758 // This entry crosses lanes, so there is no way to model this shuffle.
9759 return false;
9760
9761 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9762 // later vector indices to start at multiples of LaneSize instead of Size.
9763 int LaneM = Mask[i] / Size;
9764 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9765 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9766 // This is the first non-undef entry in this slot of a 128-bit lane.
9767 RepeatedMask[i % LaneSize] = LocalM;
9768 else if (RepeatedMask[i % LaneSize] != LocalM)
9769 // Found a mismatch with the repeated mask.
9770 return false;
9771 }
9772 return true;
9773}
9774
9775/// Test whether a target shuffle mask is equivalent within each sub-lane.
9776/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9777static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9778 ArrayRef<int> Mask,
9779 SmallVectorImpl<int> &RepeatedMask) {
9780 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9781 Mask, RepeatedMask);
9782}
9783
9784/// Checks whether the vector elements referenced by two shuffle masks are
9785/// equivalent.
9786static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9787 int Idx, int ExpectedIdx) {
9788 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9789 ExpectedIdx < MaskSize && "Out of range element index");
9790 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9791 return false;
9792
9793 EVT VT = Op.getValueType();
9794 switch (Op.getOpcode()) {
9795 case ISD::BUILD_VECTOR:
9796 // If the values are build vectors, we can look through them to find
9797 // equivalent inputs that make the shuffles equivalent.
9798 // TODO: Handle MaskSize != Op.getNumOperands()?
9799 if (MaskSize == (int)Op.getNumOperands() &&
9800 MaskSize == (int)ExpectedOp.getNumOperands())
9801 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9802 break;
9803 case ISD::BITCAST:
9804 if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
9806 EVT SrcVT = Src.getValueType();
9807 if (SrcVT.isVector() &&
9808 (SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9809 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9810 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9811 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9812 Idx / Scale, ExpectedIdx / Scale);
9813 }
9814 }
9815 break;
9816 case ISD::VECTOR_SHUFFLE: {
9817 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9818 return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize &&
9819 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9820 }
9821 case X86ISD::VBROADCAST:
9823 // TODO: Handle MaskSize != VT.getVectorNumElements()?
9824 return (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize);
9825 case X86ISD::HADD:
9826 case X86ISD::HSUB:
9827 case X86ISD::FHADD:
9828 case X86ISD::FHSUB:
9829 case X86ISD::PACKSS:
9830 case X86ISD::PACKUS:
9831 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9832 // TODO: Handle MaskSize != NumElts?
9833 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9834 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9835 int NumElts = VT.getVectorNumElements();
9836 if (MaskSize == NumElts) {
9837 int NumLanes = VT.getSizeInBits() / 128;
9838 int NumEltsPerLane = NumElts / NumLanes;
9839 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9840 bool SameLane =
9841 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9842 bool SameElt =
9843 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9844 return SameLane && SameElt;
9845 }
9846 }
9847 break;
9848 }
9849
9850 return false;
9851}
9852
9853/// Checks whether a shuffle mask is equivalent to an explicit list of
9854/// arguments.
9855///
9856/// This is a fast way to test a shuffle mask against a fixed pattern:
9857///
9858/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
9859///
9860/// It returns true if the mask is exactly as wide as the argument list, and
9861/// each element of the mask is either -1 (signifying undef) or the value given
9862/// in the argument.
9863static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9864 SDValue V1 = SDValue(),
9865 SDValue V2 = SDValue()) {
9866 int Size = Mask.size();
9867 if (Size != (int)ExpectedMask.size())
9868 return false;
9869
9870 for (int i = 0; i < Size; ++i) {
9871 assert(Mask[i] >= -1 && "Out of bound mask element!");
9872 int MaskIdx = Mask[i];
9873 int ExpectedIdx = ExpectedMask[i];
9874 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9875 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9876 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9877 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9878 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9879 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9880 return false;
9881 }
9882 }
9883 return true;
9884}
9885
9886/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9887///
9888/// The masks must be exactly the same width.
9889///
9890/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9891/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9892///
9893/// SM_SentinelZero is accepted as a valid negative index but must match in
9894/// both, or via a known bits test.
9896 ArrayRef<int> ExpectedMask,
9897 const SelectionDAG &DAG,
9898 SDValue V1 = SDValue(),
9899 SDValue V2 = SDValue()) {
9900 int Size = Mask.size();
9901 if (Size != (int)ExpectedMask.size())
9902 return false;
9903 assert(llvm::all_of(ExpectedMask,
9904 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9905 "Illegal target shuffle mask");
9906
9907 // Check for out-of-range target shuffle mask indices.
9908 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9909 return false;
9910
9911 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9912 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9913 !V1.getValueType().isVector()))
9914 V1 = SDValue();
9915 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9916 !V2.getValueType().isVector()))
9917 V2 = SDValue();
9918
9919 APInt ZeroV1 = APInt::getZero(Size);
9920 APInt ZeroV2 = APInt::getZero(Size);
9921
9922 for (int i = 0; i < Size; ++i) {
9923 int MaskIdx = Mask[i];
9924 int ExpectedIdx = ExpectedMask[i];
9925 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9926 continue;
9927 if (MaskIdx == SM_SentinelZero) {
9928 // If we need this expected index to be a zero element, then update the
9929 // relevant zero mask and perform the known bits at the end to minimize
9930 // repeated computes.
9931 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9932 if (ExpectedV &&
9933 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9934 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9935 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9936 ZeroMask.setBit(BitIdx);
9937 continue;
9938 }
9939 }
9940 if (MaskIdx >= 0) {
9941 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9942 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9943 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9944 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9945 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9946 continue;
9947 }
9948 return false;
9949 }
9950 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9951 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9952}
9953
9954// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9955// instructions.
9957 const SelectionDAG &DAG) {
9958 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9959 return false;
9960
9961 SmallVector<int, 8> Unpcklwd;
9962 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9963 /* Unary = */ false);
9964 SmallVector<int, 8> Unpckhwd;
9965 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9966 /* Unary = */ false);
9967 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9968 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9969 return IsUnpackwdMask;
9970}
9971
9973 const SelectionDAG &DAG) {
9974 // Create 128-bit vector type based on mask size.
9975 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9976 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9977
9978 // We can't assume a canonical shuffle mask, so try the commuted version too.
9979 SmallVector<int, 4> CommutedMask(Mask);
9981
9982 // Match any of unary/binary or low/high.
9983 for (unsigned i = 0; i != 4; ++i) {
9984 SmallVector<int, 16> UnpackMask;
9985 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9986 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9987 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9988 return true;
9989 }
9990 return false;
9991}
9992
9993/// Return true if a shuffle mask chooses elements identically in its top and
9994/// bottom halves. For example, any splat mask has the same top and bottom
9995/// halves. If an element is undefined in only one half of the mask, the halves
9996/// are not considered identical.
9998 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9999 unsigned HalfSize = Mask.size() / 2;
10000 for (unsigned i = 0; i != HalfSize; ++i) {
10001 if (Mask[i] != Mask[i + HalfSize])
10002 return false;
10003 }
10004 return true;
10005}
10006
10007/// Get a 4-lane 8-bit shuffle immediate for a mask.
10008///
10009/// This helper function produces an 8-bit shuffle immediate corresponding to
10010/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10011/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10012/// example.
10013///
10014/// NB: We rely heavily on "undef" masks preserving the input lane.
10015static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10016 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10017 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10018 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10019 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10020 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10021
10022 // If the mask only uses one non-undef element, then fully 'splat' it to
10023 // improve later broadcast matching.
10024 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10025 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10026
10027 int FirstElt = Mask[FirstIndex];
10028 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10029 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10030
10031 unsigned Imm = 0;
10032 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10033 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10034 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10035 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10036 return Imm;
10037}
10038
10040 SelectionDAG &DAG) {
10041 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10042}
10043
10044// Canonicalize SHUFPD mask to improve chances of further folding.
10045// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10046static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10047 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10048 "Unexpected SHUFPD mask size");
10049 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10050 "Unexpected SHUFPD mask elements");
10051
10052 // If the mask only uses one non-undef element, then fully 'splat' it to
10053 // improve later broadcast matching.
10054 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10055 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10056 "All undef shuffle mask");
10057
10058 int FirstElt = Mask[FirstIndex];
10059 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10060 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10061 unsigned Imm = 0;
10062 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10063 Imm |= FirstElt << I;
10064 return Imm;
10065 }
10066
10067 // Attempt to keep any undef elements in place to improve chances of the
10068 // shuffle becoming a (commutative) blend.
10069 unsigned Imm = 0;
10070 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10071 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10072
10073 return Imm;
10074}
10075
10077 SelectionDAG &DAG) {
10078 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10079}
10080
10081// The Shuffle result is as follow:
10082// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
10083// Each Zeroable's element correspond to a particular Mask's element.
10084// As described in computeZeroableShuffleElements function.
10085//
10086// The function looks for a sub-mask that the nonzero elements are in
10087// increasing order. If such sub-mask exist. The function returns true.
10088static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10089 ArrayRef<int> Mask, const EVT &VectorType,
10090 bool &IsZeroSideLeft) {
10091 int NextElement = -1;
10092 // Check if the Mask's nonzero elements are in increasing order.
10093 for (int i = 0, e = Mask.size(); i < e; i++) {
10094 // Checks if the mask's zeros elements are built from only zeros.
10095 assert(Mask[i] >= -1 && "Out of bound mask element!");
10096 if (Mask[i] < 0)
10097 return false;
10098 if (Zeroable[i])
10099 continue;
10100 // Find the lowest non zero element
10101 if (NextElement < 0) {
10102 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10103 IsZeroSideLeft = NextElement != 0;
10104 }
10105 // Exit if the mask's non zero elements are not in increasing order.
10106 if (NextElement != Mask[i])
10107 return false;
10108 NextElement++;
10109 }
10110 return true;
10111}
10112
10113/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10115 ArrayRef<int> Mask, SDValue V1,
10116 SDValue V2, const APInt &Zeroable,
10117 const X86Subtarget &Subtarget,
10118 SelectionDAG &DAG) {
10119 int Size = Mask.size();
10120 int LaneSize = 128 / VT.getScalarSizeInBits();
10121 const int NumBytes = VT.getSizeInBits() / 8;
10122 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10123
10124 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10125 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10126 (Subtarget.hasBWI() && VT.is512BitVector()));
10127
10128 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10129 // Sign bit set in i8 mask means zero element.
10130 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10131
10132 SDValue V;
10133 for (int i = 0; i < NumBytes; ++i) {
10134 int M = Mask[i / NumEltBytes];
10135 if (M < 0) {
10136 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10137 continue;
10138 }
10139 if (Zeroable[i / NumEltBytes]) {
10140 PSHUFBMask[i] = ZeroMask;
10141 continue;
10142 }
10143
10144 // We can only use a single input of V1 or V2.
10145 SDValue SrcV = (M >= Size ? V2 : V1);
10146 if (V && V != SrcV)
10147 return SDValue();
10148 V = SrcV;
10149 M %= Size;
10150
10151 // PSHUFB can't cross lanes, ensure this doesn't happen.
10152 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10153 return SDValue();
10154
10155 M = M % LaneSize;
10156 M = M * NumEltBytes + (i % NumEltBytes);
10157 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10158 }
10159 assert(V && "Failed to find a source input");
10160
10161 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10162 return DAG.getBitcast(
10163 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10164 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10165}
10166
// Forward declaration; the definition appears later in this file.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);
10170
10171// X86 has dedicated shuffle that can be lowered to VEXPAND
10173 SDValue V2, ArrayRef<int> Mask,
10174 const APInt &Zeroable,
10175 const X86Subtarget &Subtarget,
10176 SelectionDAG &DAG) {
10177 bool IsLeftZeroSide = true;
10178 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10179 IsLeftZeroSide))
10180 return SDValue();
10181 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10183 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10184 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10185 unsigned NumElts = VT.getVectorNumElements();
10186 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10187 "Unexpected number of vector elements");
10188 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10189 Subtarget, DAG, DL);
10190 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10191 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10192 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10193}
10194
10195static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10196 unsigned &UnpackOpcode, bool IsUnary,
10197 ArrayRef<int> TargetMask, const SDLoc &DL,
10198 SelectionDAG &DAG,
10199 const X86Subtarget &Subtarget) {
10200 int NumElts = VT.getVectorNumElements();
10201
10202 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10203 for (int i = 0; i != NumElts; i += 2) {
10204 int M1 = TargetMask[i + 0];
10205 int M2 = TargetMask[i + 1];
10206 Undef1 &= (SM_SentinelUndef == M1);
10207 Undef2 &= (SM_SentinelUndef == M2);
10208 Zero1 &= isUndefOrZero(M1);
10209 Zero2 &= isUndefOrZero(M2);
10210 }
10211 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10212 "Zeroable shuffle detected");
10213
10214 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10215 SmallVector<int, 64> Unpckl, Unpckh;
10216 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10217 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10218 (IsUnary ? V1 : V2))) {
10219 UnpackOpcode = X86ISD::UNPCKL;
10220 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10221 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10222 return true;
10223 }
10224
10225 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10226 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10227 (IsUnary ? V1 : V2))) {
10228 UnpackOpcode = X86ISD::UNPCKH;
10229 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10230 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10231 return true;
10232 }
10233
10234 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10235 if (IsUnary && (Zero1 || Zero2)) {
10236 // Don't bother if we can blend instead.
10237 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10238 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10239 return false;
10240
10241 bool MatchLo = true, MatchHi = true;
10242 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10243 int M = TargetMask[i];
10244
10245 // Ignore if the input is known to be zero or the index is undef.
10246 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10247 (M == SM_SentinelUndef))
10248 continue;
10249
10250 MatchLo &= (M == Unpckl[i]);
10251 MatchHi &= (M == Unpckh[i]);
10252 }
10253
10254 if (MatchLo || MatchHi) {
10255 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10256 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10257 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10258 return true;
10259 }
10260 }
10261
10262 // If a binary shuffle, commute and try again.
10263 if (!IsUnary) {
10265 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10266 UnpackOpcode = X86ISD::UNPCKL;
10267 std::swap(V1, V2);
10268 return true;
10269 }
10270
10272 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10273 UnpackOpcode = X86ISD::UNPCKH;
10274 std::swap(V1, V2);
10275 return true;
10276 }
10277 }
10278
10279 return false;
10280}
10281
10282// X86 has dedicated unpack instructions that can handle specific blend
10283// operations: UNPCKH and UNPCKL.
10285 SDValue V2, ArrayRef<int> Mask,
10286 SelectionDAG &DAG) {
10287 SmallVector<int, 8> Unpckl;
10288 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10289 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10290 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10291
10292 SmallVector<int, 8> Unpckh;
10293 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10294 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10295 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10296
10297 // Commute and try again.
10299 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10300 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10301
10303 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10304 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10305
10306 return SDValue();
10307}
10308
10309/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10310/// followed by unpack 256-bit.
10312 SDValue V2, ArrayRef<int> Mask,
10313 SelectionDAG &DAG) {
10314 SmallVector<int, 32> Unpckl, Unpckh;
10315 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10316 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10317
10318 unsigned UnpackOpcode;
10319 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10320 UnpackOpcode = X86ISD::UNPCKL;
10321 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10322 UnpackOpcode = X86ISD::UNPCKH;
10323 else
10324 return SDValue();
10325
10326 // This is a "natural" unpack operation (rather than the 128-bit sectored
10327 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10328 // input in order to use the x86 instruction.
10329 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10330 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10331 V1 = DAG.getBitcast(VT, V1);
10332 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10333}
10334
10335// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10336// source into the lower elements and zeroing the upper elements.
10337static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10338 ArrayRef<int> Mask, const APInt &Zeroable,
10339 const X86Subtarget &Subtarget) {
10340 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10341 return false;
10342
10343 unsigned NumElts = Mask.size();
10344 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10345 unsigned MaxScale = 64 / EltSizeInBits;
10346
10347 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10348 unsigned SrcEltBits = EltSizeInBits * Scale;
10349 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10350 continue;
10351 unsigned NumSrcElts = NumElts / Scale;
10352 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10353 continue;
10354 unsigned UpperElts = NumElts - NumSrcElts;
10355 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10356 continue;
10357 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10358 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10359 DstVT = MVT::getIntegerVT(EltSizeInBits);
10360 if ((NumSrcElts * EltSizeInBits) >= 128) {
10361 // ISD::TRUNCATE
10362 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10363 } else {
10364 // X86ISD::VTRUNC
10365 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10366 }
10367 return true;
10368 }
10369
10370 return false;
10371}
10372
10373// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10374// element padding to the final DstVT.
10375static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10376 const X86Subtarget &Subtarget,
10377 SelectionDAG &DAG, bool ZeroUppers) {
10378 MVT SrcVT = Src.getSimpleValueType();
10379 MVT DstSVT = DstVT.getScalarType();
10380 unsigned NumDstElts = DstVT.getVectorNumElements();
10381 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10382 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10383
10384 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10385 return SDValue();
10386
10387 // Perform a direct ISD::TRUNCATE if possible.
10388 if (NumSrcElts == NumDstElts)
10389 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10390
10391 if (NumSrcElts > NumDstElts) {
10392 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10393 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10394 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10395 }
10396
10397 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10398 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10399 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10400 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10401 DstVT.getSizeInBits());
10402 }
10403
10404 // Non-VLX targets must truncate from a 512-bit type, so we need to
10405 // widen, truncate and then possibly extract the original subvector.
10406 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10407 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10408 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10409 }
10410
10411 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10412 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10413 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10414 if (DstVT != TruncVT)
10415 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10416 DstVT.getSizeInBits());
10417 return Trunc;
10418}
10419
10420// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10421//
10422// An example is the following:
10423//
10424// t0: ch = EntryToken
10425// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10426// t25: v4i32 = truncate t2
10427// t41: v8i16 = bitcast t25
10428// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10429// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10430// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10431// t18: v2i64 = bitcast t51
10432//
10433// One can just use a single vpmovdw instruction, without avx512vl we need to
10434// use the zmm variant and extract the lower subvector, padding with zeroes.
10435// TODO: Merge with lowerShuffleAsVTRUNC.
10437 SDValue V2, ArrayRef<int> Mask,
10438 const APInt &Zeroable,
10439 const X86Subtarget &Subtarget,
10440 SelectionDAG &DAG) {
10441 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10442 if (!Subtarget.hasAVX512())
10443 return SDValue();
10444
10445 unsigned NumElts = VT.getVectorNumElements();
10446 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10447 unsigned MaxScale = 64 / EltSizeInBits;
10448 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10449 unsigned SrcEltBits = EltSizeInBits * Scale;
10450 unsigned NumSrcElts = NumElts / Scale;
10451 unsigned UpperElts = NumElts - NumSrcElts;
10452 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10453 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10454 continue;
10455
10456 // Attempt to find a matching source truncation, but as a fall back VLX
10457 // cases can use the VPMOV directly.
10458 SDValue Src = peekThroughBitcasts(V1);
10459 if (Src.getOpcode() == ISD::TRUNCATE &&
10460 Src.getScalarValueSizeInBits() == SrcEltBits) {
10461 Src = Src.getOperand(0);
10462 } else if (Subtarget.hasVLX()) {
10463 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10464 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10465 Src = DAG.getBitcast(SrcVT, Src);
10466 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10467 if (Scale == 2 &&
10468 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10469 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10470 return SDValue();
10471 } else
10472 return SDValue();
10473
10474 // VPMOVWB is only available with avx512bw.
10475 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10476 return SDValue();
10477
10478 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10479 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10480 }
10481
10482 return SDValue();
10483}
10484
10485// Attempt to match binary shuffle patterns as a truncate.
10487 SDValue V2, ArrayRef<int> Mask,
10488 const APInt &Zeroable,
10489 const X86Subtarget &Subtarget,
10490 SelectionDAG &DAG) {
10491 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10492 "Unexpected VTRUNC type");
// The VPMOV-style truncation nodes built below require AVX512 at minimum.
10493 if (!Subtarget.hasAVX512())
10494 return SDValue();
10495
10496 unsigned NumElts = VT.getVectorNumElements();
10497 unsigned EltSizeInBits = VT.getScalarSizeInBits();
// 64-bit is the widest legal integer scalar, which bounds the truncation
// factor we can attempt.
10498 unsigned MaxScale = 64 / EltSizeInBits;
// Try each power-of-2 truncation factor: Scale = 2, 4, 8 (Scale doubles).
10499 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10500 // TODO: Support non-BWI VPMOVWB truncations?
// Truncating from sub-32-bit source elements (i.e. i16 -> i8) needs BWI.
10501 unsigned SrcEltBits = EltSizeInBits * Scale;
10502 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10503 continue;
10504
10505 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10506 // Bail if the V2 elements are undef.
10507 unsigned NumHalfSrcElts = NumElts / Scale;
10508 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10509 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10510 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10511 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10512 continue;
10513
10514 // The elements beyond the truncation must be undef/zero.
10515 unsigned UpperElts = NumElts - NumSrcElts;
10516 if (UpperElts > 0 &&
10517 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10518 continue;
// Record whether the upper elements are all-undef (rather than merely
// zeroable) so the trunc node builder knows if explicit zeroing is needed.
10519 bool UndefUppers =
10520 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10521
10522 // For offset truncations, ensure that the concat is cheap.
10523 if (Offset) {
// A concat is considered cheap when both halves are subvector extracts of
// the same wider vector, or when both are plain loads.
// NOTE(review): the callee on the elided line below appears to check that
// the two loads are consecutive in memory - confirm against the full file.
10524 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10525 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10526 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10527 return Lo.getOperand(0) == Hi.getOperand(0);
10528 if (ISD::isNormalLoad(Lo.getNode()) &&
10529 ISD::isNormalLoad(Hi.getNode())) {
10530 auto *LDLo = cast<LoadSDNode>(Lo);
10531 auto *LDHi = cast<LoadSDNode>(Hi);
10533 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10534 }
10535 return false;
10536 };
10537 if (!IsCheapConcat(peekThroughBitcasts(V1), peekThroughBitcasts(V2)))
10538 continue;
10539 }
10540
10541 // As we're using both sources then we need to concat them together
10542 // and truncate from the double-sized src.
10543 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10544 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10545
10546 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10547 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10548 Src = DAG.getBitcast(SrcVT, Src);
10549
10550 // Shift the offset'd elements into place for the truncation.
10551 // TODO: Use getTargetVShiftByConstNode.
10552 if (Offset)
10553 Src = DAG.getNode(
10554 X86ISD::VSRLI, DL, SrcVT, Src,
10555 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10556
10557 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10558 }
10559 }
10560
10561 return SDValue();
10562}
10563
10564/// Check whether a compaction lowering can be done by dropping even/odd
10565/// elements and compute how many times even/odd elements must be dropped.
10566///
10567/// This handles shuffles which take every Nth element where N is a power of
10568/// two. Example shuffle masks:
10569///
10570/// (even)
10571/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10572/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10573/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10574/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10575/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10576/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10577///
10578/// (odd)
10579/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10580/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10581///
10582/// Any of these lanes can of course be undef.
10583///
10584/// This routine only supports N <= 3.
10585/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10586/// for larger N.
10587///
10588/// \returns N above, or the number of times even/odd elements must be dropped
10589/// if there is such a number. Otherwise returns zero.
10590static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10591 bool IsSingleInput) {
10592 // The modulus for the shuffle vector entries is based on whether this is
10593 // a single input or not.
10594 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10595 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10596 "We should only be called with masks with a power-of-2 size!");
10597
10598 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10599 int Offset = MatchEven ? 0 : 1;
10600
10601 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10602 // and 2^3 simultaneously. This is because we may have ambiguity with
10603 // partially undef inputs.
10604 bool ViableForN[3] = {true, true, true};
10605
10606 for (int i = 0, e = Mask.size(); i < e; ++i) {
10607 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10608 // want.
10609 if (Mask[i] < 0)
10610 continue;
10611
10612 bool IsAnyViable = false;
10613 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10614 if (ViableForN[j]) {
10615 uint64_t N = j + 1;
10616
10617 // The shuffle mask must be equal to (i * 2^N) % M.
10618 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10619 IsAnyViable = true;
10620 else
10621 ViableForN[j] = false;
10622 }
10623 // Early exit if we exhaust the possible powers of two.
10624 if (!IsAnyViable)
10625 break;
10626 }
10627
10628 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10629 if (ViableForN[j])
10630 return j + 1;
10631
10632 // Return 0 as there is no viable power of two.
10633 return 0;
10634}
10635
10636// X86 has dedicated pack instructions that can handle specific truncation
10637// operations: PACKSS and PACKUS.
10638// Checks for compaction shuffle masks if MaxStages > 1.
10639// TODO: Add support for matching multiple PACKSS/PACKUS stages.
// On success returns true and sets SrcVT (the type to pack from), V1/V2 (the
// peeked-through pack sources) and PackOpcode (X86ISD::PACKSS or PACKUS).
10640static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10641 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10642 const SelectionDAG &DAG,
10643 const X86Subtarget &Subtarget,
10644 unsigned MaxStages = 1) {
10645 unsigned NumElts = VT.getVectorNumElements();
10646 unsigned BitSize = VT.getScalarSizeInBits();
10647 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10648 "Illegal maximum compaction");
10649
// Check whether N1/N2 can legally feed a single PACKSS/PACKUS from PackVT
// down to VT without the saturation altering any values; on success the
// by-ref outputs are updated and true is returned.
10650 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10651 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10652 unsigned NumPackedBits = NumSrcBits - BitSize;
10653 N1 = peekThroughBitcasts(N1);
10654 N2 = peekThroughBitcasts(N2);
10655 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10656 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10657 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10658 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
// After stripping bitcasts, each source must already be at the pack source
// element width (undef/zero sources are exempt - they pack to anything).
10659 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10660 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10661 return false;
// PACKUS path: the bits being packed away must be known zero. The guard
// matches instruction availability - 32->16 PACKUSDW needs SSE41, while
// the 16->8 (BitSize == 8) PACKUSWB form is baseline SSE2.
10662 if (Subtarget.hasSSE41() || BitSize == 8) {
10663 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10664 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10665 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10666 V1 = N1;
10667 V2 = N2;
10668 SrcVT = PackVT;
10669 PackOpcode = X86ISD::PACKUS;
10670 return true;
10671 }
10672 }
// PACKSS path: each source needs more sign bits than are packed away so
// signed saturation cannot change the value.
10673 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10674 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10675 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10676 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10677 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10678 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10679 V1 = N1;
10680 V2 = N2;
10681 SrcVT = PackVT;
10682 PackOpcode = X86ISD::PACKSS;
10683 return true;
10684 }
10685 return false;
10686 };
10687
10688 // Attempt to match against wider and wider compaction patterns.
10689 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10690 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10691 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10692
10693 // Try binary shuffle.
10694 SmallVector<int, 32> BinaryMask;
10695 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10696 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10697 if (MatchPACK(V1, V2, PackVT))
10698 return true;
10699
10700 // Try unary shuffle.
10701 SmallVector<int, 32> UnaryMask;
10702 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10703 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10704 if (MatchPACK(V1, V1, PackVT))
10705 return true;
10706 }
10707
10708 return false;
10709}
10710
// Lower a compaction shuffle via one or more PACKSS/PACKUS stages, as
// matched by matchShuffleWithPACK. (The signature head line is missing from
// this view.) Returns SDValue() if no pack lowering applies.
10712 SDValue V2, ArrayRef<int> Mask,
10713 const X86Subtarget &Subtarget,
10714 SelectionDAG &DAG) {
10715 MVT PackVT;
10716 unsigned PackOpcode;
10717 unsigned SizeBits = VT.getSizeInBits();
10718 unsigned EltBits = VT.getScalarSizeInBits();
// 64-bit is the widest legal integer scalar, so at most log2(64/EltBits)
// halving stages are possible.
10719 unsigned MaxStages = Log2_32(64 / EltBits);
10720 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10721 Subtarget, MaxStages))
10722 return SDValue();
10723
10724 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10725 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10726
10727 // Don't lower multi-stage packs on AVX512, truncation is better.
10728 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10729 return SDValue();
10730
10731 // Pack to the largest type possible:
10732 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10733 unsigned MaxPackBits = 16;
10734 if (CurrentEltBits > 16 &&
10735 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10736 MaxPackBits = 32;
10737
10738 // Repeatedly pack down to the target size.
10739 SDValue Res;
10740 for (unsigned i = 0; i != NumStages; ++i) {
10741 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10742 unsigned NumSrcElts = SizeBits / SrcEltBits;
10743 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10744 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10745 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10746 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
// After the first stage both pack inputs are the previous stage's result,
// halving the element width on every iteration.
10747 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10748 DAG.getBitcast(SrcVT, V2));
10749 V1 = V2 = Res;
10750 CurrentEltBits /= 2;
10751 }
10752 assert(Res && Res.getValueType() == VT &&
10753 "Failed to lower compaction shuffle");
10754 return Res;
10755}
10756
10757/// Try to emit a bitmask instruction for a shuffle.
10758///
10759/// This handles cases where we can model a blend exactly as a bitmask due to
10760/// one of the inputs being zeroable.
// Returns an AND of the single non-zeroable input with a constant mask, or
// SDValue() when the shuffle isn't an in-place blend of one input and zeros.
10762 SDValue V2, ArrayRef<int> Mask,
10763 const APInt &Zeroable,
10764 const X86Subtarget &Subtarget,
10765 SelectionDAG &DAG) {
10766 MVT MaskVT = VT;
10767 MVT EltVT = VT.getVectorElementType();
10768 SDValue Zero, AllOnes;
10769 // Use f64 if i64 isn't legal.
10770 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10771 EltVT = MVT::f64;
10772 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10773 }
10774
10775 MVT LogicVT = VT;
10776 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10777 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10778 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
10779 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
// For FP vectors the AND is performed in the same-width integer type.
10780 LogicVT =
10781 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10782 } else {
10783 Zero = DAG.getConstant(0, DL, EltVT);
10784 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10785 }
10786
10787 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10788 SDValue V;
10789 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10790 if (Zeroable[i])
10791 continue;
// Every non-zeroable lane must keep its position...
10792 if (Mask[i] % Size != i)
10793 return SDValue(); // Not a blend.
// ...and all such lanes must come from the same single source.
10794 if (!V)
10795 V = Mask[i] < Size ? V1 : V2;
10796 else if (V != (Mask[i] < Size ? V1 : V2))
10797 return SDValue(); // Can only let one input through the mask.
10798
10799 VMaskOps[i] = AllOnes;
10800 }
10801 if (!V)
10802 return SDValue(); // No non-zeroable elements!
10803
10804 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10805 VMask = DAG.getBitcast(LogicVT, VMask);
10806 V = DAG.getBitcast(LogicVT, V);
10807 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10808 return DAG.getBitcast(VT, And);
10809}
10810
10811/// Try to emit a blend instruction for a shuffle using bit math.
10812///
10813/// This is used as a fallback approach when first class blend instructions are
10814/// unavailable. Currently it is only suitable for integer vectors, but could
10815/// be generalized for floating point vectors if desirable.
// Builds a constant lane-select vector (all-ones lanes pick V1, zero lanes
// pick V2) and hands it to getBitSelect - presumably emitting
// (V1 & M) | (V2 & ~M); confirm against getBitSelect's definition.
10817 SDValue V2, ArrayRef<int> Mask,
10818 SelectionDAG &DAG) {
10819 assert(VT.isInteger() && "Only supports integer vector types!");
10820 MVT EltVT = VT.getVectorElementType();
10821 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10822 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
// NOTE(review): MaskOps' declaration sits on a line elided from this view.
10824 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
// Each lane must be an identity pick from one of the two inputs - any
// element movement disqualifies the bit-blend.
10825 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10826 return SDValue(); // Shuffled input!
10827 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10828 }
10829
10830 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10831 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10832}
10833
// Forward declaration (first line elided from this view): defined later in
// the file and used below by lowerShuffleAsBlend to emit k-register masked
// moves.
10835 SDValue PreservedSrc,
10836 const X86Subtarget &Subtarget,
10837 SelectionDAG &DAG);
10838
// Match a shuffle as an immediate-controlled blend (signature head elided
// from this view). Normalizes Mask in place to canonical blend form and
// computes the blend immediate in BlendMask. Zeroable lanes may be satisfied
// by forcing an undef/zero input to a real zero vector, reported via
// ForceV1Zero/ForceV2Zero. Returns false if any element changes position.
10841 const APInt &Zeroable, bool &ForceV1Zero,
10842 bool &ForceV2Zero, uint64_t &BlendMask) {
// NOTE(review): V1IsZeroOrUndef's initializer line is elided here; by
// symmetry with V2 below it should test V1.isUndef() / all-zeros.
10843 bool V1IsZeroOrUndef =
10845 bool V2IsZeroOrUndef =
10846 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10847
10848 BlendMask = 0;
10849 ForceV1Zero = false, ForceV2Zero = false;
10850 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10851
10852 int NumElts = Mask.size();
10853 int NumLanes = VT.getSizeInBits() / 128;
10854 int NumEltsPerLane = NumElts / NumLanes;
10855 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10856
10857 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10858 // then ensure the blend mask part for that lane just references that input.
10859 bool ForceWholeLaneMasks =
10860 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10861
10862 // Attempt to generate the binary blend mask. If an input is zero then
10863 // we can use any lane.
10864 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10865 // Keep track of the inputs used per lane.
10866 bool LaneV1InUse = false;
10867 bool LaneV2InUse = false;
10868 uint64_t LaneBlendMask = 0;
10869 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10870 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10871 int M = Mask[Elt];
10872 if (M == SM_SentinelUndef)
10873 continue;
// In-place element from V1 (or a V1 element proven equivalent to it).
10874 if (M == Elt || (0 <= M && M < NumElts &&
10875 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10876 Mask[Elt] = Elt;
10877 LaneV1InUse = true;
10878 continue;
10879 }
// In-place element from V2: set the corresponding blend-immediate bit.
10880 if (M == (Elt + NumElts) ||
10881 (NumElts <= M &&
10882 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10883 LaneBlendMask |= 1ull << LaneElt;
10884 Mask[Elt] = Elt + NumElts;
10885 LaneV2InUse = true;
10886 continue;
10887 }
// Zeroable lane: take it from whichever input is (or can be forced to be)
// an all-zeros vector.
10888 if (Zeroable[Elt]) {
10889 if (V1IsZeroOrUndef) {
10890 ForceV1Zero = true;
10891 Mask[Elt] = Elt;
10892 LaneV1InUse = true;
10893 continue;
10894 }
10895 if (V2IsZeroOrUndef) {
10896 ForceV2Zero = true;
10897 LaneBlendMask |= 1ull << LaneElt;
10898 Mask[Elt] = Elt + NumElts;
10899 LaneV2InUse = true;
10900 continue;
10901 }
10902 }
// The element moves between positions - not expressible as a blend.
10903 return false;
10904 }
10905
10906 // If we only used V2 then splat the lane blend mask to avoid any demanded
10907 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10908 // blend mask bit).
10909 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10910 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10911
10912 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10913 }
10914 return true;
10915}
10916
10917/// Try to emit a blend instruction for a shuffle.
10918///
10919/// This doesn't do any checks for the availability of instructions for blending
10920/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10921/// be matched in the backend with the type given. What it does check for is
10922/// that the shuffle mask is a blend, or convertible into a blend with zero.
10924 SDValue V2, ArrayRef<int> Original,
10925 const APInt &Zeroable,
10926 const X86Subtarget &Subtarget,
10927 SelectionDAG &DAG) {
10928 uint64_t BlendMask = 0;
10929 bool ForceV1Zero = false, ForceV2Zero = false;
// Work on a copy: matchShuffleAsBlend normalizes the mask in place.
10930 SmallVector<int, 64> Mask(Original);
10931 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10932 BlendMask))
10933 return SDValue();
10934
10935 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10936 if (ForceV1Zero)
10937 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10938 if (ForceV2Zero)
10939 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10940
10941 unsigned NumElts = VT.getVectorNumElements();
10942
// Per-type emission strategy. The wider AVX cases deliberately fall through
// to the shared 128-bit BLENDI emission after asserting feature support.
10943 switch (VT.SimpleTy) {
10944 case MVT::v4i64:
10945 case MVT::v8i32:
10946 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10947 [[fallthrough]];
10948 case MVT::v4f64:
10949 case MVT::v8f32:
10950 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10951 [[fallthrough]];
10952 case MVT::v2f64:
10953 case MVT::v2i64:
10954 case MVT::v4f32:
10955 case MVT::v4i32:
10956 case MVT::v8i16:
10957 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10958 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10959 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10960 case MVT::v16i16: {
10961 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10962 SmallVector<int, 8> RepeatedMask;
10963 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10964 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10965 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
// Rebuild the 8-bit immediate from the per-lane repeated mask.
10966 BlendMask = 0;
10967 for (int i = 0; i < 8; ++i)
10968 if (RepeatedMask[i] >= 8)
10969 BlendMask |= 1ull << i;
10970 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10971 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10972 }
10973 // Use PBLENDW for lower/upper lanes and then blend lanes.
10974 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10975 // merge to VSELECT where useful.
10976 uint64_t LoMask = BlendMask & 0xFF;
10977 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
// Only worthwhile when one half is trivial (all-V1 or all-V2), so at most
// one real PBLENDW is needed before the cross-lane fixup shuffle.
10978 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10979 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10980 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10981 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10982 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10983 return DAG.getVectorShuffle(
10984 MVT::v16i16, DL, Lo, Hi,
10985 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10986 }
10987 [[fallthrough]];
10988 }
10989 case MVT::v32i8:
10990 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10991 [[fallthrough]];
10992 case MVT::v16i8: {
10993 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10994
10995 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10996 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10997 Subtarget, DAG))
10998 return Masked;
10999
// With AVX512BW+VL, emit a k-register masked move instead of VPBLENDVB.
11000 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11001 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11002 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11003 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11004 }
11005
11006 // If we have VPTERNLOG, we can use that as a bit blend.
11007 if (Subtarget.hasVLX())
11008 if (SDValue BitBlend =
11009 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11010 return BitBlend;
11011
11012 // Scale the blend by the number of bytes per element.
11013 int Scale = VT.getScalarSizeInBits() / 8;
11014
11015 // This form of blend is always done on bytes. Compute the byte vector
11016 // type.
11017 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11018
11019 // x86 allows load folding with blendvb from the 2nd source operand. But
11020 // we are still using LLVM select here (see comment below), so that's V1.
11021 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11022 // allow that load-folding possibility.
// NOTE(review): a statement line is elided here - expected to be the mask
// commute that keeps the swapped operands consistent; check the full file.
11023 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11025 std::swap(V1, V2);
11026 }
11027
11028 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11029 // mix of LLVM's code generator and the x86 backend. We tell the code
11030 // generator that boolean values in the elements of an x86 vector register
11031 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11032 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11033 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11034 // of the element (the remaining are ignored) and 0 in that high bit would
11035 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11036 // the LLVM model for boolean values in vector elements gets the relevant
11037 // bit set, it is set backwards and over constrained relative to x86's
11038 // actual model.
11039 SmallVector<SDValue, 32> VSELECTMask;
11040 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11041 for (int j = 0; j < Scale; ++j)
11042 VSELECTMask.push_back(
11043 Mask[i] < 0
11044 ? DAG.getUNDEF(MVT::i8)
11045 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11046
11047 V1 = DAG.getBitcast(BlendVT, V1);
11048 V2 = DAG.getBitcast(BlendVT, V2);
11049 return DAG.getBitcast(
11050 VT,
11051 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11052 V1, V2));
11053 }
11054 case MVT::v16f32:
11055 case MVT::v8f64:
11056 case MVT::v8i64:
11057 case MVT::v16i32:
11058 case MVT::v32i16:
11059 case MVT::v64i8: {
11060 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11061 bool OptForSize = DAG.shouldOptForSize();
11062 if (!OptForSize) {
11063 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11064 Subtarget, DAG))
11065 return Masked;
11066 }
11067
11068 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11069 // masked move.
11070 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11071 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11072 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11073 }
11074 default:
11075 llvm_unreachable("Not a supported integer vector type!");
11076 }
11077}
11078
11079/// Try to lower as a blend of elements from two inputs followed by
11080/// a single-input permutation.
11081///
11082/// This matches the pattern where we can blend elements from two inputs and
11083/// then reduce the shuffle to a single-input permutation.
// If ImmBlends is set, only blends whose 8-bit-element mask can widen to i16
// granularity are formed (i.e. expressible as an immediate blend).
11085 SDValue V1, SDValue V2,
11086 ArrayRef<int> Mask,
11087 SelectionDAG &DAG,
11088 bool ImmBlends = false) {
11089 // We build up the blend mask while checking whether a blend is a viable way
11090 // to reduce the shuffle.
11091 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11092 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11093
11094 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11095 if (Mask[i] < 0)
11096 continue;
11097
11098 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11099
// Blend slot (Mask[i] % Size) must supply element Mask[i]; two different
// demands on the same slot mean a single blend can't feed the permute.
11100 if (BlendMask[Mask[i] % Size] < 0)
11101 BlendMask[Mask[i] % Size] = Mask[i];
11102 else if (BlendMask[Mask[i] % Size] != Mask[i])
11103 return SDValue(); // Can't blend in the needed input!
11104
11105 PermuteMask[i] = Mask[i] % Size;
11106 }
11107
11108 // If only immediate blends, then bail if the blend mask can't be widened to
11109 // i16.
11110 unsigned EltSize = VT.getScalarSizeInBits();
11111 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11112 return SDValue();
11113
11114 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11115 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11116}
11117
11118/// Try to lower as an unpack of elements from two inputs followed by
11119/// a single-input permutation.
11120///
11121/// This matches the pattern where we can unpack elements from two inputs and
11122/// then reduce the shuffle to a single-input (wider) permutation.
11124 SDValue V1, SDValue V2,
11125 ArrayRef<int> Mask,
11126 SelectionDAG &DAG) {
11127 int NumElts = Mask.size();
11128 int NumLanes = VT.getSizeInBits() / 128;
11129 int NumLaneElts = NumElts / NumLanes;
11130 int NumHalfLaneElts = NumLaneElts / 2;
11131
11132 bool MatchLo = true, MatchHi = true;
11133 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11134
11135 // Determine UNPCKL/UNPCKH type and operand order.
11136 for (int Elt = 0; Elt != NumElts; ++Elt) {
11137 int M = Mask[Elt];
11138 if (M < 0)
11139 continue;
11140
11141 // Normalize the mask value depending on whether it's V1 or V2.
11142 int NormM = M;
// UNPCK interleaves its operands, so even result positions must come from
// Ops[0] and odd positions from Ops[1].
11143 SDValue &Op = Ops[Elt & 1];
11144 if (M < NumElts && (Op.isUndef() || Op == V1))
11145 Op = V1;
11146 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11147 Op = V2;
11148 NormM -= NumElts;
11149 } else
11150 return SDValue();
11151
// Every element must come from the low (UNPCKL) or high (UNPCKH) half of
// some 128-bit lane of its source - consistently across the whole mask.
11152 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11153 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11154 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11155 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11156 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11157 if (MatchLoAnyLane || MatchHiAnyLane) {
11158 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11159 "Failed to match UNPCKLO/UNPCKHI");
11160 break;
11161 }
11162 }
11163 MatchLo &= MatchLoAnyLane;
11164 MatchHi &= MatchHiAnyLane;
11165 if (!MatchLo && !MatchHi)
11166 return SDValue();
11167 }
11168 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11169
11170 // Element indices have changed after unpacking. Calculate permute mask
11171 // so that they will be put back to the position as dictated by the
11172 // original shuffle mask indices.
11173 SmallVector<int, 32> PermuteMask(NumElts, -1);
11174 for (int Elt = 0; Elt != NumElts; ++Elt) {
11175 int M = Mask[Elt];
11176 if (M < 0)
11177 continue;
11178 int NormM = M;
11179 if (NumElts <= M)
11180 NormM -= NumElts;
11181 bool IsFirstOp = M < NumElts;
// An element at NormM lands at 2*(NormM % HalfLane) within its lane after
// the interleave; +1 if its source is the second unpack operand.
11182 int BaseMaskElt =
11183 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11184 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11185 PermuteMask[Elt] = BaseMaskElt;
11186 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11187 PermuteMask[Elt] = BaseMaskElt + 1;
11188 assert(PermuteMask[Elt] != -1 &&
11189 "Input mask element is defined but failed to assign permute mask");
11190 }
11191
11192 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11193 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11194 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11195}
11196
11197/// Try to lower a shuffle as a permute of the inputs followed by an
11198/// UNPCK instruction.
11199///
11200/// This specifically targets cases where we end up with alternating between
11201/// the two inputs, and so can permute them into something that feeds a single
11202/// UNPCK instruction. Note that this routine only targets integer vectors
11203/// because for floating point vectors we have a generalized SHUFPS lowering
11204/// strategy that handles everything that doesn't *exactly* match an unpack,
11205/// making this clever lowering unnecessary.
11207 SDValue V1, SDValue V2,
11208 ArrayRef<int> Mask,
11209 const X86Subtarget &Subtarget,
11210 SelectionDAG &DAG) {
11211 int Size = Mask.size();
11212 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11213
11214 // This routine only supports 128-bit integer dual input vectors.
11215 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11216 return SDValue();
11217
// Count how many referenced elements come from each half of the sources,
// which decides between UNPCKL and UNPCKH below.
11218 int NumLoInputs =
11219 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11220 int NumHiInputs =
11221 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11222
11223 bool UnpackLo = NumLoInputs >= NumHiInputs;
11224
// Attempt an unpack at the given element granularity by computing the
// pre-shuffle each input needs so the interleave reproduces Mask.
11225 auto TryUnpack = [&](int ScalarSize, int Scale) {
11226 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11227 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11228
11229 for (int i = 0; i < Size; ++i) {
11230 if (Mask[i] < 0)
11231 continue;
11232
11233 // Each element of the unpack contains Scale elements from this mask.
11234 int UnpackIdx = i / Scale;
11235
11236 // We only handle the case where V1 feeds the first slots of the unpack.
11237 // We rely on canonicalization to ensure this is the case.
11238 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11239 return SDValue();
11240
11241 // Setup the mask for this input. The indexing is tricky as we have to
11242 // handle the unpack stride.
11243 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11244 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11245 Mask[i] % Size;
11246 }
11247
11248 // If we will have to shuffle both inputs to use the unpack, check whether
11249 // we can just unpack first and shuffle the result. If so, skip this unpack.
11250 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11251 !isNoopShuffleMask(V2Mask))
11252 return SDValue();
11253
11254 // Shuffle the inputs into place.
11255 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11256 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11257
11258 // Cast the inputs to the type we will use to unpack them.
11259 MVT UnpackVT =
11260 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11261 V1 = DAG.getBitcast(UnpackVT, V1);
11262 V2 = DAG.getBitcast(UnpackVT, V2);
11263
11264 // Unpack the inputs and cast the result back to the desired type.
11265 return DAG.getBitcast(
11266 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11267 UnpackVT, V1, V2));
11268 };
11269
11270 // We try each unpack from the largest to the smallest to try and find one
11271 // that fits this mask.
11272 int OrigScalarSize = VT.getScalarSizeInBits();
11273 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11274 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11275 return Unpack;
11276
11277 // If we're shuffling with a zero vector then we're better off not doing
11278 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
// NOTE(review): the first operand of this condition is on a line elided
// from this view - by symmetry it should be the all-zeros check on V1.
11280 ISD::isBuildVectorAllZeros(V2.getNode()))
11281 return SDValue();
11282
11283 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11284 // initial unpack.
11285 if (NumLoInputs == 0 || NumHiInputs == 0) {
11286 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11287 "We have to have *some* inputs!");
11288 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11289
11290 // FIXME: We could consider the total complexity of the permute of each
11291 // possible unpacking. Or at the least we should consider how many
11292 // half-crossings are created.
11293 // FIXME: We could consider commuting the unpacks.
11294
11295 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11296 for (int i = 0; i < Size; ++i) {
11297 if (Mask[i] < 0)
11298 continue;
11299
11300 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11301
// After interleaving V1/V2, source position p lands at 2*(p - HalfOffset),
// plus one if it came from V2; build the fix-up permute accordingly.
11302 PermMask[i] =
11303 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11304 }
11305 return DAG.getVectorShuffle(
11306 VT, DL,
11307 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11308 V1, V2),
11309 DAG.getUNDEF(VT), PermMask);
11310 }
11311
11312 return SDValue();
11313}
11314
11315/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11316/// permuting the elements of the result in place.
11318 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11319 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
// PALIGNR needs SSSE3; the 256/512-bit widenings need AVX2/BWI respectively.
11320 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11321 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11322 (VT.is512BitVector() && !Subtarget.hasBWI()))
11323 return SDValue();
11324
11325 // We don't currently support lane crossing permutes.
11326 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11327 return SDValue();
11328
11329 int Scale = VT.getScalarSizeInBits() / 8;
11330 int NumLanes = VT.getSizeInBits() / 128;
11331 int NumElts = VT.getVectorNumElements();
11332 int NumEltsPerLane = NumElts / NumLanes;
11333
11334 // Determine range of mask elts.
// Blend1/Blend2 track whether each input's elements are all already in
// place (a pure blend) - wider types bail on that case below.
11335 bool Blend1 = true;
11336 bool Blend2 = true;
11337 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11338 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11339 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11340 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11341 int M = Mask[Lane + Elt];
11342 if (M < 0)
11343 continue;
11344 if (M < NumElts) {
11345 Blend1 &= (M == (Lane + Elt));
11346 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11347 M = M % NumEltsPerLane;
11348 Range1.first = std::min(Range1.first, M);
11349 Range1.second = std::max(Range1.second, M);
11350 } else {
11351 M -= NumElts;
11352 Blend2 &= (M == (Lane + Elt));
11353 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11354 M = M % NumEltsPerLane;
11355 Range2.first = std::min(Range2.first, M);
11356 Range2.second = std::max(Range2.second, M);
11357 }
11358 }
11359 }
11360
11361 // Bail if we don't need both elements.
11362 // TODO - it might be worth doing this for unary shuffles if the permute
11363 // can be widened.
11364 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11365 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11366 return SDValue();
11367
11368 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11369 return SDValue();
11370
11371 // Rotate the 2 ops so we can access both ranges, then permute the result.
// Ofs marks which input occupies the low portion of the rotation so the
// fix-up permute can rebase indices from either source.
11372 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11373 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11374 SDValue Rotate = DAG.getBitcast(
11375 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11376 DAG.getBitcast(ByteVT, Lo),
11377 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11378 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11379 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11380 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11381 int M = Mask[Lane + Elt];
11382 if (M < 0)
11383 continue;
11384 if (M < NumElts)
11385 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11386 else
11387 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11388 }
11389 }
11390 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11391 };
11392
11393 // Check if the ranges are small enough to rotate from either direction.
11394 if (Range2.second < Range1.first)
11395 return RotateAndPermute(V1, V2, Range1.first, 0);
11396 if (Range1.second < Range2.first)
11397 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11398 return SDValue();
11399}
11400
  // A broadcast mask references only element 0; undef elements are allowed.
  return isUndefOrEqual(Mask, 0);
}
11404
  // Either an identity mask or a splat of element 0.
  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
}
11408
/// Check if the Mask consists of the same element repeated multiple times.
  size_t NumUndefs = 0;
  std::optional<int> UniqueElt;
  for (int Elt : Mask) {
    if (Elt == SM_SentinelUndef) {
      NumUndefs++;
      continue;
    }
    // Two distinct defined indices means it isn't a single-element repeat.
    if (UniqueElt.has_value() && UniqueElt.value() != Elt)
      return false;
    UniqueElt = Elt;
  }
  // Make sure the element is repeated enough times by checking the number of
  // undefs is small.
  return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
}
11426
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumEltsPerLane = NumElts / NumLanes;

  // Shuffle the input elements into the desired positions in V1 and V2 and
  // unpack/blend them together.
  // V1Mask/V2Mask are the per-input pre-shuffles and FinalMask is the blend
  // of the two pre-shuffled results. IsAlternating records whether V1 only
  // feeds even result elements and V2 only odd ones.
  bool IsAlternating = true;
  bool V1Zero = true, V2Zero = true;
  SmallVector<int, 32> V1Mask(NumElts, -1);
  SmallVector<int, 32> V2Mask(NumElts, -1);
  SmallVector<int, 32> FinalMask(NumElts, -1);
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;
      FinalMask[i] = i;
      V1Zero &= Zeroable[i];
      IsAlternating &= (i & 1) == 0;
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts;
      FinalMask[i] = i + NumElts;
      V2Zero &= Zeroable[i];
      IsAlternating &= (i & 1) == 1;
    }
  }

  // If we effectively only demand the 0'th element of \p Input, and not only
  // as 0'th element, then broadcast said input,
  // and change \p InputMask to be a no-op (identity) mask.
  auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
                                         &DAG](SDValue &Input,
                                               MutableArrayRef<int> InputMask) {
    unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
    // Pre-AVX2, only broadcast 32-bit+ elements that can fold a load.
    if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
                                 !X86::mayFoldLoad(Input, Subtarget)))
      return;
    if (isNoopShuffleMask(InputMask))
      return;
    assert(isBroadcastShuffleMask(InputMask) &&
           "Expected to demand only the 0'th element.");
    Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
    // The broadcast now supplies the demanded value in every element, so
    // rewrite the mask to identity.
    for (auto I : enumerate(InputMask)) {
      int &InputMaskElt = I.value();
      if (InputMaskElt >= 0)
        InputMaskElt = I.index();
    }
  };

  // Currently, we may need to produce one shuffle per input, and blend results.
  // It is possible that the shuffle for one of the inputs is already a no-op.
  // See if we can simplify non-no-op shuffles into broadcasts,
  // which we consider to be strictly better than an arbitrary shuffle.
  if (isNoopOrBroadcastShuffleMask(V1Mask) &&
    canonicalizeBroadcastableInput(V1, V1Mask);
    canonicalizeBroadcastableInput(V2, V2Mask);
  }

  // Try to lower with the simpler initial blend/unpack/rotate strategies unless
  // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
  // the shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
  // pre-shuffle first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
    // Only prefer immediate blends to unpack/rotate.
    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
                                                          DAG, true))
      return BlendPerm;
    // If either input vector provides only a single element which is repeated
    // multiple times, unpacking from both input vectors would generate worse
    // code. e.g. for
    // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
    // it is better to process t4 first to create a vector of t4[0], then unpack
    // that vector with t2.
    if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
      if (SDValue UnpackPerm =
              lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
        return UnpackPerm;
            DL, VT, V1, V2, Mask, Subtarget, DAG))
        return RotatePerm;
    // Unpack/rotate failed - try again with variable blends.
    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
                                                          DAG))
      return BlendPerm;
    if (VT.getScalarSizeInBits() >= 32)
      if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
              DL, VT, V1, V2, Mask, Subtarget, DAG))
        return PermUnpack;
  }

  // If the final mask is an alternating blend of vXi8/vXi16, convert to an
  // UNPCKL(SHUFFLE, SHUFFLE) pattern.
  // TODO: It doesn't have to be alternating - but each lane mustn't have more
  // than half the elements coming from each source.
  if (IsAlternating && VT.getScalarSizeInBits() < 32) {
    // Re-derive the pre-shuffles so each input's demanded elements land in
    // the low half of each lane, then interleave them via the final shuffle.
    V1Mask.assign(NumElts, -1);
    V2Mask.assign(NumElts, -1);
    FinalMask.assign(NumElts, -1);
    for (int i = 0; i != NumElts; i += NumEltsPerLane)
      for (int j = 0; j != NumEltsPerLane; ++j) {
        int M = Mask[i + j];
        if (M >= 0 && M < NumElts) {
          V1Mask[i + (j / 2)] = M;
          FinalMask[i + j] = i + (j / 2);
        } else if (M >= NumElts) {
          V2Mask[i + (j / 2)] = M - NumElts;
          FinalMask[i + j] = i + (j / 2) + NumElts;
        }
      }
  }

  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
}
11554
11555static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11556 const X86Subtarget &Subtarget,
11557 ArrayRef<int> Mask) {
11558 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11559 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11560
11561 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11562 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11563 int MaxSubElts = 64 / EltSizeInBits;
11564 unsigned RotateAmt, NumSubElts;
11565 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11566 MaxSubElts, NumSubElts, RotateAmt))
11567 return -1;
11568 unsigned NumElts = Mask.size();
11569 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11570 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11571 return RotateAmt;
11572}
11573
/// Lower shuffle using X86ISD::VROTLI rotations.
                                  ArrayRef<int> Mask,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  // Only XOP + AVX512 targets have bit rotation instructions.
  // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
  bool IsLegal =
      (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
  // NOTE(review): the comment above says SSSE3 but the guard below checks
  // hasSSE3() — confirm whether SSE3-without-SSSE3 targets are intentionally
  // excluded from the shift-pair emulation path.
  if (!IsLegal && Subtarget.hasSSE3())
    return SDValue();

  MVT RotateVT;
  int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
                                          Subtarget, Mask);
  if (RotateAmt < 0)
    return SDValue();

  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
  // expanded to OR(SRL,SHL), will be more efficient, but if they can
  // widen to vXi16 or more then existing lowering should will be better.
  if (!IsLegal) {
    if ((RotateAmt % 16) == 0)
      return SDValue();
    // TODO: Use getTargetVShiftByConstNode.
    // Emulate the rotate via a shift pair: rot(x,n) == (x << n) | (x >> (w-n)).
    unsigned ShlAmt = RotateAmt;
    unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
    V1 = DAG.getBitcast(RotateVT, V1);
    SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
                              DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
    SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
                              DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
    SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
    return DAG.getBitcast(VT, Rot);
  }

  SDValue Rot =
      DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
                  DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, Rot);
}
11615
/// Try to match a vector shuffle as an element rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
/// Returns the rotation amount in elements (or -1 on failure) and rewrites
/// V1/V2 to the inputs providing the low/high parts of the rotation.
                                       ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  // [11, 12, 13, 14, 15, 0, 1, 2]
  // [-1, 12, 13, 14, -1, -1, 1, -1]
  // [-1, -1, -1, -1, -1, -1, 1, 2]
  // [ 3, 4, 5, 6, 7, 8, 9, 10]
  // [-1, 4, 5, 6, -1, -1, 9, -1]
  // [-1, 4, 5, 6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      // The identity rotation isn't interesting, stop.
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  // A unary rotation only saw one side; reuse it for the other side.
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
11687
/// Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such an pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
                                    ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (isAnyZero(Mask))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}
11725
                                        SDValue V2, ArrayRef<int> Mask,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation: emulate the rotation with a byte-shift
  // pair, OR(PSLLDQ(Lo), PSRLDQ(Hi)).
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift =
      DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                  DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift =
      DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                  DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
11772
/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; This routine will
/// try to generically lower a vector shuffle through such an pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
                                   SDValue V2, ArrayRef<int> Mask,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
  if (0 < Rotation)
    return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                       DAG.getTargetConstant(Rotation, DL, MVT::i8));

  // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
  // TODO: Pull this out as a matchShuffleAsElementShift helper?
  // TODO: We can probably make this more aggressive and use shift-pairs like
  // lowerShuffleAsByteShiftMask.
  unsigned NumElts = Mask.size();
  unsigned ZeroLo = Zeroable.countr_one();
  unsigned ZeroHi = Zeroable.countl_one();
  assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
  if (!ZeroLo && !ZeroHi)
    return SDValue();

  if (ZeroLo) {
    // Zeros at the low end: rotate a zero vector in below the source run.
    SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
    int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
    if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
      return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
                         getZeroVector(VT, Subtarget, DAG, DL),
                         DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
  }

  if (ZeroHi) {
    // Zeros at the high end: rotate a zero vector in above the source run.
    SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
    int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
    if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
      return DAG.getNode(X86ISD::VALIGN, DL, VT,
                         getZeroVector(VT, Subtarget, DAG, DL), Src,
                         DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
  }

  return SDValue();
}
11832
/// Try to lower a vector shuffle as a byte shift sequence.
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
  assert(VT.is128BitVector() && "Only 128-bit vectors supported");

  // We need a shuffle that has zeros at one/both ends and a sequential
  // shuffle from one source within.
  unsigned ZeroLo = Zeroable.countr_one();
  unsigned ZeroHi = Zeroable.countl_one();
  if (!ZeroLo && !ZeroHi)
    return SDValue();

  unsigned NumElts = Mask.size();
  unsigned Len = NumElts - (ZeroLo + ZeroHi);
  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
    return SDValue();

  unsigned Scale = VT.getScalarSizeInBits() / 8;
  // The inner run must come entirely from a single source vector.
  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
  if (!isUndefOrInRange(StubMask, 0, NumElts) &&
      !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
    return SDValue();

  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
  Res = DAG.getBitcast(MVT::v16i8, Res);

  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
  // inner sequential set of elements, possibly offset:
  // 01234567 --> zzzzzz01 --> 1zzzzzzz
  // 01234567 --> 4567zzzz --> zzzzz456
  // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
  if (ZeroLo == 0) {
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
  } else if (ZeroHi == 0) {
    unsigned Shift = Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
  } else if (!Subtarget.hasSSSE3()) {
    // If we don't have PSHUFB then its worth avoiding an AND constant mask
    // by performing 3 byte shifts. Shuffle combining can kick in above that.
    // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Shift += Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
  } else
    return SDValue();

  return DAG.getBitcast(VT, Res);
}
11897
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz, 2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [ 1, zz, 3, zz]
/// [ -1, -1, 7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz, 0, 1, 2, 3, 4, 5, 6]
/// [ zz, zz, -1, -1, 2, 3, 4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1, 1]
/// PSRLDQ : (little-endian) right byte shift
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
                               int MaskOffset, const APInt &Zeroable,
                               const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  // Verify the slots the shift would fill (low 'Shift' slots of each
  // Scale-sized group for a left shift, high slots for a right shift) are
  // all zeroable.
  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  // Verify the surviving slots of each group form a sequential run from the
  // selected source (MaskOffset picks V1 or V2); on success set
  // Opcode/ShiftVT and return the shift amount, else -1.
  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    // Byte shifts (PSLLDQ/PSRLDQ) encode the amount in bytes, not bits.
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
11982
                                   SDValue V2, ArrayRef<int> Mask,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG, bool BitwiseOnly) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                     Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                   Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  // The caller may want only element-wise (bit) shifts, not whole-lane byte
  // shifts.
  if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
12019
12020// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12021// Remainder of lower half result is zero and upper half is all undef.
12022static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12023 ArrayRef<int> Mask, uint64_t &BitLen,
12024 uint64_t &BitIdx, const APInt &Zeroable) {
12025 int Size = Mask.size();
12026 int HalfSize = Size / 2;
12027 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12028 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12029
12030 // Upper half must be undefined.
12031 if (!isUndefUpperHalf(Mask))
12032 return false;
12033
12034 // Determine the extraction length from the part of the
12035 // lower half that isn't zeroable.
12036 int Len = HalfSize;
12037 for (; Len > 0; --Len)
12038 if (!Zeroable[Len - 1])
12039 break;
12040 assert(Len > 0 && "Zeroable shuffle mask");
12041
12042 // Attempt to match first Len sequential elements from the lower half.
12043 SDValue Src;
12044 int Idx = -1;
12045 for (int i = 0; i != Len; ++i) {
12046 int M = Mask[i];
12047 if (M == SM_SentinelUndef)
12048 continue;
12049 SDValue &V = (M < Size ? V1 : V2);
12050 M = M % Size;
12051
12052 // The extracted elements must start at a valid index and all mask
12053 // elements must be in the lower half.
12054 if (i > M || M >= HalfSize)
12055 return false;
12056
12057 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12058 Src = V;
12059 Idx = M - i;
12060 continue;
12061 }
12062 return false;
12063 }
12064
12065 if (!Src || Idx < 0)
12066 return false;
12067
12068 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12069 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12070 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12071 V1 = Src;
12072 return true;
12073}
12074
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                  ArrayRef<int> Mask, uint64_t &BitLen,
                                  uint64_t &BitIdx) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Upper half must be undefined.
  if (!isUndefUpperHalf(Mask))
    return false;

  // Try every possible insertion point in the lower half.
  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    SDValue Base;

    // Attempt to match first source from mask before insertion point.
    if (isUndefInRange(Mask, 0, Idx)) {
      /* EMPTY */
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
      Base = V1;
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
      Base = V2;
    } else {
      continue;
    }

    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      SDValue Insert;
      int Len = Hi - Idx;

      // Match insertion.
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
        Insert = V1;
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
        Insert = V2;
      } else {
        continue;
      }

      // Match the remaining elements of the lower half.
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
        /* EMPTY */
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
        Base = V1;
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                            Size + Hi)) {
        Base = V2;
      } else {
        continue;
      }

      // Found a match: encode the 6-bit length/index immediates (in bits).
      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
      V1 = Base;
      V2 = Insert;
      return true;
    }
  }

  return false;
}
12142
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable, SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;
  // EXTRQI: extract a bit-field from the lower half of the (matched) V1.
  if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
                       DAG.getTargetConstant(BitLen, DL, MVT::i8),
                       DAG.getTargetConstant(BitIdx, DL, MVT::i8));

  // INSERTQI: insert the low BitLen bits of V2 into V1 at offset BitIdx.
  if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
                       V2 ? V2 : DAG.getUNDEF(VT),
                       DAG.getTargetConstant(BitLen, DL, MVT::i8),
                       DAG.getTargetConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}
12161
12162 /// Lower a vector shuffle as a zero or any extension.
12163 ///
12164 /// Given a specific number of elements, element bit width, and extension
12165 /// stride, produce either a zero or any extension based on the available
12166 /// features of the subtarget. The extended elements are consecutive and
12167 /// begin and can start from an offsetted element index in the input; to
12168 /// avoid excess shuffling the offset must either being in the bottom lane
12169 /// or at the start of a higher lane. All extended elements must be from
12170 /// the same lane.
// NOTE(review): original line 12171 (the "static SDValue
// lowerShuffleAsSpecificZeroOrAnyExtend(" declaration line — the name is
// grounded by the call site at line 12406 below) was dropped by the listing
// extraction; the parameter list resumes mid-signature here.
12172 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
12173 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12174 assert(Scale > 1 && "Need a scale to extend.");
12175 int EltBits = VT.getScalarSizeInBits();
12176 int NumElements = VT.getVectorNumElements();
12177 int NumEltsPerLane = 128 / EltBits;
12178 int OffsetLane = Offset / NumEltsPerLane;
12179 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12180 "Only 8, 16, and 32 bit elements can be extended.");
12181 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12182 assert(0 <= Offset && "Extension offset must be positive.");
12183 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12184 "Extension offset must be in the first lane or start an upper lane.");
12185
12186 // Check that an index is in same lane as the base offset.
12187 auto SafeOffset = [&](int Idx) {
12188 return OffsetLane == (Idx / NumEltsPerLane);
12189 };
12190
12191 // Shift along an input so that the offset base moves to the first element.
// Elements whose source index would cross out of the offset's 128-bit lane
// are marked undef (-1) rather than pulled across lanes.
12192 auto ShuffleOffset = [&](SDValue V) {
12193 if (!Offset)
12194 return V;
12195
12196 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12197 for (int i = 0; i * Scale < NumElements; ++i) {
12198 int SrcIdx = i + Offset;
12199 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12200 }
12201 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12202 };
12203
12204 // Found a valid a/zext mask! Try various lowering strategies based on the
12205 // input type and available ISA extensions.
12206 if (Subtarget.hasSSE41()) {
12207 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12208 // PUNPCK will catch this in a later shuffle match.
12209 if (Offset && Scale == 2 && VT.is128BitVector())
12210 return SDValue();
12211 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12212 NumElements / Scale);
12213 InputV = DAG.getBitcast(VT, InputV);
12214 InputV = ShuffleOffset(InputV);
// NOTE(review): original line 12215 (the statement assigning the extended
// value to InputV, whose argument list continues on the next line) was
// dropped by the listing extraction.
12216 DL, ExtVT, InputV, DAG);
12217 return DAG.getBitcast(VT, InputV);
12218 }
12219
12220 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12221 InputV = DAG.getBitcast(VT, InputV);
12222
12223 // For any extends we can cheat for larger element sizes and use shuffle
12224 // instructions that can fold with a load and/or copy.
12225 if (AnyExt && EltBits == 32) {
12226 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12227 -1};
12228 return DAG.getBitcast(
12229 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12230 DAG.getBitcast(MVT::v4i32, InputV),
12231 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12232 }
12233 if (AnyExt && EltBits == 16 && Scale > 2) {
12234 int PSHUFDMask[4] = {Offset / 2, -1,
12235 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12236 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12237 DAG.getBitcast(MVT::v4i32, InputV),
12238 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12239 int PSHUFWMask[4] = {1, -1, -1, -1};
// Whether the offset word sits in the low or high half of its dword picks
// PSHUFLW vs PSHUFHW for the second shuffle.
12240 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12241 return DAG.getBitcast(
12242 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12243 DAG.getBitcast(MVT::v8i16, InputV),
12244 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12245 }
12246
12247 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12248 // to 64-bits.
12249 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12250 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12251 assert(VT.is128BitVector() && "Unexpected vector width!");
12252
12253 int LoIdx = Offset * EltBits;
12254 SDValue Lo = DAG.getBitcast(
12255 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12256 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12257 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12258
// If the upper half of the result is undef (or the next element would leave
// the lane), a single EXTRQI suffices.
12259 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12260 return DAG.getBitcast(VT, Lo);
12261
12262 int HiIdx = (Offset + 1) * EltBits;
12263 SDValue Hi = DAG.getBitcast(
12264 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12265 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12266 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12267 return DAG.getBitcast(VT,
12268 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12269 }
12270
12271 // If this would require more than 2 unpack instructions to expand, use
12272 // pshufb when available. We can only use more than 2 unpack instructions
12273 // when zero extending i8 elements which also makes it easier to use pshufb.
12274 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12275 assert(NumElements == 16 && "Unexpected byte vector width!");
12276 SDValue PSHUFBMask[16];
12277 for (int i = 0; i < 16; ++i) {
12278 int Idx = Offset + (i / Scale);
// 0x80 in a PSHUFB mask byte zeroes the destination byte; for any-extend we
// can leave the filler bytes undef instead.
12279 if ((i % Scale == 0 && SafeOffset(Idx))) {
12280 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12281 continue;
12282 }
12283 PSHUFBMask[i] =
12284 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12285 }
12286 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12287 return DAG.getBitcast(
12288 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12289 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12290 }
12291
12292 // If we are extending from an offset, ensure we start on a boundary that
12293 // we can unpack from.
12294 int AlignToUnpack = Offset % (NumElements / Scale);
12295 if (AlignToUnpack) {
12296 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12297 for (int i = AlignToUnpack; i < NumElements; ++i)
12298 ShMask[i - AlignToUnpack] = i;
12299 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12300 Offset -= AlignToUnpack;
12301 }
12302
12303 // Otherwise emit a sequence of unpacks.
// Each iteration doubles the element size (halving Scale) by interleaving
// with zeros (zext) or undef (anyext), until the target width is reached.
12304 do {
12305 unsigned UnpackLoHi = X86ISD::UNPCKL;
12306 if (Offset >= (NumElements / 2)) {
12307 UnpackLoHi = X86ISD::UNPCKH;
12308 Offset -= (NumElements / 2);
12309 }
12310
12311 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12312 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12313 : getZeroVector(InputVT, Subtarget, DAG, DL);
12314 InputV = DAG.getBitcast(InputVT, InputV);
12315 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12316 Scale /= 2;
12317 EltBits *= 2;
12318 NumElements /= 2;
12319 } while (Scale > 1);
12320 return DAG.getBitcast(VT, InputV);
12321}
12322
12323 /// Try to lower a vector shuffle as a zero extension on any microarch.
12324 ///
12325 /// This routine will try to do everything in its power to cleverly lower
12326 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
12327 /// check for the profitability of this lowering, it tries to aggressively
12328 /// match this pattern. It will use all of the micro-architectural details it
12329 /// can to emit an efficient lowering. It handles both blends with all-zero
12330 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12331 /// masking out later).
12332 ///
12333 /// The reason we have dedicated lowering for zext-style shuffles is that they
12334 /// are both incredibly common and often quite performance sensitive.
// NOTE(review): original line 12335 (the declaration line with the function
// name — presumably lowerShuffleAsZeroOrAnyExtend; confirm against the full
// source) was dropped by the listing extraction.
12336 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12337 const APInt &Zeroable, const X86Subtarget &Subtarget,
12338 SelectionDAG &DAG) {
12339 int Bits = VT.getSizeInBits();
12340 int NumLanes = Bits / 128;
12341 int NumElements = VT.getVectorNumElements();
12342 int NumEltsPerLane = NumElements / NumLanes;
12343 assert(VT.getScalarSizeInBits() <= 32 &&
12344 "Exceeds 32-bit integer zero extension limit");
12345 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12346
12347 // Define a helper function to check a particular ext-scale and lower to it if
12348 // valid.
12349 auto Lower = [&](int Scale) -> SDValue {
12350 SDValue InputV;
12351 bool AnyExt = true;
12352 int Offset = 0;
12353 int Matches = 0;
12354 for (int i = 0; i < NumElements; ++i) {
12355 int M = Mask[i];
12356 if (M < 0)
12357 continue; // Valid anywhere but doesn't tell us anything.
12358 if (i % Scale != 0) {
12359 // Each of the extended elements need to be zeroable.
12360 if (!Zeroable[i])
12361 return SDValue();
12362
12363 // We no longer are in the anyext case.
12364 AnyExt = false;
12365 continue;
12366 }
12367
12368 // Each of the base elements needs to be consecutive indices into the
12369 // same input vector.
12370 SDValue V = M < NumElements ? V1 : V2;
12371 M = M % NumElements;
12372 if (!InputV) {
12373 InputV = V;
12374 Offset = M - (i / Scale);
12375 } else if (InputV != V)
12376 return SDValue(); // Flip-flopping inputs.
12377
12378 // Offset must start in the lowest 128-bit lane or at the start of an
12379 // upper lane.
12380 // FIXME: Is it ever worth allowing a negative base offset?
12381 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12382 (Offset % NumEltsPerLane) == 0))
12383 return SDValue();
12384
12385 // If we are offsetting, all referenced entries must come from the same
12386 // lane.
12387 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12388 return SDValue();
12389
12390 if ((M % NumElements) != (Offset + (i / Scale)))
12391 return SDValue(); // Non-consecutive strided elements.
12392 Matches++;
12393 }
12394
12395 // If we fail to find an input, we have a zero-shuffle which should always
12396 // have already been handled.
12397 // FIXME: Maybe handle this here in case during blending we end up with one?
12398 if (!InputV)
12399 return SDValue();
12400
12401 // If we are offsetting, don't extend if we only match a single input, we
12402 // can always do better by using a basic PSHUF or PUNPCK.
12403 if (Offset != 0 && Matches < 2)
12404 return SDValue();
12405
12406 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12407 InputV, Mask, Subtarget, DAG);
12408 };
12409
12410 // The widest scale possible for extending is to a 64-bit integer.
12411 assert(Bits % 64 == 0 &&
12412 "The number of bits in a vector must be divisible by 64 on x86!");
12413 int NumExtElements = Bits / 64;
12414
12415 // Each iteration, try extending the elements half as much, but into twice as
12416 // many elements.
12417 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12418 assert(NumElements % NumExtElements == 0 &&
12419 "The input vector size must be divisible by the extended size.");
12420 if (SDValue V = Lower(NumElements / NumExtElements))
12421 return V;
12422 }
12423
12424 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12425 if (Bits != 128)
12426 return SDValue();
12427
12428 // Returns one of the source operands if the shuffle can be reduced to a
12429 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12430 auto CanZExtLowHalf = [&]() {
12431 for (int i = NumElements / 2; i != NumElements; ++i)
12432 if (!Zeroable[i])
12433 return SDValue();
12434 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12435 return V1;
12436 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12437 return V2;
12438 return SDValue();
12439 };
12440
12441 if (SDValue V = CanZExtLowHalf()) {
12442 V = DAG.getBitcast(MVT::v2i64, V);
12443 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12444 return DAG.getBitcast(VT, V);
12445 }
12446
12447 // No viable ext lowering found.
12448 return SDValue();
12449}
12450
12451 /// Try to get a scalar value for a specific element of a vector.
12452 ///
12453 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
// NOTE(review): original line 12454 (the "static SDValue
// getScalarValueForVectorElement(SDValue V, int Idx," declaration line — the
// name is grounded by the call site at line 12534 below) was dropped by the
// listing extraction.
12455 SelectionDAG &DAG) {
12456 MVT VT = V.getSimpleValueType();
12457 MVT EltVT = VT.getVectorElementType();
12458 V = peekThroughBitcasts(V);
12459
12460 // If the bitcasts shift the element size, we can't extract an equivalent
12461 // element from it.
12462 MVT NewVT = V.getSimpleValueType();
12463 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12464 return SDValue();
12465
// SCALAR_TO_VECTOR only defines element 0, so it can only satisfy Idx == 0;
// BUILD_VECTOR carries one operand per element and can satisfy any index.
12466 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12467 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12468 // Ensure the scalar operand is the same size as the destination.
12469 // FIXME: Add support for scalar truncation where possible.
12470 SDValue S = V.getOperand(Idx);
12471 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12472 return DAG.getBitcast(EltVT, S);
12473 }
12474
12475 return SDValue();
12476}
12477
12478 /// Helper to test for a load that can be folded with x86 shuffles.
12479 ///
12480 /// This is particularly important because the set of instructions varies
12481 /// significantly based on whether the operand is a load or not.
// NOTE(review): original lines 12482 (the "static bool
// isShuffleFoldableLoad(SDValue V) {" declaration line — the name is grounded
// by the call site at line 12882 below) and 12484 (the second operand of the
// && expression, presumably the actual load test) were dropped by the listing
// extraction; only the one-use half of the conjunction is visible here.
12483 return V.hasOneUse() &&
12485}
12486
12487template<typename T>
12488static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12489 T EltVT = VT.getScalarType();
12490 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12491 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12492}
12493
12494 /// Try to lower insertion of a single element into a zero vector.
12495 ///
12496 /// This is a common pattern that we have especially efficient patterns to lower
12497 /// across all subtarget feature sets.
// NOTE(review): original line 12498 (the declaration line with the function
// name — presumably lowerShuffleAsElementInsertion; confirm against the full
// source) was dropped by the listing extraction.
12499 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12500 const APInt &Zeroable, const X86Subtarget &Subtarget,
12501 SelectionDAG &DAG) {
12502 MVT ExtVT = VT;
12503 MVT EltVT = VT.getVectorElementType();
12504 unsigned NumElts = VT.getVectorNumElements();
12505 unsigned EltBits = VT.getScalarSizeInBits();
12506
// Bail on f16/bf16 element types the subtarget cannot handle natively.
12507 if (isSoftF16(EltVT, Subtarget))
12508 return SDValue();
12509
// V2Index: position of the (single) mask element that selects from V2.
12510 int V2Index =
12511 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12512 Mask.begin();
12513 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
// IsV1Zeroable: every lane except the inserted one is known zero/undef.
12514 bool IsV1Zeroable = true;
12515 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12516 if (i != V2Index && !Zeroable[i]) {
12517 IsV1Zeroable = false;
12518 break;
12519 }
12520
12521 // Bail if a non-zero V1 isn't used in place.
12522 if (!IsV1Zeroable) {
12523 SmallVector<int, 8> V1Mask(Mask);
12524 V1Mask[V2Index] = -1;
12525 if (!isNoopShuffleMask(V1Mask))
12526 return SDValue();
12527 }
12528
12529 // Check for a single input from a SCALAR_TO_VECTOR node.
12530 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12531 // all the smarts here sunk into that routine. However, the current
12532 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12533 // vector shuffle lowering is dead.
12534 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12535 DAG);
12536 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12537 // We need to zext the scalar if it is smaller than an i32.
12538 V2S = DAG.getBitcast(EltVT, V2S);
12539 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12540 // Using zext to expand a narrow element won't work for non-zero
12541 // insertions. But we can use a masked constant vector if we're
12542 // inserting V2 into the bottom of V1.
12543 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12544 return SDValue();
12545
12546 // Zero-extend directly to i32.
12547 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12548 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12549
12550 // If we're inserting into a constant, mask off the inserted index
12551 // and OR with the zero-extended scalar.
12552 if (!IsV1Zeroable) {
12553 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12554 Bits[V2Index] = APInt::getZero(EltBits);
12555 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12556 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12557 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12558 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12559 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12560 }
12561 }
12562 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12563 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12564 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12565 // Either not inserting from the low element of the input or the input
12566 // element size is too small to use VZEXT_MOVL to clear the high bits.
12567 return SDValue();
12568 }
12569
12570 if (!IsV1Zeroable) {
12571 // If V1 can't be treated as a zero vector we have fewer options to lower
12572 // this. We can't support integer vectors or non-zero targets cheaply.
12573 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12574 if (!VT.isFloatingPoint() || V2Index != 0)
12575 return SDValue();
12576 if (!VT.is128BitVector())
12577 return SDValue();
12578
12579 // Otherwise, use MOVSD, MOVSS or MOVSH.
12580 unsigned MovOpc = 0;
12581 if (EltVT == MVT::f16)
12582 MovOpc = X86ISD::MOVSH;
12583 else if (EltVT == MVT::f32)
12584 MovOpc = X86ISD::MOVSS;
12585 else if (EltVT == MVT::f64)
12586 MovOpc = X86ISD::MOVSD;
12587 else
12588 llvm_unreachable("Unsupported floating point element type to handle!");
12589 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12590 }
12591
12592 // This lowering only works for the low element with floating point vectors.
12593 if (VT.isFloatingPoint() && V2Index != 0)
12594 return SDValue();
12595
// Zero the upper elements with VZEXT_MOVL, then move the scalar into place.
12596 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12597 if (ExtVT != VT)
12598 V2 = DAG.getBitcast(VT, V2);
12599
12600 if (V2Index != 0) {
12601 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12602 // the desired position. Otherwise it is more efficient to do a vector
12603 // shift left. We know that we can do a vector shift left because all
12604 // the inputs are zero.
12605 if (VT.isFloatingPoint() || NumElts <= 4) {
12606 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12607 V2Shuffle[V2Index] = 0;
12608 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12609 } else {
// Byte shift (VSHLDQ/PSLLDQ) the zero-extended element up to its slot.
12610 V2 = DAG.getBitcast(MVT::v16i8, V2);
12611 V2 = DAG.getNode(
12612 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12613 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12614 V2 = DAG.getBitcast(VT, V2);
12615 }
12616 }
12617 return V2;
12618}
12619
12620 /// Try to lower broadcast of a single - truncated - integer element,
12621 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12622 ///
12623 /// This assumes we have AVX2.
// NOTE(review): original line 12624 (the "static SDValue
// lowerShuffleAsTruncBroadcast(" declaration line — the name is grounded by
// the call site at line 12871 below) was dropped by the listing extraction;
// the parameter list resumes here.
12625 int BroadcastIdx,
12626 const X86Subtarget &Subtarget,
12627 SelectionDAG &DAG) {
12628 assert(Subtarget.hasAVX2() &&
12629 "We can only lower integer broadcasts with AVX2!");
12630
12631 MVT EltVT = VT.getVectorElementType();
12632 MVT V0VT = V0.getSimpleValueType();
12633
12634 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12635 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12636
12637 MVT V0EltVT = V0VT.getVectorElementType();
12638 if (!V0EltVT.isInteger())
12639 return SDValue();
12640
12641 const unsigned EltSize = EltVT.getSizeInBits();
12642 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12643
12644 // This is only a truncation if the original element type is larger.
12645 if (V0EltSize <= EltSize)
12646 return SDValue();
12647
12648 assert(((V0EltSize % EltSize) == 0) &&
12649 "Scalar type sizes must all be powers of 2 on x86!");
12650
// Scale: how many narrow elements fit in one wide source element;
// V0BroadcastIdx: the wide element containing the broadcast source bits.
12651 const unsigned V0Opc = V0.getOpcode();
12652 const unsigned Scale = V0EltSize / EltSize;
12653 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12654
// SCALAR_TO_VECTOR only defines wide element 0; BUILD_VECTOR defines all.
12655 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12656 V0Opc != ISD::BUILD_VECTOR)
12657 return SDValue();
12658
12659 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12660
12661 // If we're extracting non-least-significant bits, shift so we can truncate.
12662 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12663 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12664 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12665 if (const int OffsetIdx = BroadcastIdx % Scale)
12666 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12667 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8))
12668
12669 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12670 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12671}
12672
12673 /// Test whether this can be lowered with a single SHUFPS instruction.
12674 ///
12675 /// This is used to disable more specialized lowerings when the shufps lowering
12676 /// will happen to be efficient.
// NOTE(review): original line 12677 (the "static bool
// isSingleSHUFPSMask(ArrayRef<int> Mask) {" declaration line — the name is
// grounded by the call site at line 12762 below) was dropped by the listing
// extraction.
12678 // This routine only handles 128-bit shufps.
12679 assert(Mask.size() == 4 && "Unsupported mask size!");
12680 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12681 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12682 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12683 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12684
12685 // To lower with a single SHUFPS we need to have the low half and high half
12686 // each requiring a single input.
// SHUFPS selects its low two result elements from one operand and its high
// two from the other, so each half's defined mask elements must agree on
// which source (indices 0-3 vs 4-7) they draw from.
12687 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12688 return false;
12689 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12690 return false;
12691
12692 return true;
12693}
12694
12695/// Test whether the specified input (0 or 1) is in-place blended by the
12696/// given mask.
12697///
12698/// This returns true if the elements from a particular input are already in the
12699/// slot required by the given mask and require no permutation.
12700static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12701 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12702 int Size = Mask.size();
12703 for (int i = 0; i < Size; ++i)
12704 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12705 return false;
12706
12707 return true;
12708}
12709
12710 /// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12711 /// the given mask.
12712 ///
// NOTE(review): original line 12713 (the declaration line with the function
// name — presumably "static bool isShuffleMaskInputBroadcastable(int Input,
// ArrayRef<int> Mask," — confirm against the full source) was dropped by the
// listing extraction; the final defaulted parameter is on the next line.
12714 int BroadcastableElement = 0) {
12715 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12716 int Size = Mask.size();
// Every defined mask element drawn from Input must reference the single
// BroadcastableElement; otherwise that input is not a splat under this mask.
12717 for (int i = 0; i < Size; ++i)
12718 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12719 Mask[i] % Size != BroadcastableElement)
12720 return false;
12721 return true;
12722}
12723
12724 /// If we are extracting two 128-bit halves of a vector and shuffling the
12725 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12726 /// multi-shuffle lowering.
// NOTE(review): original line 12727 (the declaration line with the function
// name — presumably lowerShuffleOfExtractsAsVperm; confirm against the full
// source) was dropped by the listing extraction.
12728 SDValue N1, ArrayRef<int> Mask,
12729 SelectionDAG &DAG) {
12730 MVT VT = N0.getSimpleValueType();
12731 assert((VT.is128BitVector() &&
12732 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12733 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12734
12735 // Check that both sources are extracts of the same source vector.
// NOTE(review): original line 12737 (presumably the matching opcode check on
// N1, the second clause of this || chain) was dropped by the listing
// extraction.
12736 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12738 N0.getOperand(0) != N1.getOperand(0) ||
12739 !N0.hasOneUse() || !N1.hasOneUse())
12740 return SDValue();
12741
12742 SDValue WideVec = N0.getOperand(0);
12743 MVT WideVT = WideVec.getSimpleValueType();
12744 if (!WideVT.is256BitVector())
12745 return SDValue();
12746
12747 // Match extracts of each half of the wide source vector. Commute the shuffle
12748 // if the extract of the low half is N1.
12749 unsigned NumElts = VT.getVectorNumElements();
12750 SmallVector<int, 4> NewMask(Mask);
12751 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12752 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
// NOTE(review): original line 12754 (the then-statement of this if —
// presumably the call that commutes NewMask) was dropped by the listing
// extraction.
12753 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12755 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12756 return SDValue();
12757
12758 // Final bailout: if the mask is simple, we are better off using an extract
12759 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12760 // because that avoids a constant load from memory.
12761 if (NumElts == 4 &&
12762 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12763 return SDValue();
12764
12765 // Extend the shuffle mask with undef elements.
12766 NewMask.append(NumElts, -1);
12767
12768 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12769 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12770 NewMask);
12771 // This is free: ymm -> xmm.
12772 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12773 DAG.getVectorIdxConstant(0, DL));
12774}
12775
12776 /// Try to lower broadcast of a single element.
12777 ///
12778 /// For convenience, this code also bundles all of the subtarget feature set
12779 /// filtering. While a little annoying to re-dispatch on type here, there isn't
12780 /// a convenient way to factor it out.
// NOTE(review): original line 12781 (the declaration line with the function
// name — presumably lowerShuffleAsBroadcast; confirm against the full source)
// was dropped by the listing extraction; the parameter list resumes here.
12782 SDValue V2, ArrayRef<int> Mask,
12783 const X86Subtarget &Subtarget,
12784 SelectionDAG &DAG) {
12785 MVT EltVT = VT.getVectorElementType();
// Feature gate: SSE3 gives MOVDDUP (v2f64 only), AVX gives FP broadcasts,
// AVX2 gives integer/f16 broadcasts.
12786 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12787 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12788 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12789 return SDValue();
12790
12791 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12792 // we can only broadcast from a register with AVX2.
12793 unsigned NumEltBits = VT.getScalarSizeInBits();
// NOTE(review): original lines 12795-12796 (the two arms of this conditional
// operator selecting the broadcast opcode — by line 12797's test, one arm is
// X86ISD::MOVDDUP) were dropped by the listing extraction.
12794 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12797 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12798
12799 // Check that the mask is a broadcast.
12800 int BroadcastIdx = getSplatIndex(Mask);
12801 if (BroadcastIdx < 0) {
12802 // Check for hidden broadcast.
12803 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
12804 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
12805 return SDValue();
12806 BroadcastIdx = 0;
12807 }
12808 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12809 "a sorted mask where the broadcast "
12810 "comes from V1.");
12811 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
12812
12813 // Go up the chain of (vector) values to find a scalar load that we can
12814 // combine with the broadcast.
12815 // TODO: Combine this logic with findEltLoadSrc() used by
12816 // EltsFromConsecutiveLoads().
12817 int BitOffset = BroadcastIdx * NumEltBits;
12818 SDValue V = V1;
12819 for (;;) {
12820 switch (V.getOpcode()) {
12821 case ISD::BITCAST: {
12822 V = V.getOperand(0);
12823 continue;
12824 }
12825 case ISD::CONCAT_VECTORS: {
// Step into the concatenated operand that covers BitOffset.
12826 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12827 int OpIdx = BitOffset / OpBitWidth;
12828 V = V.getOperand(OpIdx);
12829 BitOffset %= OpBitWidth;
12830 continue;
12831 }
// NOTE(review): original line 12832 (the case label opening this block —
// presumably "case ISD::EXTRACT_SUBVECTOR: {", consistent with reading the
// index from operand 1 below) was dropped by the listing extraction.
12833 // The extraction index adds to the existing offset.
12834 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12835 unsigned Idx = V.getConstantOperandVal(1);
12836 unsigned BeginOffset = Idx * EltBitWidth;
12837 BitOffset += BeginOffset;
12838 V = V.getOperand(0);
12839 continue;
12840 }
12841 case ISD::INSERT_SUBVECTOR: {
// Follow whichever of the inserted subvector / outer vector actually
// supplies the bits at BitOffset.
12842 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12843 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12844 int Idx = (int)V.getConstantOperandVal(2);
12845 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12846 int BeginOffset = Idx * EltBitWidth;
12847 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12848 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12849 BitOffset -= BeginOffset;
12850 V = VInner;
12851 } else {
12852 V = VOuter;
12853 }
12854 continue;
12855 }
12856 }
12857 break;
12858 }
12859 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12860 BroadcastIdx = BitOffset / NumEltBits;
12861
12862 // Do we need to bitcast the source to retrieve the original broadcast index?
12863 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12864
12865 // Check if this is a broadcast of a scalar. We special case lowering
12866 // for scalars so that we can more effectively fold with loads.
12867 // If the original value has a larger element type than the shuffle, the
12868 // broadcast element is in essence truncated. Make that explicit to ease
12869 // folding.
12870 if (BitCastSrc && VT.isInteger())
12871 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12872 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12873 return TruncBroadcast;
12874
12875 // Also check the simpler case, where we can directly reuse the scalar.
12876 if (!BitCastSrc &&
12877 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12878 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12879 V = V.getOperand(BroadcastIdx);
12880
12881 // If we can't broadcast from a register, check that the input is a load.
12882 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12883 return SDValue();
12884 } else if (ISD::isNormalLoad(V.getNode()) &&
12885 cast<LoadSDNode>(V)->isSimple()) {
12886 // We do not check for one-use of the vector load because a broadcast load
12887 // is expected to be a win for code size, register pressure, and possibly
12888 // uops even if the original vector load is not eliminated.
12889
12890 // Reduce the vector load and shuffle to a broadcasted scalar load.
12891 LoadSDNode *Ld = cast<LoadSDNode>(V);
12892 SDValue BaseAddr = Ld->getOperand(1);
12893 MVT SVT = VT.getScalarType();
12894 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12895 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
// NOTE(review): original line 12897 (the initializer of NewAddr — presumably
// computing BaseAddr plus the byte Offset) was dropped by the listing
// extraction.
12896 SDValue NewAddr =
12898
12899 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12900 // than MOVDDUP.
12901 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12902 if (Opcode == X86ISD::VBROADCAST) {
12903 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12904 SDValue Ops[] = {Ld->getChain(), NewAddr};
// NOTE(review): original lines 12907 and 12909 (the line introducing the
// memory-operand argument continued at 12908, and the statement following
// the call — likely preserving memory ordering for the replaced load) were
// dropped by the listing extraction.
12905 V = DAG.getMemIntrinsicNode(
12906 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12908 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12910 return DAG.getBitcast(VT, V);
12911 }
12912 assert(SVT == MVT::f64 && "Unexpected VT!");
// NOTE(review): original lines 12914 and 12916 (the line introducing the
// memory-operand argument continued at 12915, and the statement after the
// load) were dropped by the listing extraction.
12913 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12915 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12917 } else if (!BroadcastFromReg) {
12918 // We can't broadcast from a vector register.
12919 return SDValue();
12920 } else if (BitOffset != 0) {
12921 // We can only broadcast from the zero-element of a vector register,
12922 // but it can be advantageous to broadcast from the zero-element of a
12923 // subvector.
12924 if (!VT.is256BitVector() && !VT.is512BitVector())
12925 return SDValue();
12926
12927 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12928 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12929 return SDValue();
12930
12931 // If we are broadcasting an element from the lowest 128-bit subvector, try
12932 // to move the element in position.
12933 if (BitOffset < 128 && NumActiveElts > 1 &&
12934 V.getScalarValueSizeInBits() == NumEltBits) {
12935 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12936 "Unexpected bit-offset");
12937 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
12938 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
12939 V = extractSubVector(V, 0, DAG, DL, 128);
12940 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
12941 } else {
12942 // Only broadcast the zero-element of a 128-bit subvector.
12943 if ((BitOffset % 128) != 0)
12944 return SDValue();
12945
12946 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12947 "Unexpected bit-offset");
12948 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12949 "Unexpected vector size");
12950 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12951 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12952 }
12953 }
12954
12955 // On AVX we can use VBROADCAST directly for scalar sources.
12956 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12957 V = DAG.getBitcast(MVT::f64, V);
12958 if (Subtarget.hasAVX()) {
12959 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12960 return DAG.getBitcast(VT, V);
12961 }
12962 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12963 }
12964
12965 // If this is a scalar, do the broadcast on this type and bitcast.
12966 if (!V.getValueType().isVector()) {
12967 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12968 "Unexpected scalar size");
// NOTE(review): original line 12970 (the second argument of getVectorVT —
// presumably the element count) was dropped by the listing extraction.
12969 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12971 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12972 }
12973
12974 // We only support broadcasting from 128-bit vectors to minimize the
12975 // number of patterns we need to deal with in isel. So extract down to
12976 // 128-bits, removing as many bitcasts as possible.
// NOTE(review): original line 12978 (the then-statement of this if —
// presumably extracting the low 128-bit subvector of V) was dropped by the
// listing extraction.
12977 if (V.getValueSizeInBits() > 128)
12979
12980 // Otherwise cast V to a vector with the same element type as VT, but
12981 // possibly narrower than VT. Then perform the broadcast.
12982 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12983 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12984 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12985}
12986
12987// Check for whether we can use INSERTPS to perform the shuffle. We only use
12988// INSERTPS when the V1 elements are already in the correct locations
12989// because otherwise we can just always use two SHUFPS instructions which
12990// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12991// perform INSERTPS if a single V1 element is out of place and all V2
12992// elements are zeroable.
12994                                unsigned &InsertPSMask,
12995                                const APInt &Zeroable,
12996                                ArrayRef<int> Mask, SelectionDAG &DAG) {
12997   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12998   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12999   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13000
13001   // Attempt to match INSERTPS with one element from VA or VB being
13002   // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13003   // are updated.
  // Helper: returns true iff CandidateMask can be expressed as "VA with a
  // single non-zeroable element inserted". V1/V2/InsertPSMask (captured by
  // reference) are only written on the success path, so a false return
  // leaves the caller's state untouched.
13004   auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13005                              ArrayRef<int> CandidateMask) {
13006     unsigned ZMask = 0;
13007     int VADstIndex = -1;
13008     int VBDstIndex = -1;
13009     bool VAUsedInPlace = false;
13010
13011     for (int i = 0; i < 4; ++i) {
13012       // Synthesize a zero mask from the zeroable elements (includes undefs).
13013       if (Zeroable[i]) {
13014         ZMask |= 1 << i;
13015         continue;
13016       }
13017
13018       // Flag if we use any VA inputs in place.
13019       if (i == CandidateMask[i]) {
13020         VAUsedInPlace = true;
13021         continue;
13022       }
13023
13024       // We can only insert a single non-zeroable element.
13025       if (VADstIndex >= 0 || VBDstIndex >= 0)
13026         return false;
13027
13028       if (CandidateMask[i] < 4) {
13029         // VA input out of place for insertion.
13030         VADstIndex = i;
13031       } else {
13032         // VB input for insertion.
13033         VBDstIndex = i;
13034       }
13035     }
13036
13037     // Don't bother if we have no (non-zeroable) element for insertion.
13038     if (VADstIndex < 0 && VBDstIndex < 0)
13039       return false;
13040
13041     // Determine element insertion src/dst indices. The src index is from the
13042     // start of the inserted vector, not the start of the concatenated vector.
13043     unsigned VBSrcIndex = 0;
13044     if (VADstIndex >= 0) {
13045       // If we have a VA input out of place, we use VA as the V2 element
13046       // insertion and don't use the original V2 at all.
13047       VBSrcIndex = CandidateMask[VADstIndex];
13048       VBDstIndex = VADstIndex;
13049       VB = VA;
13050     } else {
13051       VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13052     }
13053
13054     // If no V1 inputs are used in place, then the result is created only from
13055     // the zero mask and the V2 insertion - so remove V1 dependency.
13056     if (!VAUsedInPlace)
13057       VA = DAG.getUNDEF(MVT::v4f32);
13058
13059     // Update V1, V2 and InsertPSMask accordingly.
13060     V1 = VA;
13061     V2 = VB;
13062
13063     // Insert the V2 element into the desired position.
    // Encode the INSERTPS immediate: bits [7:6] = source element of V2,
    // bits [5:4] = destination slot, bits [3:0] = zero mask.
13064     InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13065     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13066     return true;
13067   };
13068
13069   if (matchAsInsertPS(V1, V2, Mask))
13070     return true;
13071
13072   // Commute and try again.
  // The matcher is asymmetric (elements are inserted *into* VA), so retry
  // with the operands swapped and the mask commuted to match.
13073   SmallVector<int, 4> CommutedMask(Mask);
13075   if (matchAsInsertPS(V2, V1, CommutedMask))
13076     return true;
13077
13078   return false;
13079 }
13080
13082                                       ArrayRef<int> Mask, const APInt &Zeroable,
13083                                       SelectionDAG &DAG) {
13084   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13085   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13086
13087   // Attempt to match the insertps pattern.
  // Note: the matcher may rewrite V1/V2 (e.g. commute them or drop an
  // unused input) in addition to filling in the immediate.
13088   unsigned InsertPSMask = 0;
13089   if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13090     return SDValue();
13091
13092   // Insert the V2 element into the desired position.
  // Emit a single X86ISD::INSERTPS with the matched 8-bit immediate.
13093   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13094                      DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13095 }
13096
13097/// Handle lowering of 2-lane 64-bit floating point shuffles.
13098///
13099/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13100/// support for floating point shuffles but not integer shuffles. These
13101/// instructions will incur a domain crossing penalty on some chips though so
13102/// it is better to avoid lowering through this for integer vectors where
13103/// possible.
13105                             const APInt &Zeroable, SDValue V1, SDValue V2,
13106                             const X86Subtarget &Subtarget,
13107                             SelectionDAG &DAG) {
13108   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13109   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13110   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13111
13112   if (V2.isUndef()) {
13113     // Check for being able to broadcast a single element.
13114     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13115                                                     Mask, Subtarget, DAG))
13116       return Broadcast;
13117
13118     // Straight shuffle of a single input vector. Simulate this by using the
13119     // single input as both of the "inputs" to this instruction..
    // SHUFPD immediate: bit 0 selects the element taken from the first
    // operand, bit 1 the element taken from the second. With a single input
    // both bits index into V1.
13120     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13121
13122     if (Subtarget.hasAVX()) {
13123       // If we have AVX, we can use VPERMILPS which will allow folding a load
13124       // into the shuffle.
13125       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13126                          DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13127     }
13128
13129     return DAG.getNode(
13130         X86ISD::SHUFP, DL, MVT::v2f64,
13131         Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13132         Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13133         DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13134   }
13135   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13136   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13137   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13138   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13139
13140   if (Subtarget.hasAVX2())
13141     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13142       return Extract;
13143
13144   // When loading a scalar and then shuffling it into a vector we can often do
13145   // the insertion cheaply.
13147           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13148     return Insertion;
13149   // Try inverting the insertion since for v2 masks it is easy to do and we
13150   // can't reliably sort the mask one way or the other.
  // XOR with 2 flips which source vector each lane refers to
  // (indices 0..1 <-> 2..3), preserving undef (-1) lanes.
13151   int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13152                         Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13154           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13155     return Insertion;
13156
13157   // Try to use one of the special instruction patterns to handle two common
13158   // blend patterns if a zero-blend above didn't work.
13159   if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13160       isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13161     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13162       // We can either use a special instruction to load over the low double or
13163       // to move just the low double.
13164       return DAG.getNode(
13165           X86ISD::MOVSD, DL, MVT::v2f64, V2,
13166           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13167
13168   if (Subtarget.hasSSE41())
13169     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13170                                             Zeroable, Subtarget, DAG))
13171       return Blend;
13172
13173   // Use dedicated unpack instructions for masks that match their pattern.
13174   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13175     return V;
13176
  // Fallback: two-input SHUFPD. Mask[1] is rebased by -2 so bit 1 indexes
  // into V2's two elements.
13177   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13178   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13179                      DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13180 }
13181
13182/// Handle lowering of 2-lane 64-bit integer shuffles.
13183///
13184/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13185/// the integer unit to minimize domain crossing penalties. However, for blends
13186/// it falls back to the floating point shuffle operation with appropriate bit
13187/// casting.
13189                             const APInt &Zeroable, SDValue V1, SDValue V2,
13190                             const X86Subtarget &Subtarget,
13191                             SelectionDAG &DAG) {
13192   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13193   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13194   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13195
13196   if (V2.isUndef()) {
13197     // Check for being able to broadcast a single element.
13198     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13199                                                     Mask, Subtarget, DAG))
13200       return Broadcast;
13201
13202     // Straight shuffle of a single input vector. For everything from SSE2
13203     // onward this has a single fast instruction with no scary immediates.
13204     // We have to map the mask as it is actually a v4i32 shuffle instruction.
13205     V1 = DAG.getBitcast(MVT::v4i32, V1);
    // Each 64-bit lane maps to the pair of adjacent 32-bit lanes
    // (2*M, 2*M+1); undef lanes stay undef.
13206     int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13207                           Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13208                           Mask[1] < 0 ? -1 : (Mask[1] * 2),
13209                           Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13210     return DAG.getBitcast(
13211         MVT::v2i64,
13212         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13213                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13214   }
13215   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13216   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13217   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13218   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13219
13220   if (Subtarget.hasAVX2())
13221     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13222       return Extract;
13223
13224   // Try to use shift instructions.
13225   if (SDValue Shift =
13226           lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13227                               DAG, /*BitwiseOnly*/ false))
13228     return Shift;
13229
13230   // When loading a scalar and then shuffling it into a vector we can often do
13231   // the insertion cheaply.
13233           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13234     return Insertion;
13235   // Try inverting the insertion since for v2 masks it is easy to do and we
13236   // can't reliably sort the mask one way or the other.
  // XOR with 2 flips which source vector each lane refers to (0..1 <-> 2..3).
  // No undef lanes are possible here (asserted above), so no -1 guard needed.
13237   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13239           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13240     return Insertion;
13241
13242   // We have different paths for blend lowering, but they all must use the
13243   // *exact* same predicate.
13244   bool IsBlendSupported = Subtarget.hasSSE41();
13245   if (IsBlendSupported)
13246     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13247                                             Zeroable, Subtarget, DAG))
13248       return Blend;
13249
13250   // Use dedicated unpack instructions for masks that match their pattern.
13251   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13252     return V;
13253
13254   // Try to use byte rotation instructions.
13255   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
13256   if (Subtarget.hasSSSE3()) {
13257     if (Subtarget.hasVLX())
13258       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13259                                                 Zeroable, Subtarget, DAG))
13260         return Rotate;
13261
13262     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13263                                                   Subtarget, DAG))
13264       return Rotate;
13265   }
13266
13267   // If we have direct support for blends, we should lower by decomposing into
13268   // a permute. That will be faster than the domain cross.
13269   if (IsBlendSupported)
13270     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13271                                                 Zeroable, Subtarget, DAG);
13272
13273   // We implement this with SHUFPD which is pretty lame because it will likely
13274   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13275   // However, all the alternatives are still more cycles and newer chips don't
13276   // have this problem. It would be really nice if x86 had better shuffles here.
13277   V1 = DAG.getBitcast(MVT::v2f64, V1);
13278   V2 = DAG.getBitcast(MVT::v2f64, V2);
13279   return DAG.getBitcast(MVT::v2i64,
13280                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13281 }
13282
13283/// Lower a vector shuffle using the SHUFPS instruction.
13284///
13285/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13286/// It makes no assumptions about whether this is the *best* lowering, it simply
13287/// uses it.
13289                                     ArrayRef<int> Mask, SDValue V1,
13290                                     SDValue V2, SelectionDAG &DAG) {
  // SHUFP takes its two low result elements from the first operand and its
  // two high result elements from the second. LowV/HighV track which input
  // feeds each half while NewMask is rewritten into that canonical form.
13291   SDValue LowV = V1, HighV = V2;
13292   SmallVector<int, 4> NewMask(Mask);
13293   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13294
13295   if (NumV2Elements == 1) {
13296     int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13297
13298     // Compute the index adjacent to V2Index and in the same half by toggling
13299     // the low bit.
13300     int V2AdjIndex = V2Index ^ 1;
13301
13302     if (Mask[V2AdjIndex] < 0) {
13303       // Handles all the cases where we have a single V2 element and an undef.
13304       // This will only ever happen in the high lanes because we commute the
13305       // vector otherwise.
13306       if (V2Index < 2)
13307         std::swap(LowV, HighV);
13308       NewMask[V2Index] -= 4;
13309     } else {
13310       // Handle the case where the V2 element ends up adjacent to a V1 element.
13311       // To make this work, blend them together as the first step.
13312       int V1Index = V2AdjIndex;
      // Pre-blend: put the V2 element in lane 0 and the adjacent V1 element
      // in lane 2 of a temporary, then shuffle that temporary into place.
13313       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13314       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13315                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13316
13317       // Now proceed to reconstruct the final blend as we have the necessary
13318       // high or low half formed.
13319       if (V2Index < 2) {
13320         LowV = V2;
13321         HighV = V1;
13322       } else {
13323         HighV = V2;
13324       }
13325       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13326       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13327     }
13328   } else if (NumV2Elements == 2) {
13329     if (Mask[0] < 4 && Mask[1] < 4) {
13330       // Handle the easy case where we have V1 in the low lanes and V2 in the
13331       // high lanes.
13332       NewMask[2] -= 4;
13333       NewMask[3] -= 4;
13334     } else if (Mask[2] < 4 && Mask[3] < 4) {
13335       // We also handle the reversed case because this utility may get called
13336       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13337       // arrange things in the right direction.
13338       NewMask[0] -= 4;
13339       NewMask[1] -= 4;
13340       HighV = V1;
13341       LowV = V2;
13342     } else {
13343       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13344       // trying to place elements directly, just blend them and set up the final
13345       // shuffle to place them.
13346
13347       // The first two blend mask elements are for V1, the second two are for
13348       // V2.
13349       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13350                           Mask[2] < 4 ? Mask[2] : Mask[3],
13351                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13352                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13353       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13354                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13355
13356       // Now we do a normal shuffle of V1 by giving V1 as both operands to
13357       // a blend.
13358       LowV = HighV = V1;
13359       NewMask[0] = Mask[0] < 4 ? 0 : 2;
13360       NewMask[1] = Mask[0] < 4 ? 2 : 0;
13361       NewMask[2] = Mask[2] < 4 ? 1 : 3;
13362       NewMask[3] = Mask[2] < 4 ? 3 : 1;
13363     }
13364   } else if (NumV2Elements == 3) {
13365     // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13366     // we can get here due to other paths (e.g repeated mask matching) that we
13367     // don't want to do another round of lowerVECTOR_SHUFFLE.
    // Recurse once with the operands swapped so the 3-element side becomes V1;
    // the recursion terminates because the commuted mask has NumV2Elements==1.
13369     return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13370   }
13371   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13372                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13373 }
13374
13375/// Lower 4-lane 32-bit floating point shuffles.
13376///
13377/// Uses instructions exclusively from the floating point unit to minimize
13378/// domain crossing penalties, as these are sufficient to implement all v4f32
13379/// shuffles.
13381                             const APInt &Zeroable, SDValue V1, SDValue V2,
13382                             const X86Subtarget &Subtarget,
13383                             SelectionDAG &DAG) {
13384   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13385   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13386   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13387
  // Prefer a single BLENDPS when SSE4.1 is available before trying anything
  // more involved.
13388   if (Subtarget.hasSSE41())
13389     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13390                                             Zeroable, Subtarget, DAG))
13391       return Blend;
13392
13393   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13394
13395   if (NumV2Elements == 0) {
13396     // Check for being able to broadcast a single element.
13397     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13398                                                     Mask, Subtarget, DAG))
13399       return Broadcast;
13400
13401     // Use even/odd duplicate instructions for masks that match their pattern.
13402     if (Subtarget.hasSSE3()) {
13403       if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13404         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13405       if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13406         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13407     }
13408
13409     if (Subtarget.hasAVX()) {
13410       // If we have AVX, we can use VPERMILPS which will allow folding a load
13411       // into the shuffle.
13412       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13413                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13414     }
13415
13416     // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13417     // in SSE1 because otherwise they are widened to v2f64 and never get here.
13418     if (!Subtarget.hasSSE2()) {
13419       if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13420         return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13421       if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13422         return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13423     }
13424
13425     // Otherwise, use a straight shuffle of a single input vector. We pass the
13426     // input vector to both operands to simulate this with a SHUFPS.
13427     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13428                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13429   }
13430
  // Zero-extension is matched on the integer type and the result bitcast
  // back to v4f32.
13431   if (Subtarget.hasSSE2())
13433             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13434       ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13435       return ZExt;
13436     }
13437
13438   if (Subtarget.hasAVX2())
13439     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13440       return Extract;
13441
13442   // There are special ways we can lower some single-element blends. However, we
13443   // have custom ways we can lower more complex single-element blends below that
13444   // we defer to if both this and BLENDPS fail to match, so restrict this to
13445   // when the V2 input is targeting element 0 of the mask -- that is the fast
13446   // case here.
13447   if (NumV2Elements == 1 && Mask[0] >= 4)
13449             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13450       return V;
13451
13452   if (Subtarget.hasSSE41()) {
13453     // Use INSERTPS if we can complete the shuffle efficiently.
13454     if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13455       return V;
13456
    // If the mask cannot be done as a single SHUFPS, try a blend+permute
    // sequence before falling through to the generic SHUFPS lowering.
13457     if (!isSingleSHUFPSMask(Mask))
13458       if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13459                                                             V2, Mask, DAG))
13460         return BlendPerm;
13461   }
13462
13463   // Use low/high mov instructions. These are only valid in SSE1 because
13464   // otherwise they are widened to v2f64 and never get here.
13465   if (!Subtarget.hasSSE2()) {
13466     if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13467       return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13468     if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13469       return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13470   }
13471
13472   // Use dedicated unpack instructions for masks that match their pattern.
13473   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13474     return V;
13475
13476   // Otherwise fall back to a SHUFPS lowering strategy.
13477   return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13478 }
13479
13480/// Lower 4-lane i32 vector shuffles.
13481///
13482/// We try to handle these with integer-domain shuffles where we can, but for
13483/// blends we use the floating point domain blend instructions.
13485                             const APInt &Zeroable, SDValue V1, SDValue V2,
13486                             const X86Subtarget &Subtarget,
13487                             SelectionDAG &DAG) {
13488   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13489   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13490   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13491
13492   // Whenever we can lower this as a zext, that instruction is strictly faster
13493   // than any alternative. It also allows us to fold memory operands into the
13494   // shuffle in many cases.
13495   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13496                                                    Zeroable, Subtarget, DAG))
13497     return ZExt;
13498
13499   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13500
13501   // Try to use shift instructions if fast.
13502   if (Subtarget.preferLowerShuffleAsShift()) {
13503     if (SDValue Shift =
13504             lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13505                                 Subtarget, DAG, /*BitwiseOnly*/ true))
13506       return Shift;
13507     if (NumV2Elements == 0)
13508       if (SDValue Rotate =
13509               lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13510         return Rotate;
13511   }
13512
13513   if (NumV2Elements == 0) {
13514     // Try to use broadcast unless the mask only has one non-undef element.
13515     if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13516       if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13517                                                       Mask, Subtarget, DAG))
13518         return Broadcast;
13519     }
13520
13521     // Straight shuffle of a single input vector. For everything from SSE2
13522     // onward this has a single fast instruction with no scary immediates.
13523     // We coerce the shuffle pattern to be compatible with UNPCK instructions
13524     // but we aren't actually going to use the UNPCK instruction because doing
13525     // so prevents folding a load into this instruction or making a copy.
13526     const int UnpackLoMask[] = {0, 0, 1, 1};
13527     const int UnpackHiMask[] = {2, 2, 3, 3};
13528     if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13529       Mask = UnpackLoMask;
13530     else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13531       Mask = UnpackHiMask;
13532
13533     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13534                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13535   }
13536
13537   if (Subtarget.hasAVX2())
13538     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13539       return Extract;
13540
13541   // Try to use shift instructions.
13542   if (SDValue Shift =
13543           lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13544                               DAG, /*BitwiseOnly*/ false))
13545     return Shift;
13546
13547   // There are special ways we can lower some single-element blends.
13548   if (NumV2Elements == 1)
13550             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13551       return V;
13552
13553   // We have different paths for blend lowering, but they all must use the
13554   // *exact* same predicate.
13555   bool IsBlendSupported = Subtarget.hasSSE41();
13556   if (IsBlendSupported)
13557     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13558                                             Zeroable, Subtarget, DAG))
13559       return Blend;
13560
13561   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13562                                              Zeroable, Subtarget, DAG))
13563     return Masked;
13564
13565   // Use dedicated unpack instructions for masks that match their pattern.
13566   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13567     return V;
13568
13569   // Try to use byte rotation instructions.
13570   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
13571   if (Subtarget.hasSSSE3()) {
13572     if (Subtarget.hasVLX())
13573       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13574                                                 Zeroable, Subtarget, DAG))
13575         return Rotate;
13576
13577     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13578                                                   Subtarget, DAG))
13579       return Rotate;
13580   }
13581
13582   // Assume that a single SHUFPS is faster than an alternative sequence of
13583   // multiple instructions (even if the CPU has a domain penalty).
13584   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13585   if (!isSingleSHUFPSMask(Mask)) {
13586     // If we have direct support for blends, we should lower by decomposing into
13587     // a permute. That will be faster than the domain cross.
13588     if (IsBlendSupported)
13589       return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13590                                                   Zeroable, Subtarget, DAG);
13591
13592     // Try to lower by permuting the inputs into an unpack instruction.
13593     if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13594                                                         Mask, Subtarget, DAG))
13595       return Unpack;
13596   }
13597
13598   // We implement this with SHUFPS because it can blend from two vectors.
13599   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13600   // up the inputs, bypassing domain shift penalties that we would incur if we
13601   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13602   // relevant.
13603   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13604   SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13605   SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13606   return DAG.getBitcast(MVT::v4i32, ShufPS);
13607 }
13608
13609/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13610/// shuffle lowering, and the most complex part.
13611///
13612/// The lowering strategy is to try to form pairs of input lanes which are
13613/// targeted at the same half of the final vector, and then use a dword shuffle
13614/// to place them onto the right half, and finally unpack the paired lanes into
13615/// their final position.
13616///
13617/// The exact breakdown of how to form these dword pairs and align them on the
13618/// correct sides is really tricky. See the comments within the function for
13619/// more of the details.
13620///
13621/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13622/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13623/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13624/// vector, form the analogous 128-bit 8-element Mask.
13626 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13627 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13628 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13629 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13630
13631 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13632 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13633 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13634
13635 // Attempt to directly match PSHUFLW or PSHUFHW.
13636 if (isUndefOrInRange(LoMask, 0, 4) &&
13637 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13638 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13639 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13640 }
13641 if (isUndefOrInRange(HiMask, 4, 8) &&
13642 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13643 for (int i = 0; i != 4; ++i)
13644 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13645 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13646 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13647 }
13648
13649 SmallVector<int, 4> LoInputs;
13650 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13651 array_pod_sort(LoInputs.begin(), LoInputs.end());
13652 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13653 SmallVector<int, 4> HiInputs;
13654 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13655 array_pod_sort(HiInputs.begin(), HiInputs.end());
13656 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13657 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13658 int NumHToL = LoInputs.size() - NumLToL;
13659 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13660 int NumHToH = HiInputs.size() - NumLToH;
13661 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13662 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13663 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13664 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13665
13666 // If we are shuffling values from one half - check how many different DWORD
13667 // pairs we need to create. If only 1 or 2 then we can perform this as a
13668 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13669 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13670 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13671 V = DAG.getNode(ShufWOp, DL, VT, V,
13672 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13673 V = DAG.getBitcast(PSHUFDVT, V);
13674 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13675 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13676 return DAG.getBitcast(VT, V);
13677 };
13678
13679 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13680 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13681 SmallVector<std::pair<int, int>, 4> DWordPairs;
13682 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13683
13684 // Collect the different DWORD pairs.
13685 for (int DWord = 0; DWord != 4; ++DWord) {
13686 int M0 = Mask[2 * DWord + 0];
13687 int M1 = Mask[2 * DWord + 1];
13688 M0 = (M0 >= 0 ? M0 % 4 : M0);
13689 M1 = (M1 >= 0 ? M1 % 4 : M1);
13690 if (M0 < 0 && M1 < 0)
13691 continue;
13692
13693 bool Match = false;
13694 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13695 auto &DWordPair = DWordPairs[j];
13696 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13697 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13698 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13699 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13700 PSHUFDMask[DWord] = DOffset + j;
13701 Match = true;
13702 break;
13703 }
13704 }
13705 if (!Match) {
13706 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13707 DWordPairs.push_back(std::make_pair(M0, M1));
13708 }
13709 }
13710
13711 if (DWordPairs.size() <= 2) {
13712 DWordPairs.resize(2, std::make_pair(-1, -1));
13713 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13714 DWordPairs[1].first, DWordPairs[1].second};
13715 if ((NumHToL + NumHToH) == 0)
13716 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13717 if ((NumLToL + NumLToH) == 0)
13718 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13719 }
13720 }
13721
13722 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13723 // such inputs we can swap two of the dwords across the half mark and end up
13724 // with <=2 inputs to each half in each half. Once there, we can fall through
13725 // to the generic code below. For example:
13726 //
13727 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13728 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13729 //
13730 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13731 // and an existing 2-into-2 on the other half. In this case we may have to
13732 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13733 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13734 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13735 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13736 // half than the one we target for fixing) will be fixed when we re-enter this
13737 // path. We will also combine away any sequence of PSHUFD instructions that
13738 // result into a single instruction. Here is an example of the tricky case:
13739 //
13740 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13741 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13742 //
13743 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13744 //
13745 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13746 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13747 //
13748 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13749 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13750 //
13751 // The result is fine to be handled by the generic logic.
13752 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13753 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13754 int AOffset, int BOffset) {
13755 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13756 "Must call this with A having 3 or 1 inputs from the A half.");
13757 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13758 "Must call this with B having 1 or 3 inputs from the B half.");
13759 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13760 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13761
13762 bool ThreeAInputs = AToAInputs.size() == 3;
13763
13764 // Compute the index of dword with only one word among the three inputs in
13765 // a half by taking the sum of the half with three inputs and subtracting
13766 // the sum of the actual three inputs. The difference is the remaining
13767 // slot.
13768 int ADWord = 0, BDWord = 0;
13769 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13770 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13771 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13772 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13773 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13774 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13775 int TripleNonInputIdx =
13776 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13777 TripleDWord = TripleNonInputIdx / 2;
13778
13779 // We use xor with one to compute the adjacent DWord to whichever one the
13780 // OneInput is in.
13781 OneInputDWord = (OneInput / 2) ^ 1;
13782
13783 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13784 // and BToA inputs. If there is also such a problem with the BToB and AToB
13785 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13786 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13787 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13788 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13789 // Compute how many inputs will be flipped by swapping these DWords. We
13790 // need
13791 // to balance this to ensure we don't form a 3-1 shuffle in the other
13792 // half.
13793 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13794 llvm::count(AToBInputs, 2 * ADWord + 1);
13795 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13796 llvm::count(BToBInputs, 2 * BDWord + 1);
13797 if ((NumFlippedAToBInputs == 1 &&
13798 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13799 (NumFlippedBToBInputs == 1 &&
13800 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13801 // We choose whether to fix the A half or B half based on whether that
13802 // half has zero flipped inputs. At zero, we may not be able to fix it
13803 // with that half. We also bias towards fixing the B half because that
13804 // will more commonly be the high half, and we have to bias one way.
13805 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13806 ArrayRef<int> Inputs) {
13807 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13808 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13809 // Determine whether the free index is in the flipped dword or the
13810 // unflipped dword based on where the pinned index is. We use this bit
13811 // in an xor to conditionally select the adjacent dword.
13812 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13813 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13814 if (IsFixIdxInput == IsFixFreeIdxInput)
13815 FixFreeIdx += 1;
13816 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13817 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13818 "We need to be changing the number of flipped inputs!");
13819 int PSHUFHalfMask[] = {0, 1, 2, 3};
13820 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13821 V = DAG.getNode(
13822 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13823 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13824 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13825
13826 for (int &M : Mask)
13827 if (M >= 0 && M == FixIdx)
13828 M = FixFreeIdx;
13829 else if (M >= 0 && M == FixFreeIdx)
13830 M = FixIdx;
13831 };
13832 if (NumFlippedBToBInputs != 0) {
13833 int BPinnedIdx =
13834 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13835 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13836 } else {
13837 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13838 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13839 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13840 }
13841 }
13842 }
13843
13844 int PSHUFDMask[] = {0, 1, 2, 3};
13845 PSHUFDMask[ADWord] = BDWord;
13846 PSHUFDMask[BDWord] = ADWord;
13847 V = DAG.getBitcast(
13848 VT,
13849 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13850 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13851
13852 // Adjust the mask to match the new locations of A and B.
13853 for (int &M : Mask)
13854 if (M >= 0 && M/2 == ADWord)
13855 M = 2 * BDWord + M % 2;
13856 else if (M >= 0 && M/2 == BDWord)
13857 M = 2 * ADWord + M % 2;
13858
13859 // Recurse back into this routine to re-compute state now that this isn't
13860 // a 3 and 1 problem.
13861 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13862 };
13863 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13864 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13865 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13866 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13867
13868 // At this point there are at most two inputs to the low and high halves from
13869 // each half. That means the inputs can always be grouped into dwords and
13870 // those dwords can then be moved to the correct half with a dword shuffle.
13871 // We use at most one low and one high word shuffle to collect these paired
13872 // inputs into dwords, and finally a dword shuffle to place them.
13873 int PSHUFLMask[4] = {-1, -1, -1, -1};
13874 int PSHUFHMask[4] = {-1, -1, -1, -1};
13875 int PSHUFDMask[4] = {-1, -1, -1, -1};
13876
13877 // First fix the masks for all the inputs that are staying in their
13878 // original halves. This will then dictate the targets of the cross-half
13879 // shuffles.
13880 auto fixInPlaceInputs =
13881 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13882 MutableArrayRef<int> SourceHalfMask,
13883 MutableArrayRef<int> HalfMask, int HalfOffset) {
13884 if (InPlaceInputs.empty())
13885 return;
13886 if (InPlaceInputs.size() == 1) {
13887 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13888 InPlaceInputs[0] - HalfOffset;
13889 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13890 return;
13891 }
13892 if (IncomingInputs.empty()) {
13893 // Just fix all of the in place inputs.
13894 for (int Input : InPlaceInputs) {
13895 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13896 PSHUFDMask[Input / 2] = Input / 2;
13897 }
13898 return;
13899 }
13900
13901 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13902 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13903 InPlaceInputs[0] - HalfOffset;
13904 // Put the second input next to the first so that they are packed into
13905 // a dword. We find the adjacent index by toggling the low bit.
13906 int AdjIndex = InPlaceInputs[0] ^ 1;
13907 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13908 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13909 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13910 };
13911 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13912 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13913
13914 // Now gather the cross-half inputs and place them into a free dword of
13915 // their target half.
13916 // FIXME: This operation could almost certainly be simplified dramatically to
13917 // look more like the 3-1 fixing operation.
13918 auto moveInputsToRightHalf = [&PSHUFDMask](
13919 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13920 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13921 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13922 int DestOffset) {
13923 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13924 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13925 };
13926 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13927 int Word) {
13928 int LowWord = Word & ~1;
13929 int HighWord = Word | 1;
13930 return isWordClobbered(SourceHalfMask, LowWord) ||
13931 isWordClobbered(SourceHalfMask, HighWord);
13932 };
13933
13934 if (IncomingInputs.empty())
13935 return;
13936
13937 if (ExistingInputs.empty()) {
13938 // Map any dwords with inputs from them into the right half.
13939 for (int Input : IncomingInputs) {
13940 // If the source half mask maps over the inputs, turn those into
13941 // swaps and use the swapped lane.
13942 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13943 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13944 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13945 Input - SourceOffset;
13946 // We have to swap the uses in our half mask in one sweep.
13947 for (int &M : HalfMask)
13948 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13949 M = Input;
13950 else if (M == Input)
13951 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13952 } else {
13953 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13954 Input - SourceOffset &&
13955 "Previous placement doesn't match!");
13956 }
13957 // Note that this correctly re-maps both when we do a swap and when
13958 // we observe the other side of the swap above. We rely on that to
13959 // avoid swapping the members of the input list directly.
13960 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13961 }
13962
13963 // Map the input's dword into the correct half.
13964 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13965 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13966 else
13967 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13968 Input / 2 &&
13969 "Previous placement doesn't match!");
13970 }
13971
13972 // And just directly shift any other-half mask elements to be same-half
13973 // as we will have mirrored the dword containing the element into the
13974 // same position within that half.
13975 for (int &M : HalfMask)
13976 if (M >= SourceOffset && M < SourceOffset + 4) {
13977 M = M - SourceOffset + DestOffset;
13978 assert(M >= 0 && "This should never wrap below zero!");
13979 }
13980 return;
13981 }
13982
13983 // Ensure we have the input in a viable dword of its current half. This
13984 // is particularly tricky because the original position may be clobbered
13985 // by inputs being moved and *staying* in that half.
13986 if (IncomingInputs.size() == 1) {
13987 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13988 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13989 SourceOffset;
13990 SourceHalfMask[InputFixed - SourceOffset] =
13991 IncomingInputs[0] - SourceOffset;
13992 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13993 InputFixed);
13994 IncomingInputs[0] = InputFixed;
13995 }
13996 } else if (IncomingInputs.size() == 2) {
13997 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13998 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13999 // We have two non-adjacent or clobbered inputs we need to extract from
14000 // the source half. To do this, we need to map them into some adjacent
14001 // dword slot in the source mask.
14002 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14003 IncomingInputs[1] - SourceOffset};
14004
14005 // If there is a free slot in the source half mask adjacent to one of
14006 // the inputs, place the other input in it. We use (Index XOR 1) to
14007 // compute an adjacent index.
14008 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14009 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14010 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14011 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14012 InputsFixed[1] = InputsFixed[0] ^ 1;
14013 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14014 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14015 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14016 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14017 InputsFixed[0] = InputsFixed[1] ^ 1;
14018 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14019 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14020 // The two inputs are in the same DWord but it is clobbered and the
14021 // adjacent DWord isn't used at all. Move both inputs to the free
14022 // slot.
14023 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14024 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14025 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14026 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14027 } else {
14028 // The only way we hit this point is if there is no clobbering
14029 // (because there are no off-half inputs to this half) and there is no
14030 // free slot adjacent to one of the inputs. In this case, we have to
14031 // swap an input with a non-input.
14032 for (int i = 0; i < 4; ++i)
14033 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14034 "We can't handle any clobbers here!");
14035 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14036 "Cannot have adjacent inputs here!");
14037
14038 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14039 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14040
14041 // We also have to update the final source mask in this case because
14042 // it may need to undo the above swap.
14043 for (int &M : FinalSourceHalfMask)
14044 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14045 M = InputsFixed[1] + SourceOffset;
14046 else if (M == InputsFixed[1] + SourceOffset)
14047 M = (InputsFixed[0] ^ 1) + SourceOffset;
14048
14049 InputsFixed[1] = InputsFixed[0] ^ 1;
14050 }
14051
14052 // Point everything at the fixed inputs.
14053 for (int &M : HalfMask)
14054 if (M == IncomingInputs[0])
14055 M = InputsFixed[0] + SourceOffset;
14056 else if (M == IncomingInputs[1])
14057 M = InputsFixed[1] + SourceOffset;
14058
14059 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14060 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14061 }
14062 } else {
14063 llvm_unreachable("Unhandled input size!");
14064 }
14065
14066 // Now hoist the DWord down to the right half.
14067 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14068 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14069 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14070 for (int &M : HalfMask)
14071 for (int Input : IncomingInputs)
14072 if (M == Input)
14073 M = FreeDWord * 2 + Input % 2;
14074 };
14075 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14076 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14077 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14078 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14079
14080 // Now enact all the shuffles we've computed to move the inputs into their
14081 // target half.
14082 if (!isNoopShuffleMask(PSHUFLMask))
14083 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14084 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14085 if (!isNoopShuffleMask(PSHUFHMask))
14086 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14087 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14088 if (!isNoopShuffleMask(PSHUFDMask))
14089 V = DAG.getBitcast(
14090 VT,
14091 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14092 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14093
14094 // At this point, each half should contain all its inputs, and we can then
14095 // just shuffle them into their final position.
14096 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
14097 "Failed to lift all the high half inputs to the low mask!");
14098 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14099 "Failed to lift all the low half inputs to the high mask!");
14100
14101 // Do a half shuffle for the low mask.
14102 if (!isNoopShuffleMask(LoMask))
14103 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14104 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14105
14106 // Do a half shuffle with the high mask after shifting its values down.
14107 for (int &M : HiMask)
14108 if (M >= 0)
14109 M -= 4;
14110 if (!isNoopShuffleMask(HiMask))
14111 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14112 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14113
14114 return V;
14115}
14116
14117/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14118/// blend if only one input is used.
14120 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14121 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14123 "Lane crossing shuffle masks not supported");
14124
14125 int NumBytes = VT.getSizeInBits() / 8;
14126 int Size = Mask.size();
14127 int Scale = NumBytes / Size;
14128
14129 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14130 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14131 V1InUse = false;
14132 V2InUse = false;
14133
14134 for (int i = 0; i < NumBytes; ++i) {
14135 int M = Mask[i / Scale];
14136 if (M < 0)
14137 continue;
14138
14139 const int ZeroMask = 0x80;
14140 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14141 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14142 if (Zeroable[i / Scale])
14143 V1Idx = V2Idx = ZeroMask;
14144
14145 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14146 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14147 V1InUse |= (ZeroMask != V1Idx);
14148 V2InUse |= (ZeroMask != V2Idx);
14149 }
14150
14151 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14152 if (V1InUse)
14153 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14154 DAG.getBuildVector(ShufVT, DL, V1Mask));
14155 if (V2InUse)
14156 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14157 DAG.getBuildVector(ShufVT, DL, V2Mask));
14158
14159 // If we need shuffled inputs from both, blend the two.
14160 SDValue V;
14161 if (V1InUse && V2InUse)
14162 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14163 else
14164 V = V1InUse ? V1 : V2;
14165
14166 // Cast the result back to the correct type.
14167 return DAG.getBitcast(VT, V);
14168}
14169
14170/// Generic lowering of 8-lane i16 shuffles.
14171///
14172/// This handles both single-input shuffles and combined shuffle/blends with
14173/// two inputs. The single input shuffles are immediately delegated to
14174/// a dedicated lowering routine.
14175///
14176/// The blends are lowered in one of three fundamental ways. If there are few
14177/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14178/// of the input is significantly cheaper when lowered as an interleaving of
14179/// the two inputs, try to interleave them. Otherwise, blend the low and high
14180/// halves of the inputs separately (making them have relatively few inputs)
14181/// and then concatenate them.
14183 const APInt &Zeroable, SDValue V1, SDValue V2,
14184 const X86Subtarget &Subtarget,
14185 SelectionDAG &DAG) {
14186 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14187 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14188 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14189
14190 // Whenever we can lower this as a zext, that instruction is strictly faster
14191 // than any alternative.
14192 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14193 Zeroable, Subtarget, DAG))
14194 return ZExt;
14195
14196 // Try to use lower using a truncation.
14197 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14198 Subtarget, DAG))
14199 return V;
14200
14201 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14202
14203 if (NumV2Inputs == 0) {
14204 // Try to use shift instructions.
14205 if (SDValue Shift =
14206 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14207 Subtarget, DAG, /*BitwiseOnly*/ false))
14208 return Shift;
14209
14210 // Check for being able to broadcast a single element.
14211 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14212 Mask, Subtarget, DAG))
14213 return Broadcast;
14214
14215 // Try to use bit rotation instructions.
14216 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14217 Subtarget, DAG))
14218 return Rotate;
14219
14220 // Use dedicated unpack instructions for masks that match their pattern.
14221 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14222 return V;
14223
14224 // Use dedicated pack instructions for masks that match their pattern.
14225 if (SDValue V =
14226 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14227 return V;
14228
14229 // Try to use byte rotation instructions.
14230 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14231 Subtarget, DAG))
14232 return Rotate;
14233
14234 // Make a copy of the mask so it can be modified.
14235 SmallVector<int, 8> MutableMask(Mask);
14236 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14237 Subtarget, DAG);
14238 }
14239
14240 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14241 "All single-input shuffles should be canonicalized to be V1-input "
14242 "shuffles.");
14243
14244 // Try to use shift instructions.
14245 if (SDValue Shift =
14246 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14247 DAG, /*BitwiseOnly*/ false))
14248 return Shift;
14249
14250 // See if we can use SSE4A Extraction / Insertion.
14251 if (Subtarget.hasSSE4A())
14252 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14253 Zeroable, DAG))
14254 return V;
14255
14256 // There are special ways we can lower some single-element blends.
14257 if (NumV2Inputs == 1)
14259 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14260 return V;
14261
14262 // We have different paths for blend lowering, but they all must use the
14263 // *exact* same predicate.
14264 bool IsBlendSupported = Subtarget.hasSSE41();
14265 if (IsBlendSupported)
14266 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14267 Zeroable, Subtarget, DAG))
14268 return Blend;
14269
14270 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14271 Zeroable, Subtarget, DAG))
14272 return Masked;
14273
14274 // Use dedicated unpack instructions for masks that match their pattern.
14275 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14276 return V;
14277
14278 // Use dedicated pack instructions for masks that match their pattern.
14279 if (SDValue V =
14280 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14281 return V;
14282
14283 // Try to use lower using a truncation.
14284 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14285 Subtarget, DAG))
14286 return V;
14287
14288 // Try to use byte rotation instructions.
14289 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14290 Subtarget, DAG))
14291 return Rotate;
14292
14293 if (SDValue BitBlend =
14294 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14295 return BitBlend;
14296
14297 // Try to use byte shift instructions to mask.
14298 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14299 Zeroable, Subtarget, DAG))
14300 return V;
14301
14302 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
14303 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14304 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14305 !Subtarget.hasVLX()) {
14306 // Check if this is part of a 256-bit vector truncation.
14307 unsigned PackOpc = 0;
14308 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14311 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14312 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14313 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14314 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14315 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14316 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14317 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14318 PackOpc = X86ISD::PACKUS;
14319 } else if (Subtarget.hasSSE41()) {
14320 SmallVector<SDValue, 4> DWordClearOps(4,
14321 DAG.getConstant(0, DL, MVT::i32));
14322 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14323 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14324 SDValue DWordClearMask =
14325 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14326 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14327 DWordClearMask);
14328 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14329 DWordClearMask);
14330 PackOpc = X86ISD::PACKUS;
14331 } else if (!Subtarget.hasSSSE3()) {
14332 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14333 V1 = DAG.getBitcast(MVT::v4i32, V1);
14334 V2 = DAG.getBitcast(MVT::v4i32, V2);
14335 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14336 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14337 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14338 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14339 PackOpc = X86ISD::PACKSS;
14340 }
14341 if (PackOpc) {
14342 // Now pack things back together.
14343 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14344 if (NumEvenDrops == 2) {
14345 Result = DAG.getBitcast(MVT::v4i32, Result);
14346 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14347 }
14348 return Result;
14349 }
14350 }
14351
14352 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14353 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14354 if (NumOddDrops == 1) {
14355 bool HasSSE41 = Subtarget.hasSSE41();
14356 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14357 DAG.getBitcast(MVT::v4i32, V1),
14358 DAG.getTargetConstant(16, DL, MVT::i8));
14359 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14360 DAG.getBitcast(MVT::v4i32, V2),
14361 DAG.getTargetConstant(16, DL, MVT::i8));
14362 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14363 MVT::v8i16, V1, V2);
14364 }
14365
14366 // Try to lower by permuting the inputs into an unpack instruction.
14367 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14368 Mask, Subtarget, DAG))
14369 return Unpack;
14370
14371 // If we can't directly blend but can use PSHUFB, that will be better as it
14372 // can both shuffle and set up the inefficient blend.
14373 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14374 bool V1InUse, V2InUse;
14375 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14376 Zeroable, DAG, V1InUse, V2InUse);
14377 }
14378
14379 // We can always bit-blend if we have to so the fallback strategy is to
14380 // decompose into single-input permutes and blends/unpacks.
14381 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14382 Zeroable, Subtarget, DAG);
14383}
14384
14385/// Lower 8-lane 16-bit floating point shuffles.
14387 const APInt &Zeroable, SDValue V1, SDValue V2,
14388 const X86Subtarget &Subtarget,
14389 SelectionDAG &DAG) {
14390 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14391 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14392 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14393 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14394
14395 if (Subtarget.hasFP16()) {
14396 if (NumV2Elements == 0) {
14397 // Check for being able to broadcast a single element.
14398 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14399 Mask, Subtarget, DAG))
14400 return Broadcast;
14401 }
14402 if (NumV2Elements == 1 && Mask[0] >= 8)
14404 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14405 return V;
14406 }
14407
14408 V1 = DAG.getBitcast(MVT::v8i16, V1);
14409 V2 = DAG.getBitcast(MVT::v8i16, V2);
14410 return DAG.getBitcast(MVT::v8f16,
14411 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14412}
14413
14414// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14415// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14416// the active subvector is extracted.
//
// The variable-permute mask is materialized as a constant vector of the
// integer type matching VT; VPERMV is used for single-input shuffles and
// VPERMV3 (two-source) otherwise.
14418                                    ArrayRef<int> OriginalMask, SDValue V1,
14419                                    SDValue V2, const X86Subtarget &Subtarget,
14420                                    SelectionDAG &DAG) {
14421 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14422 SmallVector<int, 32> Mask(OriginalMask);
14423 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14424 !isShuffleFoldableLoad(V2)) {
 // NOTE(review): the original line 14425 (the mask commutation that must
 // accompany this operand swap) is not visible in this extract - a bare
 // std::swap without remapping Mask would be incorrect; confirm upstream.
14426 std::swap(V1, V2);
14427 }
14428
14429 MVT MaskVT = VT.changeTypeToInteger();
14430 SDValue MaskNode;
14431 MVT ShuffleVT = VT;
14432 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
 // Without VLX the instruction only exists at 512 bits: widen both inputs
 // (concatenating undef padding) and operate on the wide type.
14433 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14434 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14435 ShuffleVT = V1.getSimpleValueType();
14436
14437 // Adjust mask to correct indices for the second input.
 // After widening, V2's elements start at Scale*NumElts instead of NumElts,
 // so indices referring to V2 must be rebased by (Scale-1)*NumElts.
14438 int NumElts = VT.getVectorNumElements();
14439 unsigned Scale = 512 / VT.getSizeInBits();
14440 SmallVector<int, 32> AdjustedMask(Mask);
14441 for (int &M : AdjustedMask)
14442 if (NumElts <= M)
14443 M += (Scale - 1) * NumElts;
14444 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14445 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14446 } else {
14447 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14448 }
14449
14450 SDValue Result;
14451 if (V2.isUndef())
14452 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14453 else
14454 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14455
 // If we padded to 512 bits, pull the originally-requested width back out.
14456 if (VT != ShuffleVT)
14457 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14458
14459 return Result;
14460}
14461
14462/// Generic lowering of v16i8 shuffles.
14463///
14464/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14465/// detect any complexity reducing interleaving. If that doesn't help, it uses
14466/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14467/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14468/// back together.
14470                                     const APInt &Zeroable, SDValue V1, SDValue V2,
14471                                     const X86Subtarget &Subtarget,
14472                                     SelectionDAG &DAG) {
14473 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14474 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14475 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14476
14477 // Try to use shift instructions.
14478 if (SDValue Shift =
14479 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14480 DAG, /*BitwiseOnly*/ false))
14481 return Shift;
14482
14483 // Try to use byte rotation instructions.
14484 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14485 Subtarget, DAG))
14486 return Rotate;
14487
14488 // Use dedicated pack instructions for masks that match their pattern.
14489 if (SDValue V =
14490 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14491 return V;
14492
14493 // Try to use a zext lowering.
14494 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14495 Zeroable, Subtarget, DAG))
14496 return ZExt;
14497
14498 // Try to use lower using a truncation.
14499 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14500 Subtarget, DAG))
14501 return V;
14502
14503 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14504 Subtarget, DAG))
14505 return V;
14506
14507 // See if we can use SSE4A Extraction / Insertion.
14508 if (Subtarget.hasSSE4A())
14509 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14510 Zeroable, DAG))
14511 return V;
14512
 // Count mask elements that select from V2 (indices 16..31).
14513 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14514
14515 // For single-input shuffles, there are some nicer lowering tricks we can use.
14516 if (NumV2Elements == 0) {
14517 // Check for being able to broadcast a single element.
14518 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14519 Mask, Subtarget, DAG))
14520 return Broadcast;
14521
14522 // Try to use bit rotation instructions.
14523 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14524 Subtarget, DAG))
14525 return Rotate;
14526
14527 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14528 return V;
14529
14530 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14531 // Notably, this handles splat and partial-splat shuffles more efficiently.
14532 // However, it only makes sense if the pre-duplication shuffle simplifies
14533 // things significantly. Currently, this means we need to be able to
14534 // express the pre-duplication shuffle as an i16 shuffle.
14535 //
14536 // FIXME: We should check for other patterns which can be widened into an
14537 // i16 shuffle as well.
 // A mask is widenable iff every even/odd byte pair either agrees or has an
 // undef half - then each i16 lane can hold one duplicated byte.
14538 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14539 for (int i = 0; i < 16; i += 2)
14540 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14541 return false;
14542
14543 return true;
14544 };
14545 auto tryToWidenViaDuplication = [&]() -> SDValue {
14546 if (!canWidenViaDuplication(Mask))
14547 return SDValue();
 // Partition the referenced source bytes into low-half and high-half
 // inputs, deduplicated and sorted.
14548 SmallVector<int, 4> LoInputs;
14549 copy_if(Mask, std::back_inserter(LoInputs),
14550 [](int M) { return M >= 0 && M < 8; });
14551 array_pod_sort(LoInputs.begin(), LoInputs.end());
14552 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14553 SmallVector<int, 4> HiInputs;
14554 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14555 array_pod_sort(HiInputs.begin(), HiInputs.end());
14556 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14557
 // Target whichever half already holds more of the inputs; the other
 // half's inputs must be moved in via the pre-duplication i16 shuffle.
14558 bool TargetLo = LoInputs.size() >= HiInputs.size();
14559 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14560 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14561
14562 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
 // NOTE(review): the declaration on the original line 14563 (the LaneMap
 // array written below) is not visible in this extract.
14564 for (int I : InPlaceInputs) {
14565 PreDupI16Shuffle[I/2] = I/2;
14566 LaneMap[I] = I;
14567 }
14568 int j = TargetLo ? 0 : 4, je = j + 4;
14569 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14570 // Check if j is already a shuffle of this input. This happens when
14571 // there are two adjacent bytes after we move the low one.
14572 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14573 // If we haven't yet mapped the input, search for a slot into which
14574 // we can map it.
14575 while (j < je && PreDupI16Shuffle[j] >= 0)
14576 ++j;
14577
14578 if (j == je)
14579 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14580 return SDValue();
14581
14582 // Map this input with the i16 shuffle.
14583 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14584 }
14585
14586 // Update the lane map based on the mapping we ended up with.
14587 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14588 }
14589 V1 = DAG.getBitcast(
14590 MVT::v16i8,
14591 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14592 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14593
14594 // Unpack the bytes to form the i16s that will be shuffled into place.
 // Only feed UNPCK with real data on the sides actually used by the mask.
14595 bool EvenInUse = false, OddInUse = false;
14596 for (int i = 0; i < 16; i += 2) {
14597 EvenInUse |= (Mask[i + 0] >= 0);
14598 OddInUse |= (Mask[i + 1] >= 0);
14599 if (EvenInUse && OddInUse)
14600 break;
14601 }
14602 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14603 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14604 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14605
14606 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14607 for (int i = 0; i < 16; ++i)
14608 if (Mask[i] >= 0) {
14609 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14610 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14611 if (PostDupI16Shuffle[i / 2] < 0)
14612 PostDupI16Shuffle[i / 2] = MappedMask;
14613 else
14614 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14615 "Conflicting entries in the original shuffle!");
14616 }
14617 return DAG.getBitcast(
14618 MVT::v16i8,
14619 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14620 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14621 };
14622 if (SDValue V = tryToWidenViaDuplication())
14623 return V;
14624 }
14625
14626 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14627 Zeroable, Subtarget, DAG))
14628 return Masked;
14629
14630 // Use dedicated unpack instructions for masks that match their pattern.
14631 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14632 return V;
14633
14634 // Try to use byte shift instructions to mask.
14635 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14636 Zeroable, Subtarget, DAG))
14637 return V;
14638
14639 // Check for compaction patterns.
14640 bool IsSingleInput = V2.isUndef();
14641 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14642
14643 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14644 // with PSHUFB. It is important to do this before we attempt to generate any
14645 // blends but after all of the single-input lowerings. If the single input
14646 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14647 // want to preserve that and we can DAG combine any longer sequences into
14648 // a PSHUFB in the end. But once we start blending from multiple inputs,
14649 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14650 // and there are *very* few patterns that would actually be faster than the
14651 // PSHUFB approach because of its ability to zero lanes.
14652 //
14653 // If the mask is a binary compaction, we can more efficiently perform this
14654 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14655 //
14656 // FIXME: The only exceptions to the above are blends which are exact
14657 // interleavings with direct instructions supporting them. We currently don't
14658 // handle those well here.
14659 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14660 bool V1InUse = false;
14661 bool V2InUse = false;
14662
 // NOTE(review): the call on the original line 14663 (presumably producing
 // the PSHUFB value returned below and setting V1InUse/V2InUse) is not
 // visible in this extract.
14664 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14665
14666 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14667 // do so. This avoids using them to handle blends-with-zero which is
14668 // important as a single pshufb is significantly faster for that.
14669 if (V1InUse && V2InUse) {
14670 if (Subtarget.hasSSE41())
14671 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14672 Zeroable, Subtarget, DAG))
14673 return Blend;
14674
14675 // We can use an unpack to do the blending rather than an or in some
14676 // cases. Even though the or may be (very minorly) more efficient, we
14677 // preference this lowering because there are common cases where part of
14678 // the complexity of the shuffles goes away when we do the final blend as
14679 // an unpack.
14680 // FIXME: It might be worth trying to detect if the unpack-feeding
14681 // shuffles will both be pshufb, in which case we shouldn't bother with
14682 // this.
14684 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14685 return Unpack;
14686
14687 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14688 if (Subtarget.hasVBMI())
14689 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14690 DAG);
14691
14692 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14693 if (Subtarget.hasXOP()) {
14694 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14695 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14696 }
14697
14698 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14699 // PALIGNR will be cheaper than the second PSHUFB+OR.
14701 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14702 return V;
14703 }
14704
14705 return PSHUFB;
14706 }
14707
14708 // There are special ways we can lower some single-element blends.
14709 if (NumV2Elements == 1)
14711 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14712 return V;
14713
14714 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14715 return Blend;
14716
14717 // Check whether a compaction lowering can be done. This handles shuffles
14718 // which take every Nth element for some even N. See the helper function for
14719 // details.
14720 //
14721 // We special case these as they can be particularly efficiently handled with
14722 // the PACKUSB instruction on x86 and they show up in common patterns of
14723 // rearranging bytes to truncate wide elements.
14724 if (NumEvenDrops) {
14725 // NumEvenDrops is the power of two stride of the elements. Another way of
14726 // thinking about it is that we need to drop the even elements this many
14727 // times to get the original input.
14728
14729 // First we need to zero all the dropped bytes.
14730 assert(NumEvenDrops <= 3 &&
14731 "No support for dropping even elements more than 3 times.");
 // Build a v8i16 mask of 0x00FF words at the stride of kept elements so the
 // AND clears every byte that PACKUS would otherwise saturate.
14732 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14733 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14734 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14735 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14736 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14737 WordClearMask);
14738 if (!IsSingleInput)
14739 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14740 WordClearMask);
14741
14742 // Now pack things back together.
14743 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14744 IsSingleInput ? V1 : V2);
14745 for (int i = 1; i < NumEvenDrops; ++i) {
14746 Result = DAG.getBitcast(MVT::v8i16, Result);
14747 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14748 }
14749 return Result;
14750 }
14751
 // The odd-drop analogue: shift each word right by 8 so the odd bytes move
 // into the low byte positions, then PACKUS the results together.
14752 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14753 if (NumOddDrops == 1) {
14754 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14755 DAG.getBitcast(MVT::v8i16, V1),
14756 DAG.getTargetConstant(8, DL, MVT::i8));
14757 if (!IsSingleInput)
14758 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14759 DAG.getBitcast(MVT::v8i16, V2),
14760 DAG.getTargetConstant(8, DL, MVT::i8));
14761 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14762 IsSingleInput ? V1 : V2);
14763 }
14764
14765 // Handle multi-input cases by blending/unpacking single-input shuffles.
14766 if (NumV2Elements > 0)
14767 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14768 Zeroable, Subtarget, DAG);
14769
14770 // The fallback path for single-input shuffles widens this into two v8i16
14771 // vectors with unpacks, shuffles those, and then pulls them back together
14772 // with a pack.
14773 SDValue V = V1;
14774
 // Split the 16-element byte mask into two 8-element halves.
14775 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14776 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14777 for (int i = 0; i < 16; ++i)
14778 if (Mask[i] >= 0)
14779 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14780
14781 SDValue VLoHalf, VHiHalf;
14782 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14783 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14784 // i16s.
14785 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14786 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14787 // Use a mask to drop the high bytes.
14788 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14789 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14790 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14791
14792 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14793 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14794
14795 // Squash the masks to point directly into VLoHalf.
14796 for (int &M : LoBlendMask)
14797 if (M >= 0)
14798 M /= 2;
14799 for (int &M : HiBlendMask)
14800 if (M >= 0)
14801 M /= 2;
14802 } else {
14803 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14804 // VHiHalf so that we can blend them as i16s.
14805 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14806
14807 VLoHalf = DAG.getBitcast(
14808 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14809 VHiHalf = DAG.getBitcast(
14810 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14811 }
14812
14813 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14814 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14815
 // PACKUS drops the high (zeroed) byte of each i16, recombining the halves.
14816 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14817}
14818
14819/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14820///
14821/// This routine breaks down the specific type of 128-bit shuffle and
14822/// dispatches to the lowering routines accordingly.
///
/// v8bf16 has no dedicated shuffle lowering; it is handled up front by
/// bitcasting to the bit-identical v8i16 form.
14824                                  MVT VT, SDValue V1, SDValue V2,
14825                                  const APInt &Zeroable,
14826                                  const X86Subtarget &Subtarget,
14827                                  SelectionDAG &DAG) {
14828 if (VT == MVT::v8bf16) {
14829 V1 = DAG.getBitcast(MVT::v8i16, V1);
14830 V2 = DAG.getBitcast(MVT::v8i16, V2);
14831 return DAG.getBitcast(VT,
14832 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14833 }
14834
 // Dispatch on the element type/count of the 128-bit vector.
14835 switch (VT.SimpleTy) {
14836 case MVT::v2i64:
14837 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14838 case MVT::v2f64:
14839 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14840 case MVT::v4i32:
14841 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14842 case MVT::v4f32:
14843 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14844 case MVT::v8i16:
14845 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14846 case MVT::v8f16:
14847 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14848 case MVT::v16i8:
14849 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14850
14851 default:
14852 llvm_unreachable("Unimplemented!");
14853 }
14854}
14855
14856/// Generic routine to split vector shuffle into half-sized shuffles.
14857///
14858/// This routine just extracts two subvectors, shuffles them independently, and
14859/// then concatenates them back together. This should work effectively with all
14860/// AVX vector shuffle types.
///
/// If SimpleOnly is set, the split is only performed when neither half of the
/// result needs the high half of either input; otherwise SDValue() is
/// returned.
14862                                    SDValue V2, ArrayRef<int> Mask,
14863                                    SelectionDAG &DAG, bool SimpleOnly) {
14864 assert(VT.getSizeInBits() >= 256 &&
14865 "Only for 256-bit or wider vector shuffles!");
14866 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14867 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14868
14869 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14870 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14871
14872 int NumElements = VT.getVectorNumElements();
14873 int SplitNumElements = NumElements / 2;
14874 MVT ScalarVT = VT.getVectorElementType();
14875 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14876
14877 // Use splitVector/extractSubVector so that split build-vectors just build two
14878 // narrower build vectors. This helps shuffling with splats and zeros.
14879 auto SplitVector = [&](SDValue V) {
14880 SDValue LoV, HiV;
14881 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14882 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14883 DAG.getBitcast(SplitVT, HiV));
14884 };
14885
14886 SDValue LoV1, HiV1, LoV2, HiV2;
14887 std::tie(LoV1, HiV1) = SplitVector(V1);
14888 std::tie(LoV2, HiV2) = SplitVector(V2);
14889
14890 // Now create two 4-way blends of these half-width vectors.
 // Computes which of the four input halves a half-mask actually references.
14891 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14892 bool &UseHiV1, bool &UseLoV2,
14893 bool &UseHiV2) {
14894 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14895 for (int i = 0; i < SplitNumElements; ++i) {
14896 int M = HalfMask[i];
14897 if (M >= NumElements) {
14898 if (M >= NumElements + SplitNumElements)
14899 UseHiV2 = true;
14900 else
14901 UseLoV2 = true;
14902 } else if (M >= 0) {
14903 if (M >= SplitNumElements)
14904 UseHiV1 = true;
14905 else
14906 UseLoV1 = true;
14907 }
14908 }
14909 };
14910
 // Under SimpleOnly, reject any half that needs a high half of an input.
14911 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14912 if (!SimpleOnly)
14913 return true;
14914
14915 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14916 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14917
14918 return !(UseHiV1 || UseHiV2);
14919 };
14920
 // Lower one half of the result: shuffle each input's halves down to a
 // single half-width value, then blend the two together.
14921 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14922 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14923 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14924 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14925 for (int i = 0; i < SplitNumElements; ++i) {
14926 int M = HalfMask[i];
14927 if (M >= NumElements) {
14928 V2BlendMask[i] = M - NumElements;
14929 BlendMask[i] = SplitNumElements + i;
14930 } else if (M >= 0) {
14931 V1BlendMask[i] = M;
14932 BlendMask[i] = i;
14933 }
14934 }
14935
14936 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14937 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14938
14939 // Because the lowering happens after all combining takes place, we need to
14940 // manually combine these blend masks as much as possible so that we create
14941 // a minimal number of high-level vector shuffle nodes.
14942 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14943
14944 // First try just blending the halves of V1 or V2.
14945 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14946 return DAG.getUNDEF(SplitVT);
14947 if (!UseLoV2 && !UseHiV2)
14948 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14949 if (!UseLoV1 && !UseHiV1)
14950 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14951
14952 SDValue V1Blend, V2Blend;
14953 if (UseLoV1 && UseHiV1) {
14954 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14955 } else {
14956 // We only use half of V1 so map the usage down into the final blend mask.
14957 V1Blend = UseLoV1 ? LoV1 : HiV1;
14958 for (int i = 0; i < SplitNumElements; ++i)
14959 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14960 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14961 }
14962 if (UseLoV2 && UseHiV2) {
14963 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14964 } else {
14965 // We only use half of V2 so map the usage down into the final blend mask.
14966 V2Blend = UseLoV2 ? LoV2 : HiV2;
14967 for (int i = 0; i < SplitNumElements; ++i)
14968 if (BlendMask[i] >= SplitNumElements)
14969 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14970 }
14971 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14972 };
14973
14974 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14975 return SDValue();
14976
14977 SDValue Lo = HalfBlend(LoMask);
14978 SDValue Hi = HalfBlend(HiMask);
14979 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14980}
14981
14982/// Either split a vector in halves or decompose the shuffles and the
14983/// blend/unpack.
14984///
14985/// This is provided as a good fallback for many lowerings of non-single-input
14986/// shuffles with more than one 128-bit lane. In those cases, we want to select
14987/// between splitting the shuffle into 128-bit components and stitching those
14988/// back together vs. extracting the single-input shuffles and blending those
14989/// results.
14991                                          SDValue V2, ArrayRef<int> Mask,
14992                                          const APInt &Zeroable,
14993                                          const X86Subtarget &Subtarget,
14994                                          SelectionDAG &DAG) {
14995 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14996 "shuffles as it could then recurse on itself.");
14997 int Size = Mask.size();
14998
14999 // If this can be modeled as a broadcast of two elements followed by a blend,
15000 // prefer that lowering. This is especially important because broadcasts can
15001 // often fold with memory operands.
 // True iff every referenced element of V1 is one fixed index, and likewise
 // for V2 (i.e. each input contributes at most a single splatted element).
15002 auto DoBothBroadcast = [&] {
15003 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15004 for (int M : Mask)
15005 if (M >= Size) {
15006 if (V2BroadcastIdx < 0)
15007 V2BroadcastIdx = M - Size;
15008 else if (M - Size != V2BroadcastIdx)
15009 return false;
15010 } else if (M >= 0) {
15011 if (V1BroadcastIdx < 0)
15012 V1BroadcastIdx = M;
15013 else if (M != V1BroadcastIdx)
15014 return false;
15015 }
15016 return true;
15017 };
15018 if (DoBothBroadcast())
15019 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15020 Subtarget, DAG);
15021
15022 // If the inputs all stem from a single 128-bit lane of each input, then we
15023 // split them rather than blending because the split will decompose to
15024 // unusually few instructions.
15025 int LaneCount = VT.getSizeInBits() / 128;
15026 int LaneSize = Size / LaneCount;
 // LaneInputs[input][lane] records which 128-bit lanes of each input are used.
15027 SmallBitVector LaneInputs[2];
15028 LaneInputs[0].resize(LaneCount, false);
15029 LaneInputs[1].resize(LaneCount, false);
15030 for (int i = 0; i < Size; ++i)
15031 if (Mask[i] >= 0)
15032 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15033 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15034 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15035 /*SimpleOnly*/ false);
15036
15037 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15038 // requires that the decomposed single-input shuffles don't end up here.
15039 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15040 Subtarget, DAG);
15041}
15042
15043// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15044// TODO: Extend to support v8f32 (+ 512-bit shuffles).
//
// SHUFPD takes one element per 128-bit lane from each of its two operands,
// so any v4f64 mask can be realized by first gathering the required elements
// into an LHS and an RHS vector (one lane-shuffle each) and then selecting
// with a per-lane SHUFPD immediate.
15046                                                 SDValue V1, SDValue V2,
15047                                                 ArrayRef<int> Mask,
15048                                                 SelectionDAG &DAG) {
15049 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15050
15051 int LHSMask[4] = {-1, -1, -1, -1};
15052 int RHSMask[4] = {-1, -1, -1, -1};
15053 int SHUFPDMask[4] = {-1, -1, -1, -1};
15054
15055 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15056 // perform the shuffle once the lanes have been shuffled in place.
15057 for (int i = 0; i != 4; ++i) {
15058 int M = Mask[i];
15059 if (M < 0)
15060 continue;
 // Even result positions read from LHS, odd positions from RHS; the source
 // element is staged into the matching lane at its original low/high slot.
15061 int LaneBase = i & ~1;
15062 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15063 LaneMask[LaneBase + (M & 1)] = M;
15064 SHUFPDMask[i] = M & 1;
15065 }
15066
15067 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15068 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15069 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15070 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15071}
15072
15073/// Lower a vector shuffle crossing multiple 128-bit lanes as
15074/// a lane permutation followed by a per-lane permutation.
15075///
15076/// This is mainly for cases where we can have non-repeating permutes
15077/// in each lane.
15078///
15079/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15080/// we should investigate merging them.
15082    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15083    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15084 int NumElts = VT.getVectorNumElements();
15085 int NumLanes = VT.getSizeInBits() / 128;
15086 int NumEltsPerLane = NumElts / NumLanes;
 // Sub-lane (64/32-bit) granularity needs AVX2 cross-lane permutes and is
 // only attempted for single-input shuffles.
15087 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15088
15089 /// Attempts to find a sublane permute with the given size
15090 /// that gets all elements into their target lanes.
15091 ///
15092 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
15093 /// If unsuccessful, returns false and may overwrite InLaneMask.
15094 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15095 int NumSublanesPerLane = NumSublanes / NumLanes;
15096 int NumEltsPerSublane = NumElts / NumSublanes;
15097
15098 SmallVector<int, 16> CrossLaneMask;
15099 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15100 // CrossLaneMask but one entry == one sublane.
15101 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15102 APInt DemandedCrossLane = APInt::getZero(NumElts);
15103
15104 for (int i = 0; i != NumElts; ++i) {
15105 int M = Mask[i];
15106 if (M < 0)
15107 continue;
15108
15109 int SrcSublane = M / NumEltsPerSublane;
15110 int DstLane = i / NumEltsPerLane;
15111
15112 // We only need to get the elements into the right lane, not sublane.
15113 // So search all sublanes that make up the destination lane.
15114 bool Found = false;
15115 int DstSubStart = DstLane * NumSublanesPerLane;
15116 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15117 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15118 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15119 continue;
15120
15121 Found = true;
15122 CrossLaneMaskLarge[DstSublane] = SrcSublane;
 // The in-lane shuffle then picks the element out of the sublane the
 // cross-lane step delivered it to.
15123 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15124 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15125 DemandedCrossLane.setBit(InLaneMask[i]);
15126 break;
15127 }
15128 if (!Found)
15129 return SDValue();
15130 }
15131
15132 // Fill CrossLaneMask using CrossLaneMaskLarge.
15133 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15134
15135 if (!CanUseSublanes) {
15136 // If we're only shuffling a single lowest lane and the rest are identity
15137 // then don't bother.
15138 // TODO - isShuffleMaskInputInPlace could be extended to something like
15139 // this.
15140 int NumIdentityLanes = 0;
15141 bool OnlyShuffleLowestLane = true;
15142 for (int i = 0; i != NumLanes; ++i) {
15143 int LaneOffset = i * NumEltsPerLane;
15144 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15145 i * NumEltsPerLane))
15146 NumIdentityLanes++;
15147 else if (CrossLaneMask[LaneOffset] != 0)
15148 OnlyShuffleLowestLane = false;
15149 }
15150 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15151 return SDValue();
15152 }
15153
15154 // Avoid returning the same shuffle operation. For example,
15155 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15156 // undef:v16i16
15157 if (CrossLaneMask == Mask || InLaneMask == Mask)
15158 return SDValue();
15159
15160 // Simplify CrossLaneMask based on the actual demanded elements.
15161 if (V1.hasOneUse())
15162 for (int i = 0; i != NumElts; ++i)
15163 if (!DemandedCrossLane[i])
15164 CrossLaneMask[i] = SM_SentinelUndef;
15165
15166 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15167 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15168 InLaneMask);
15169 };
15170
15171 // First attempt a solution with full lanes.
15172 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15173 return V;
15174
15175 // The rest of the solutions use sublanes.
15176 if (!CanUseSublanes)
15177 return SDValue();
15178
15179 // Then attempt a solution with 64-bit sublanes (vpermq).
15180 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15181 return V;
15182
15183 // If that doesn't work and we have fast variable cross-lane shuffle,
15184 // attempt 32-bit sublanes (vpermd).
15185 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15186 return SDValue();
15187
15188 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15189}
15190
15191/// Helper to get compute inlane shuffle mask for a complete shuffle mask.
15192static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15193 SmallVector<int> &InLaneMask) {
15194 int Size = Mask.size();
15195 InLaneMask.assign(Mask.begin(), Mask.end());
15196 for (int i = 0; i < Size; ++i) {
15197 int &M = InLaneMask[i];
15198 if (M < 0)
15199 continue;
15200 if (((M % Size) / LaneSize) != (i / LaneSize))
15201 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15202 }
15203}
15204
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
/// source with a lane permutation.
///
/// This lowering strategy results in four instructions in the worst case for a
/// single-input cross lane shuffle which is lower than any other fully general
/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
/// shuffle pattern should be handled prior to trying this lowering.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  // A 256-bit vector has exactly two 128-bit lanes.
  int LaneSize = Size / 2;

  // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
  // Only do this if the elements aren't all from the lower lane,
  // otherwise we're (probably) better off doing a split.
  if (VT == MVT::v4f64 &&
      !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
    return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool AllLanes;
  if (!Subtarget.hasAVX2()) {
    // Pre-AVX2: only worth it when *both* lanes contain elements that cross
    // into the other lane.
    bool LaneCrossing[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
    AllLanes = LaneCrossing[0] && LaneCrossing[1];
  } else {
    // With AVX2 it is enough that both source lanes are referenced at all.
    bool LaneUsed[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0)
        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
    AllLanes = LaneUsed[0] && LaneUsed[1];
  }

  // TODO - we could support shuffling V2 in the Flipped input.
  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  // Build a mask in which every element stays inside its destination lane;
  // cross-lane elements are redirected to the lane-flipped copy of V1 built
  // below (indices >= Size select the second shuffle operand).
  SmallVector<int> InLaneMask;
  computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);

  assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
         "In-lane shuffle mask expected");

  // If we're not using both lanes in each lane and the inlane mask is not
  // repeating, then we're better off splitting.
  if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
                                /*SimpleOnly*/ false);

  // Flip the lanes, and shuffle the results which should now be in-lane.
  // The flip is done as a v4f64/v4i64 shuffle so it maps to a single
  // 128-bit lane permute regardless of the element type.
  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
  SDValue Flipped = DAG.getBitcast(PVT, V1);
  Flipped =
      DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
  Flipped = DAG.getBitcast(VT, Flipped);
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
}
15269
/// Handle lowering 2-lane 128-bit shuffles.
                                 SDValue V2, ArrayRef<int> Mask,
                                 const APInt &Zeroable,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  if (V2.isUndef()) {
    // Attempt to match VBROADCAST*128 subvector broadcast load.
    bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
    bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
    if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
      // Broadcast either the low or the high 128-bit half directly from the
      // load: Ofs selects which half of the memory operand is splatted.
      MVT MemVT = VT.getHalfNumVectorElementsVT();
      unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
      auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
                                       VT, MemVT, Ld, Ofs, DAG))
        return BcstLd;
    }

    // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
    if (Subtarget.hasAVX2())
      return SDValue();
  }

  // Note whether V2 is a build vector of all zeros - widening may then treat
  // its elements as zeroable.
  bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());

  // This lowering works on whole 128-bit halves: widen the 4-element mask to
  // a 2-element mask (one index per 128-bit half) or bail.
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
    return SDValue();

  bool IsLowZero = (Zeroable & 0x3) == 0x3;
  bool IsHighZero = (Zeroable & 0xc) == 0xc;

  // Try to use an insert into a zero vector.
  if (WidenedMask[0] == 0 && IsHighZero) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getVectorIdxConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
                       DAG.getVectorIdxConstant(0, DL));
  }

  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // the zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                          Subtarget, DAG))
    return Blend;

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsLowZero && !IsHighZero) {
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
    if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {

      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
      // this will likely become vinsertf128 which can't fold a 256-bit memop.
      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
        SDValue SubVec =
            DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
                        DAG.getVectorIdxConstant(0, DL));
        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                           DAG.getVectorIdxConstant(2, DL));
      }
    }

    // Try to use SHUF128 if possible.
    if (Subtarget.hasVLX()) {
      // Only matches the "low half from one source, high half from the
      // other" pattern; other combinations fall through to VPERM2X128.
      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
                            ((WidenedMask[1] % 2) << 1);
        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
      }
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.

  // The immediate permute control byte looks like this:
  // [1:0] - select 128 bits from sources for low half of destination
  // [2] - ignore
  // [3] - zero low half of destination
  // [5:4] - select 128 bits from sources for high half of destination
  // [6] - ignore
  // [7] - zero high half of destination

  assert((WidenedMask[0] >= 0 || IsLowZero) &&
         (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");

  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

  // Check the immediate mask and replace unused sources with undef.
  if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
    V1 = DAG.getUNDEF(VT);
  if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
    V2 = DAG.getUNDEF(VT);

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
15383
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This attempts to create a repeated lane shuffle where each lane uses one
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  // Already a repeated lane mask - nothing for this strategy to gain.
  if (is128BitLaneRepeatedShuffleMask(VT, Mask))
    return SDValue();

  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = 128 / VT.getScalarSizeInBits();
  // RepeatMask is the per-lane mask every lane must agree on; LaneSrcs
  // records, per destination lane, which input lanes feed its two operands.
  SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
  SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});

  // First pass will try to fill in the RepeatMask from lanes that need two
  // sources.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Srcs[2] = {-1, -1};
    SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (M < 0)
        continue;
      // Determine which of the possible input lanes (NumLanes from each source)
      // this element comes from. Assign that as one of the sources for this
      // lane. We can assign up to 2 sources for this lane. If we run out
      // sources we can't do anything.
      int LaneSrc = M / NumLaneElts;
      int Src;
      if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
        Src = 0;
      else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
        Src = 1;
      else
        return SDValue();

      Srcs[Src] = LaneSrc;
      InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
    }

    // If this lane has two sources, see if it fits with the repeat mask so far.
    if (Srcs[1] < 0)
      continue;

    LaneSrcs[Lane][0] = Srcs[0];
    LaneSrcs[Lane][1] = Srcs[1];

    // Two masks are compatible if they agree everywhere both are defined.
    auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
      assert(M1.size() == M2.size() && "Unexpected mask size");
      for (int i = 0, e = M1.size(); i != e; ++i)
        if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
          return false;
      return true;
    };

    // Fill MergedMask's undef slots from Mask (must already be compatible).
    auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
      assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
      for (int i = 0, e = MergedMask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
               "Unexpected mask element");
        MergedMask[i] = M;
      }
    };

    if (MatchMasks(InLaneMask, RepeatMask)) {
      // Merge this lane mask into the final repeat mask.
      MergeMasks(InLaneMask, RepeatMask);
      continue;
    }

    // Didn't find a match. Swap the operands and try again.
    std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);

    if (MatchMasks(InLaneMask, RepeatMask)) {
      // Merge this lane mask into the final repeat mask.
      MergeMasks(InLaneMask, RepeatMask);
      continue;
    }

    // Couldn't find a match with the operands in either order.
    return SDValue();
  }

  // Now handle any lanes with only one source.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    // If this lane has already been processed, skip it.
    if (LaneSrcs[Lane][0] >= 0)
      continue;

    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (M < 0)
        continue;

      // If RepeatMask isn't defined yet we can define it ourself.
      if (RepeatMask[i] < 0)
        RepeatMask[i] = M % NumLaneElts;

      if (RepeatMask[i] < NumElts) {
        // Repeat mask reads operand 0 here; this lane must agree and feed it.
        if (RepeatMask[i] != M % NumLaneElts)
          return SDValue();
        LaneSrcs[Lane][0] = M / NumLaneElts;
      } else {
        // Repeat mask reads operand 1 here.
        if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
          return SDValue();
        LaneSrcs[Lane][1] = M / NumLaneElts;
      }
    }

    if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
      return SDValue();
  }

  // Build the first lane-fixing shuffle: gather each destination lane's
  // first source lane into place.
  SmallVector<int, 16> NewMask(NumElts, -1);
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][0];
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = -1;
      if (Src >= 0)
        M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
    }
  }
  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  // Ensure we didn't get back the shuffle we started with.
  // FIXME: This is a hack to make up for some splat handling code in
  // getVectorShuffle.
  if (isa<ShuffleVectorSDNode>(NewV1) &&
      cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
    return SDValue();

  // Second lane-fixing shuffle for each destination lane's second source.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][1];
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = -1;
      if (Src >= 0)
        M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
    }
  }
  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  // Ensure we didn't get back the shuffle we started with.
  // FIXME: This is a hack to make up for some splat handling code in
  // getVectorShuffle.
  if (isa<ShuffleVectorSDNode>(NewV2) &&
      cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
    return SDValue();

  // Finally apply the repeated per-lane mask across the whole vector,
  // rebasing each entry into its own lane.
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0) {
      NewMask[i] = -1;
      continue;
    }
    NewMask[i] = RepeatMask[i % NumLaneElts];
    if (NewMask[i] < 0)
      continue;

    NewMask[i] += (i / NumLaneElts) * NumLaneElts;
  }
  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
15556
/// If the input shuffle mask results in a vector that is undefined in all upper
/// or lower half elements and that mask accesses only 2 halves of the
/// shuffle's operands, return true. A mask of half the width with mask indexes
/// adjusted to access the extracted halves of the original shuffle operands is
/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
/// lower half of each input operand is accessed.
static bool
                   int &HalfIdx1, int &HalfIdx2) {
  assert((Mask.size() == HalfMask.size() * 2) &&
         "Expected input mask to be twice as long as output");

  // Exactly one half of the result must be undef to allow narrowing.
  bool UndefLower = isUndefLowerHalf(Mask);
  bool UndefUpper = isUndefUpperHalf(Mask);
  if (UndefLower == UndefUpper)
    return false;

  unsigned HalfNumElts = HalfMask.size();
  // Only scan the defined half of the mask.
  unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
  HalfIdx1 = -1;
  HalfIdx2 = -1;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + MaskIndexOffset];
    if (M < 0) {
      // Propagate undef entries unchanged.
      HalfMask[i] = M;
      continue;
    }

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      // Second source: bias its elements past the first half's range.
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
    return false;
  }

  return true;
}
15612
/// Given the output values from getHalfShuffleMask(), create a half width
/// shuffle of extracted vectors followed by an insert back to full width.
                                  ArrayRef<int> HalfMask, int HalfIdx1,
                                  int HalfIdx2, bool UndefLower,
                                  SelectionDAG &DAG, bool UseConcat = false) {
  assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
  assert(V1.getValueType().isSimple() && "Expecting only simple types");

  MVT VT = V1.getSimpleValueType();
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  unsigned HalfNumElts = HalfVT.getVectorNumElements();

  // Extract the half selected by HalfIdx (encoding: 0 = Lower V1, 1 = Upper
  // V1, 2 = Lower V2, 3 = Upper V2); negative means undef.
  auto getHalfVector = [&](int HalfIdx) {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
                       DAG.getVectorIdxConstant(HalfIdx, DL));
  };

  // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
  SDValue Half1 = getHalfVector(HalfIdx1);
  SDValue Half2 = getHalfVector(HalfIdx2);
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
  if (UseConcat) {
    // Place the shuffled half and an undef half according to which half of
    // the result is defined.
    SDValue Op0 = V;
    SDValue Op1 = DAG.getUNDEF(HalfVT);
    if (UndefLower)
      std::swap(Op0, Op1);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
  }

  // Otherwise insert into the matching half of an undef full-width vector.
  unsigned Offset = UndefLower ? HalfNumElts : 0;
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
}
15651
/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
                                       SDValue V2, ArrayRef<int> Mask,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert((VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected 256-bit or 512-bit vector");

  bool UndefLower = isUndefLowerHalf(Mask);
  if (!UndefLower && !isUndefUpperHalf(Mask))
    return SDValue();

  assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
         "Completely undef shuffle mask should have been simplified already");

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  unsigned HalfNumElts = HalfVT.getVectorNumElements();
  if (!UndefLower &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getVectorIdxConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getVectorIdxConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getVectorIdxConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getVectorIdxConstant(HalfNumElts, DL));
  }

  // General case: express the defined half as a shuffle of (at most) two
  // operand halves, if possible.
  int HalfIdx1, HalfIdx2;
  SmallVector<int, 8> HalfMask(HalfNumElts);
  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
    return SDValue();

  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  // Only shuffle the halves of the inputs when useful.
  unsigned NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  unsigned NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
  assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");

  // Determine the larger pattern of undef/halves, then decide if it's worth
  // splitting the shuffle based on subtarget capabilities and types.
  unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
  if (!UndefLower) {
    // XXXXuuuu: no insert is needed.
    // Always extract lowers when setting lower - these are all free subreg ops.
    if (NumUpperHalves == 0)
      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
                                   UndefLower, DAG);

    if (NumUpperHalves == 1) {
      // AVX2 has efficient 32/64-bit element cross-lane shuffles.
      if (Subtarget.hasAVX2()) {
        // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
        if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
            !is128BitUnpackShuffleMask(HalfMask, DAG) &&
            (!isSingleSHUFPSMask(HalfMask) ||
             Subtarget.hasFastVariableCrossLaneShuffle()))
          return SDValue();
        // If this is an unary shuffle (assume that the 2nd operand is
        // canonicalized to undef), then we can use vpermpd. Otherwise, we
        // are better off extracting the upper half of 1 operand and using a
        // narrow shuffle.
        if (EltWidth == 64 && V2.isUndef())
          return SDValue();
        // If this is an unary vXi8 shuffle with inplace halves, then perform as
        // full width pshufb, and then merge.
        if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
          return SDValue();
      }
      // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
      if (Subtarget.hasAVX512() && VT.is512BitVector())
        return SDValue();
      // Extract + narrow shuffle is better than the wide alternative.
      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
                                   UndefLower, DAG);
    }

    // Don't extract both uppers, instead shuffle and then extract.
    assert(NumUpperHalves == 2 && "Half vector count went wrong");
    return SDValue();
  }

  // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
  if (NumUpperHalves == 0) {
    // AVX2 has efficient 64-bit element cross-lane shuffles.
    // TODO: Refine to account for unary shuffle, splat, and other masks?
    if (Subtarget.hasAVX2() && EltWidth == 64)
      return SDValue();
    // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
    if (Subtarget.hasAVX512() && VT.is512BitVector())
      return SDValue();
    // Narrow shuffle + insert is better than the wide alternative.
    return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
                                 UndefLower, DAG);
  }

  // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
  return SDValue();
}
15765
/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;

  // On AVX2 we may be able to just shuffle the lowest elements and then
  // broadcast the result.
  if (Subtarget.hasAVX2()) {
    for (unsigned BroadcastSize : {16, 32, 64}) {
      // Only broadcast units wider than a single element are useful here.
      if (BroadcastSize <= VT.getScalarSizeInBits())
        continue;
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only references the lowest 128-bit
      // lane of the inputs.
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
          for (int j = 0; j != NumBroadcastElts; ++j) {
            int M = Mask[i + j];
            if (M < 0)
              continue;
            int &R = RepeatMask[j];
            // Element must come from the lowest lane of its operand.
            if (0 != ((M % NumElts) / NumLaneElts))
              return false;
            // And every repeat of the pattern must agree.
            if (0 <= R && R != M)
              return false;
            R = M;
          }
        return true;
      };

      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
      if (!FindRepeatingBroadcastMask(RepeatMask))
        continue;

      // Shuffle the (lowest) repeated elements in place for broadcast.
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

      // Shuffle the actual broadcast.
      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
        for (int j = 0; j != NumBroadcastElts; ++j)
          BroadcastMask[i + j] = j;

      // Avoid returning the same shuffle operation. For example,
      // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
      if (BroadcastMask == Mask)
        return SDValue();

      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                  BroadcastMask);
    }
  }

  // Bail if the shuffle mask doesn't cross 128-bit lanes.
  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  // Bail if we already have a repeated lane shuffle mask.
  if (is128BitLaneRepeatedShuffleMask(VT, Mask))
    return SDValue();

  // Helper to look for repeated mask in each split sublane, and that those
  // sublanes can then be permuted into place.
  auto ShuffleSubLanes = [&](int SubLaneScale) {
    int NumSubLanes = NumLanes * SubLaneScale;
    int NumSubLaneElts = NumLaneElts / SubLaneScale;

    // Check that all the sources are coming from the same lane and see if we
    // can form a repeating shuffle mask (local to each sub-lane). At the same
    // time, determine the source sub-lane for each destination sub-lane.
    int TopSrcSubLane = -1;
    SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
    SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
        SubLaneScale,
        SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));

    for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
      // Extract the sub-lane mask, check that it all comes from the same lane
      // and normalize the mask entries to come from the first lane.
      int SrcLane = -1;
      SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
        int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
        if (M < 0)
          continue;
        int Lane = (M % NumElts) / NumLaneElts;
        if ((0 <= SrcLane) && (SrcLane != Lane))
          return SDValue();
        SrcLane = Lane;
        // Rebase into lane 0, keeping the operand selection (>= NumElts).
        int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
        SubLaneMask[Elt] = LocalM;
      }

      // Whole sub-lane is UNDEF.
      if (SrcLane < 0)
        continue;

      // Attempt to match against the candidate repeated sub-lane masks.
      for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
        // Masks match where both are defined; undefs are wildcards.
        auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
          for (int i = 0; i != NumSubLaneElts; ++i) {
            if (M1[i] < 0 || M2[i] < 0)
              continue;
            if (M1[i] != M2[i])
              return false;
          }
          return true;
        };

        auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
        if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
          continue;

        // Merge the sub-lane mask into the matching repeated sub-lane mask.
        for (int i = 0; i != NumSubLaneElts; ++i) {
          int M = SubLaneMask[i];
          if (M < 0)
            continue;
          assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
                 "Unexpected mask element");
          RepeatedSubLaneMask[i] = M;
        }

        // Track the top most source sub-lane - by setting the remaining to
        // UNDEF we can greatly simplify shuffle matching.
        int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
        TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
        Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
        break;
      }

      // Bail if we failed to find a matching repeated sub-lane mask.
      if (Dst2SrcSubLanes[DstSubLane] < 0)
        return SDValue();
    }
    assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
           "Unexpected source lane");

    // Create a repeating shuffle mask for the entire vector.
    SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
    for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
      int Lane = SubLane / SubLaneScale;
      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
        int M = RepeatedSubLaneMask[Elt];
        if (M < 0)
          continue;
        int Idx = (SubLane * NumSubLaneElts) + Elt;
        RepeatedMask[Idx] = M + (Lane * NumLaneElts);
      }
    }

    // Shuffle each source sub-lane to its destination.
    SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
    for (int i = 0; i != NumElts; i += NumSubLaneElts) {
      int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
      if (SrcSubLane < 0)
        continue;
      for (int j = 0; j != NumSubLaneElts; ++j)
        SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
    }

    // Avoid returning the same shuffle operation.
    // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
    if (RepeatedMask == Mask || SubLaneMask == Mask)
      return SDValue();

    SDValue RepeatedShuffle =
        DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

    return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
                                SubLaneMask);
  };

  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
  // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
  // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
  // Otherwise we can only permute whole 128-bit lanes.
  int MinSubLaneScale = 1, MaxSubLaneScale = 1;
  if (Subtarget.hasAVX2() && VT.is256BitVector()) {
    bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
    MinSubLaneScale = 2;
    MaxSubLaneScale =
        (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
  }
  if (Subtarget.hasBWI() && VT == MVT::v64i8)
    MinSubLaneScale = MaxSubLaneScale = 4;

  // Try the coarsest (cheapest) sub-lane granularity first.
  for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
    if (SDValue Shuffle = ShuffleSubLanes(Scale))
      return Shuffle;

  return SDValue();
}
15968
                                   bool &ForceV1Zero, bool &ForceV2Zero,
                                   unsigned &ShuffleImm, ArrayRef<int> Mask,
                                   const APInt &Zeroable) {
  int NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() == 64 &&
         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
         "Unexpected data type for VSHUFPD");
  assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
         "Illegal shuffle mask");

  // ZeroLane[k] is true when every element landing in an even (k=0) or odd
  // (k=1) destination slot is zeroable.
  bool ZeroLane[2] = { true, true };
  for (int i = 0; i < NumElts; ++i)
    ZeroLane[i & 1] &= Zeroable[i];

  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
  // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
  bool IsSHUFPD = true;
  bool IsCommutable = true;
  SmallVector<int, 8> SHUFPDMask(NumElts, -1);
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
      continue;
    if (Mask[i] < 0)
      return false;
    // SHUFPD picks even dest elements from the first operand and odd dest
    // elements from the second; each picks one of an adjacent pair. Val is
    // the base of the legal pair for the given operand order, CommutVal for
    // the swapped order.
    int Val = (i & 6) + NumElts * (i & 1);
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      IsSHUFPD = false;
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
      IsCommutable = false;
    // Immediate bit: which element of the pair was selected.
    SHUFPDMask[i] = Mask[i] % 2;
  }

  if (!IsSHUFPD && !IsCommutable)
    return false;

  // Only matched with the operands swapped - commute them for the caller.
  if (!IsSHUFPD && IsCommutable)
    std::swap(V1, V2);

  ForceV1Zero = ZeroLane[0];
  ForceV2Zero = ZeroLane[1];
  ShuffleImm = getSHUFPDImm(SHUFPDMask);
  return true;
}
16014
                                      SDValue V2, ArrayRef<int> Mask,
                                      const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
         "Unexpected data type for VSHUFPD");

  // Match the shuffle against a SHUFPD pattern; this may commute V1/V2 and
  // fills in the immediate plus which operands must be forced to zero.
  unsigned Immediate = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
                              Mask, Zeroable))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                     DAG.getTargetConstant(Immediate, DL, MVT::i8));
}
16038
16039// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16040// by zeroable elements in the remaining 24 elements. Turn this into two
16041// vmovqb instructions shuffled together.
// Lower a v32i8 shuffle taking every 8th byte (with the rest zeroable) as two
// VTRUNC (vpmovqb-style) truncations combined with an unpack, then widened
// back to v32i8 by insertion into a zero vector.
// NOTE(review): continuation of the function — the opening signature line is
// not visible in this extract.
16043 SDValue V1, SDValue V2,
16044 ArrayRef<int> Mask,
16045 const APInt &Zeroable,
16046 SelectionDAG &DAG) {
16047 assert(VT == MVT::v32i8 && "Unexpected type!");
16048
16049 // The first 8 indices should be every 8th element.
16050 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16051 return SDValue();
16052
16053 // Remaining elements need to be zeroable.
16054 if (Zeroable.countl_one() < (Mask.size() - 8))
16055 return SDValue();
16056
// Reinterpret both inputs as v4i64 so VTRUNC narrows each 64-bit element to
// a byte.
16057 V1 = DAG.getBitcast(MVT::v4i64, V1);
16058 V2 = DAG.getBitcast(MVT::v4i64, V2);
16059
16060 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16061 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16062
16063 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16064 // the upper bits of the result using an unpckldq.
16065 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16066 { 0, 1, 2, 3, 16, 17, 18, 19,
16067 4, 5, 6, 7, 20, 21, 22, 23 });
16068 // Insert the unpckldq into a zero vector to widen to v32i8.
16069 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16070 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16071 DAG.getVectorIdxConstant(0, DL));
16072}
16073
16074// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16075// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16076// =>
16077// ul = unpckl v1, v2
16078// uh = unpckh v1, v2
16079// a = vperm ul, uh
16080// b = vperm ul, uh
16081//
16082// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16083// and permute. We cannot directly match v3 because it is split into two
16084// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16085// pair of 256-bit shuffles and makes sure the masks are consecutive.
16086//
16087// Once unpck and permute nodes are created, the permute corresponding to this
16088// shuffle is returned, while the other permute replaces the other half of the
16089// shuffle in the selection dag.
// Match a PAIR of 256-bit shuffles that together form an interleave of V1 and
// V2, and lower both to UNPCKL/UNPCKH + VPERM2X128 (see the block comment
// above). Returns the permute corresponding to THIS shuffle; the sibling
// shuffle's uses are replaced in place via ReplaceAllUsesWith.
// NOTE(review): continuation of the function — the opening signature line is
// not visible in this extract.
16091 SDValue V1, SDValue V2,
16092 ArrayRef<int> Mask,
16093 SelectionDAG &DAG) {
// Only 256-bit types with 32/16/8-bit elements (plus f32) participate.
16094 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16095 VT != MVT::v32i8)
16096 return SDValue();
16097 // <B0, B1, B0+1, B1+1, ..., >
// Returns true when Mask alternates elements Begin0+k and Begin1+k, i.e. it
// interleaves two sequential runs starting at Begin0 and Begin1.
16098 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16099 unsigned Begin1) {
16100 size_t Size = Mask.size();
16101 assert(Size % 2 == 0 && "Expected even mask size");
16102 for (unsigned I = 0; I < Size; I += 2) {
16103 if (Mask[I] != (int)(Begin0 + I / 2) ||
16104 Mask[I + 1] != (int)(Begin1 + I / 2))
16105 return false;
16106 }
16107 return true;
16108 };
16109 // Check which half is this shuffle node
16110 int NumElts = VT.getVectorNumElements();
16111 size_t FirstQtr = NumElts / 2;
16112 size_t ThirdQtr = NumElts + NumElts / 2;
16113 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16114 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16115 if (!IsFirstHalf && !IsSecondHalf)
16116 return SDValue();
16117
16118 // Find the intersection between shuffle users of V1 and V2.
16119 SmallVector<SDNode *, 2> Shuffles;
16120 for (SDNode *User : V1->users())
16121 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16122 User->getOperand(1) == V2)
16123 Shuffles.push_back(User);
16124 // Limit user size to two for now.
16125 if (Shuffles.size() != 2)
16126 return SDValue();
16127 // Find out which half of the 512-bit shuffles is each smaller shuffle
16128 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16129 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16130 SDNode *FirstHalf;
16131 SDNode *SecondHalf;
// The two user shuffles must be exactly one "low half" and one "high half"
// interleave; bail out otherwise.
16132 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16133 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16134 FirstHalf = Shuffles[0];
16135 SecondHalf = Shuffles[1];
16136 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16137 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16138 FirstHalf = Shuffles[1];
16139 SecondHalf = Shuffles[0];
16140 } else {
16141 return SDValue();
16142 }
16143 // Lower into unpck and perm. Return the perm of this shuffle and replace
16144 // the other.
16145 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16146 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
// VPERM2X128 imm 0x20 selects the low 128-bit lanes of both inputs,
// 0x31 selects the high 128-bit lanes.
16147 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16148 DAG.getTargetConstant(0x20, DL, MVT::i8));
16149 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16150 DAG.getTargetConstant(0x31, DL, MVT::i8));
16151 if (IsFirstHalf) {
16152 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16153 return Perm1;
16154 }
16155 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16156 return Perm2;
16157}
16158
16159/// Handle lowering of 4-lane 64-bit floating point shuffles.
16160///
16161/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16162/// isn't available.
// Lowering strategy for v4f64: try increasingly general techniques, returning
// the first that matches (128-bit subvector shuffle, broadcast, MOVDDUP,
// VPERMILPD, unpack, blend, SHUFPD, decomposed merges, VEXPAND, then generic
// split/blend fallback).
// NOTE(review): continuation of lowerV4F64Shuffle — the opening signature
// line is not visible in this extract.
16164 const APInt &Zeroable, SDValue V1, SDValue V2,
16165 const X86Subtarget &Subtarget,
16166 SelectionDAG &DAG) {
16167 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16168 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16169 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16170
16171 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16172 Subtarget, DAG))
16173 return V;
16174
16175 if (V2.isUndef()) {
16176 // Check for being able to broadcast a single element.
16177 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16178 Mask, Subtarget, DAG))
16179 return Broadcast;
16180
16181 // Use low duplicate instructions for masks that match their pattern.
16182 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16183 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16184
16185 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16186 // Non-half-crossing single input shuffles can be lowered with an
16187 // interleaved permutation.
// Build the VPERMILPD immediate: bit i is set when element i takes the
// high double of its 128-bit lane.
16188 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16189 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16190 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16191 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16192 }
16193
16194 // With AVX2 we have direct support for this permutation.
16195 if (Subtarget.hasAVX2())
16196 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16197 getV4X86ShuffleImm8ForMask(Mask, DL, DAG))
16198
16199 // Try to create an in-lane repeating shuffle mask and then shuffle the
16200 // results into the target lanes.
16202 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16203 return V;
16204
16205 // Try to permute the lanes and then use a per-lane permute.
16206 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16207 Mask, DAG, Subtarget))
16208 return V;
16209
16210 // Otherwise, fall back.
16211 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16212 DAG, Subtarget);
16213 }
16214
16215 // Use dedicated unpack instructions for masks that match their pattern.
16216 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16217 return V;
16218
16219 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16220 Zeroable, Subtarget, DAG))
16221 return Blend;
16222
16223 // Check if the blend happens to exactly fit that of SHUFPD.
16224 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16225 Zeroable, Subtarget, DAG))
16226 return Op;
16227
// Precompute which inputs are already positioned (in place) or are splats;
// these determine whether lane-permute/merge strategies are profitable.
16228 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16229 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16230 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16231 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16232
16233 // If we have lane crossing shuffles AND they don't all come from the lower
16234 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16235 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16236 // canonicalize to a blend of splat which isn't necessary for this combine.
16237 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16238 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16239 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16240 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16241 (!Subtarget.hasAVX2() ||
16242 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16243 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16244
16245 // If we have one input in place, then we can permute the other input and
16246 // blend the result.
16247 if (V1IsInPlace || V2IsInPlace)
16248 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16249 Zeroable, Subtarget, DAG);
16250
16251 // Try to create an in-lane repeating shuffle mask and then shuffle the
16252 // results into the target lanes.
16254 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16255 return V;
16256
16257 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16258 // shuffle. However, if we have AVX2 and either inputs are already in place,
16259 // we will be able to shuffle even across lanes the other input in a single
16260 // instruction so skip this pattern.
16261 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16263 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16264 return V;
16265
16266 // If we have VLX support, we can use VEXPAND.
16267 if (Subtarget.hasVLX())
16268 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16269 Zeroable, Subtarget, DAG))
16270 return V;
16271
16272 // If we have AVX2 then we always want to lower with a blend because an v4 we
16273 // can fully permute the elements.
16274 if (Subtarget.hasAVX2())
16275 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16276 Zeroable, Subtarget, DAG);
16277
16278 // Otherwise fall back on generic lowering.
16279 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16280 Subtarget, DAG);
16281}
16282
16283/// Handle lowering of 4-lane 64-bit integer shuffles.
16284///
16285/// This routine is only called when we have AVX2 and thus a reasonable
16286/// instruction set for v4i64 shuffling..
// Lowering strategy for v4i64 (AVX2+): try 128-bit subvector shuffle, blend,
// broadcast, shifts, PSHUFD/VPERMQ for single-input, VALIGN/VEXPAND (VLX),
// PALIGNR, unpack, then decomposed-merge fallbacks.
// NOTE(review): continuation of lowerV4I64Shuffle — the opening signature
// line is not visible in this extract.
16288 const APInt &Zeroable, SDValue V1, SDValue V2,
16289 const X86Subtarget &Subtarget,
16290 SelectionDAG &DAG) {
16291 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16292 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16293 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16294 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16295
16296 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16297 Subtarget, DAG))
16298 return V;
16299
16300 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16301 Zeroable, Subtarget, DAG))
16302 return Blend;
16303
16304 // Check for being able to broadcast a single element.
16305 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16306 Subtarget, DAG))
16307 return Broadcast;
16308
16309 // Try to use shift instructions if fast.
16310 if (Subtarget.preferLowerShuffleAsShift())
16311 if (SDValue Shift =
16312 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16313 Subtarget, DAG, /*BitwiseOnly*/ true))
16314 return Shift;
16315
16316 if (V2.isUndef()) {
16317 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16318 // can use lower latency instructions that will operate on both lanes.
16319 SmallVector<int, 2> RepeatedMask;
16320 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
// Widen the 2-element repeated mask to a 4-element i32 mask so the
// shuffle can be done as a PSHUFD on v8i32.
16321 SmallVector<int, 4> PSHUFDMask;
16322 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16323 return DAG.getBitcast(
16324 MVT::v4i64,
16325 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16326 DAG.getBitcast(MVT::v8i32, V1),
16327 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16328 }
16329
16330 // AVX2 provides a direct instruction for permuting a single input across
16331 // lanes.
16332 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16333 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16334 }
16335
16336 // Try to use shift instructions.
16337 if (SDValue Shift =
16338 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16339 DAG, /*BitwiseOnly*/ false))
16340 return Shift;
16341
16342 // If we have VLX support, we can use VALIGN or VEXPAND.
16343 if (Subtarget.hasVLX()) {
16344 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16345 Zeroable, Subtarget, DAG))
16346 return Rotate;
16347
16348 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16349 Zeroable, Subtarget, DAG))
16350 return V;
16351 }
16352
16353 // Try to use PALIGNR.
16354 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16355 Subtarget, DAG))
16356 return Rotate;
16357
16358 // Use dedicated unpack instructions for masks that match their pattern.
16359 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16360 return V;
16361
16362 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16363 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16364
16365 // If we have one input in place, then we can permute the other input and
16366 // blend the result.
16367 if (V1IsInPlace || V2IsInPlace)
16368 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16369 Zeroable, Subtarget, DAG);
16370
16371 // Try to create an in-lane repeating shuffle mask and then shuffle the
16372 // results into the target lanes.
16374 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16375 return V;
16376
16377 // Try to lower to PERMQ(BLENDD(V1,V2)).
16378 if (SDValue V =
16379 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16380 return V;
16381
16382 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16383 // shuffle. However, if we have AVX2 and either inputs are already in place,
16384 // we will be able to shuffle even across lanes the other input in a single
16385 // instruction so skip this pattern.
16386 if (!V1IsInPlace && !V2IsInPlace)
16388 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16389 return Result;
16390
16391 // Otherwise fall back on generic blend lowering.
16392 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16393 Zeroable, Subtarget, DAG);
16394}
16395
16396/// Handle lowering of 8-lane 32-bit floating point shuffles.
16397///
16398/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16399/// isn't available.
// Lowering strategy for v8f32: blend, broadcast, early split on AVX1 when the
// in-lane mask isn't 128-bit-repeated, zext, repeated-lane idioms
// (MOVSLDUP/MOVSHDUP/VPERMILPS/unpack/SHUFPS), then variable permutes and
// generic fallbacks.
// NOTE(review): continuation of lowerV8F32Shuffle — the opening signature
// line is not visible in this extract.
16401 const APInt &Zeroable, SDValue V1, SDValue V2,
16402 const X86Subtarget &Subtarget,
16403 SelectionDAG &DAG) {
16404 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16405 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16406 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16407
16408 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16409 Zeroable, Subtarget, DAG))
16410 return Blend;
16411
16412 // Check for being able to broadcast a single element.
16413 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16414 Subtarget, DAG))
16415 return Broadcast;
16416
// Without AVX2 there are no cross-lane variable permutes, so prefer an early
// 128-bit split when the per-lane mask doesn't repeat across lanes.
16417 if (!Subtarget.hasAVX2()) {
16418 SmallVector<int> InLaneMask;
16419 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16420
16421 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16422 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16423 /*SimpleOnly*/ true))
16424 return R;
16425 }
// Note: the zext lowering is matched on the integer type and bitcast back.
16426 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16427 Zeroable, Subtarget, DAG))
16428 return DAG.getBitcast(MVT::v8f32, ZExt);
16429
16430 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16431 // options to efficiently lower the shuffle.
16432 SmallVector<int, 4> RepeatedMask;
16433 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16434 assert(RepeatedMask.size() == 4 &&
16435 "Repeated masks must be half the mask width!");
16436
16437 // Use even/odd duplicate instructions for masks that match their pattern.
16438 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16439 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16440 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16441 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16442
16443 if (V2.isUndef())
16444 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16445 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16446
16447 // Use dedicated unpack instructions for masks that match their pattern.
16448 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16449 return V;
16450
16451 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16452 // have already handled any direct blends.
16453 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16454 }
16455
16456 // Try to create an in-lane repeating shuffle mask and then shuffle the
16457 // results into the target lanes.
16459 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16460 return V;
16461
16462 // If we have a single input shuffle with different shuffle patterns in the
16463 // two 128-bit lanes use the variable mask to VPERMILPS.
16464 if (V2.isUndef()) {
16465 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16466 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16467 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16468 }
16469 if (Subtarget.hasAVX2()) {
16470 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16471 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16472 }
16473 // Otherwise, fall back.
16474 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16475 DAG, Subtarget);
16476 }
16477
16478 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16479 // shuffle.
16481 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16482 return Result;
16483
16484 // If we have VLX support, we can use VEXPAND.
16485 if (Subtarget.hasVLX())
16486 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16487 Zeroable, Subtarget, DAG))
16488 return V;
16489
16490 // Try to match an interleave of two v8f32s and lower them as unpck and
16491 // permutes using ymms. This needs to go before we try to split the vectors.
16492 //
16493 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
16494 // this path inadvertently.
16495 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16496 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16497 Mask, DAG))
16498 return V;
16499
16500 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
16501 // since after split we get a more efficient code using vpunpcklwd and
16502 // vpunpckhwd instrs than vblend.
16503 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16504 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16505 Subtarget, DAG);
16506
16507 // If we have AVX2 then we always want to lower with a blend because at v8 we
16508 // can fully permute the elements.
16509 if (Subtarget.hasAVX2())
16510 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16511 Zeroable, Subtarget, DAG);
16512
16513 // Otherwise fall back on generic lowering.
16514 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16515 Subtarget, DAG);
16516}
16517
16518/// Handle lowering of 8-lane 32-bit integer shuffles.
16519///
16520/// This routine is only called when we have AVX2 and thus a reasonable
16521/// instruction set for v8i32 shuffling..
// Lowering strategy for v8i32 (AVX2+): zext, paired UNPCK+permute, wd-unpack
// split, blend, broadcast, shifts/rotates, repeated-lane PSHUFD/unpack,
// VALIGN/VEXPAND (VLX), byte rotate, cross-lane VPERMD, SHUFPS via bitcast,
// then decomposed-merge fallback.
// NOTE(review): continuation of lowerV8I32Shuffle — the opening signature
// line is not visible in this extract.
16523 const APInt &Zeroable, SDValue V1, SDValue V2,
16524 const X86Subtarget &Subtarget,
16525 SelectionDAG &DAG) {
16526 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16527 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16528 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16529 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16530
// Count how many mask elements reference V2 (indices 8..15).
16531 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16532
16533 // Whenever we can lower this as a zext, that instruction is strictly faster
16534 // than any alternative. It also allows us to fold memory operands into the
16535 // shuffle in many cases.
16536 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16537 Zeroable, Subtarget, DAG))
16538 return ZExt;
16539
16540 // Try to match an interleave of two v8i32s and lower them as unpck and
16541 // permutes using ymms. This needs to go before we try to split the vectors.
16542 if (!Subtarget.hasAVX512())
16543 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16544 Mask, DAG))
16545 return V;
16546
16547 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
16548 // since after split we get a more efficient code than vblend by using
16549 // vpunpcklwd and vpunpckhwd instrs.
16550 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16551 !Subtarget.hasAVX512())
16552 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16553 Subtarget, DAG);
16554
16555 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16556 Zeroable, Subtarget, DAG))
16557 return Blend;
16558
16559 // Check for being able to broadcast a single element.
16560 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16561 Subtarget, DAG))
16562 return Broadcast;
16563
16564 // Try to use shift instructions if fast.
16565 if (Subtarget.preferLowerShuffleAsShift()) {
16566 if (SDValue Shift =
16567 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16568 Subtarget, DAG, /*BitwiseOnly*/ true))
16569 return Shift;
// Bit rotation only applies to single-input shuffles.
16570 if (NumV2Elements == 0)
16571 if (SDValue Rotate =
16572 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16573 return Rotate;
16574 }
16575
16576 // If the shuffle mask is repeated in each 128-bit lane we can use more
16577 // efficient instructions that mirror the shuffles across the two 128-bit
16578 // lanes.
16579 SmallVector<int, 4> RepeatedMask;
16580 bool Is128BitLaneRepeatedShuffle =
16581 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16582 if (Is128BitLaneRepeatedShuffle) {
16583 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16584 if (V2.isUndef())
16585 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16586 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16587
16588 // Use dedicated unpack instructions for masks that match their pattern.
16589 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16590 return V;
16591 }
16592
16593 // Try to use shift instructions.
16594 if (SDValue Shift =
16595 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16596 DAG, /*BitwiseOnly*/ false))
16597 return Shift;
16598
16599 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16600 if (SDValue Rotate =
16601 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16602 return Rotate;
16603
16604 // If we have VLX support, we can use VALIGN or EXPAND.
16605 if (Subtarget.hasVLX()) {
16606 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16607 Zeroable, Subtarget, DAG))
16608 return Rotate;
16609
16610 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16611 Zeroable, Subtarget, DAG))
16612 return V;
16613 }
16614
16615 // Try to use byte rotation instructions.
16616 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16617 Subtarget, DAG))
16618 return Rotate;
16619
16620 // Try to create an in-lane repeating shuffle mask and then shuffle the
16621 // results into the target lanes.
16623 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16624 return V;
16625
16626 if (V2.isUndef()) {
16627 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16628 // because that should be faster than the variable permute alternatives.
16629 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16630 return V;
16631
16632 // If the shuffle patterns aren't repeated but it's a single input, directly
16633 // generate a cross-lane VPERMD instruction.
16634 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16635 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16636 }
16637
16638 // Assume that a single SHUFPS is faster than an alternative sequence of
16639 // multiple instructions (even if the CPU has a domain penalty).
16640 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16641 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16642 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16643 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16644 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16645 CastV1, CastV2, DAG);
16646 return DAG.getBitcast(MVT::v8i32, ShufPS);
16647 }
16648
16649 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16650 // shuffle.
16652 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16653 return Result;
16654
16655 // Otherwise fall back on generic blend lowering.
16656 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16657 Zeroable, Subtarget, DAG);
16658}
16659
16660/// Handle lowering of 16-lane 16-bit integer shuffles.
16661///
16662/// This routine is only called when we have AVX2 and thus a reasonable
16663/// instruction set for v16i16 shuffling..
// Lowering strategy for v16i16 (AVX2+): zext, broadcast, blend, unpack, pack,
// VTRUNC, shifts, byte/bit rotates, single-input lane tricks (repeated-mask
// v8i16 lowering), PSHUFB, VPERMW (BWI), then lane-merge / split fallbacks.
// NOTE(review): continuation of lowerV16I16Shuffle — the opening signature
// line is not visible in this extract.
16665 const APInt &Zeroable, SDValue V1, SDValue V2,
16666 const X86Subtarget &Subtarget,
16667 SelectionDAG &DAG) {
16668 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16669 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16670 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16671 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16672
16673 // Whenever we can lower this as a zext, that instruction is strictly faster
16674 // than any alternative. It also allows us to fold memory operands into the
16675 // shuffle in many cases.
16677 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16678 return ZExt;
16679
16680 // Check for being able to broadcast a single element.
16681 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16682 Subtarget, DAG))
16683 return Broadcast;
16684
16685 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16686 Zeroable, Subtarget, DAG))
16687 return Blend;
16688
16689 // Use dedicated unpack instructions for masks that match their pattern.
16690 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16691 return V;
16692
16693 // Use dedicated pack instructions for masks that match their pattern.
16694 if (SDValue V =
16695 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16696 return V;
16697
16698 // Try to use lower using a truncation.
16699 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16700 Subtarget, DAG))
16701 return V;
16702
16703 // Try to use shift instructions.
16704 if (SDValue Shift =
16705 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16706 Subtarget, DAG, /*BitwiseOnly*/ false))
16707 return Shift;
16708
16709 // Try to use byte rotation instructions.
16710 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16711 Subtarget, DAG))
16712 return Rotate;
16713
16714 // Try to create an in-lane repeating shuffle mask and then shuffle the
16715 // results into the target lanes.
16717 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16718 return V;
16719
// Single-input-only strategies.
16720 if (V2.isUndef()) {
16721 // Try to use bit rotation instructions.
16722 if (SDValue Rotate =
16723 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16724 return Rotate;
16725
16726 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16727 // because that should be faster than the variable permute alternatives.
16728 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
16729 return V;
16730
16731 // There are no generalized cross-lane shuffle operations available on i16
16732 // element types.
16733 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16735 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16736 return V;
16737
16738 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16739 DAG, Subtarget);
16740 }
16741
16742 SmallVector<int, 8> RepeatedMask;
16743 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16744 // As this is a single-input shuffle, the repeated mask should be
16745 // a strictly valid v8i16 mask that we can pass through to the v8i16
16746 // lowering to handle even the v16 case.
16748 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16749 }
16750 }
16751
16752 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16753 Zeroable, Subtarget, DAG))
16754 return PSHUFB;
16755
16756 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16757 if (Subtarget.hasBWI())
16758 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16759
16760 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16761 // shuffle.
16763 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16764 return Result;
16765
16766 // Try to permute the lanes and then use a per-lane permute.
16768 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16769 return V;
16770
16771 // Try to match an interleave of two v16i16s and lower them as unpck and
16772 // permutes using ymms.
16773 if (!Subtarget.hasAVX512())
16774 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16775 Mask, DAG))
16776 return V;
16777
16778 // Otherwise fall back on generic lowering.
16779 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16780 Subtarget, DAG);
16781}
16782
16783/// Handle lowering of 32-lane 8-bit integer shuffles.
16784///
16785/// This routine is only called when we have AVX2 and thus a reasonable
16786/// instruction set for v32i8 shuffling..
// NOTE(review): this listing was extracted from generated HTML; gaps in the
// embedded original line numbers (16787, 16845, 16857, 16875, 16880) mark
// dropped lines — the function signature and the hyperlinked callee names of
// several `if (SDValue V = ...)` guards. Restore them from upstream
// X86ISelLowering.cpp before building; do not guess the missing identifiers.
// Each visible helper below returns a null SDValue when its pattern does not
// match, so the strategies are tried in order from cheapest to most general.
16788 const APInt &Zeroable, SDValue V1, SDValue V2,
16789 const X86Subtarget &Subtarget,
16790 SelectionDAG &DAG) {
16791 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16792 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16793 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16794 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16795
16796 // Whenever we can lower this as a zext, that instruction is strictly faster
16797 // than any alternative. It also allows us to fold memory operands into the
16798 // shuffle in many cases.
16799 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16800 Zeroable, Subtarget, DAG))
16801 return ZExt;
16802
16803 // Check for being able to broadcast a single element.
16804 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16805 Subtarget, DAG))
16806 return Broadcast;
16807
16808 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16809 Zeroable, Subtarget, DAG))
16810 return Blend;
16811
16812 // Use dedicated unpack instructions for masks that match their pattern.
16813 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
16814 return V;
16815
16816 // Use dedicated pack instructions for masks that match their pattern.
16817 if (SDValue V =
16818 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16819 return V;
16820
16821 // Try to use lower using a truncation.
16822 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16823 Subtarget, DAG))
16824 return V;
16825
16826 // Try to use shift instructions.
16827 if (SDValue Shift =
16828 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16829 DAG, /*BitwiseOnly*/ false))
16830 return Shift;
16831
16832 // Try to use byte rotation instructions.
16833 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16834 Subtarget, DAG))
16835 return Rotate;
16836
16837 // Try to use bit rotation instructions.
16838 if (V2.isUndef())
16839 if (SDValue Rotate =
16840 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16841 return Rotate;
16842
16843 // Try to create an in-lane repeating shuffle mask and then shuffle the
16844 // results into the target lanes.
16846 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16847 return V;
16848
16849 // There are no generalized cross-lane shuffle operations available on i8
16850 // element types.
16851 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16852 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16853 // because that should be faster than the variable permute alternatives.
16854 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
16855 return V;
16856
16858 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16859 return V;
16860
16861 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16862 DAG, Subtarget);
16863 }
16864
16865 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16866 Zeroable, Subtarget, DAG))
16867 return PSHUFB;
16868
16869 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16870 if (Subtarget.hasVBMI())
16871 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16872
16873 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16874 // shuffle.
16876 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16877 return Result;
16878
16879 // Try to permute the lanes and then use a per-lane permute.
16881 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16882 return V;
16883
16884 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16885 // by zeroable elements in the remaining 24 elements. Turn this into two
16886 // vmovqb instructions shuffled together.
16887 if (Subtarget.hasVLX())
16888 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16889 Mask, Zeroable, DAG))
16890 return V;
16891
16892 // Try to match an interleave of two v32i8s and lower them as unpck and
16893 // permutes using ymms.
16894 if (!Subtarget.hasAVX512())
16895 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16896 Mask, DAG))
16897 return V;
16898
16899 // Otherwise fall back on generic lowering.
16900 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16901 Subtarget, DAG);
16902}
16903
16904/// High-level routine to lower various 256-bit x86 vector shuffles.
16905///
16906/// This routine either breaks down the specific type of a 256-bit x86 vector
16907/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16908/// together based on the available instructions.
// NOTE(review): HTML-extraction artifact — original lines 16909 (function
// signature), 16919 (the insertion-helper callee) and 16948 (the second
// argument of MVT::getVectorVT, presumably the element count) were dropped;
// restore from upstream before building.
16910 SDValue V1, SDValue V2, const APInt &Zeroable,
16911 const X86Subtarget &Subtarget,
16912 SelectionDAG &DAG) {
16913 // If we have a single input to the zero element, insert that into V1 if we
16914 // can do so cheaply.
16915 int NumElts = VT.getVectorNumElements();
16916 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16917
16918 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16920 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16921 return Insertion;
16922
16923 // Handle special cases where the lower or upper half is UNDEF.
16924 if (SDValue V =
16925 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16926 return V;
16927
16928 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16929 // can check for those subtargets here and avoid much of the subtarget
16930 // querying in the per-vector-type lowering routines. With AVX1 we have
16931 // essentially *zero* ability to manipulate a 256-bit vector with integer
16932 // types. Since we'll use floating point types there eventually, just
16933 // immediately cast everything to a float and operate entirely in that domain.
16934 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16935 int ElementBits = VT.getScalarSizeInBits();
16936 if (ElementBits < 32) {
16937 // No floating point type available, if we can't use the bit operations
16938 // for masking/blending then decompose into 128-bit vectors.
16939 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16940 Subtarget, DAG))
16941 return V;
16942 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16943 return V;
16944 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16945 }
16946
16947 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16949 V1 = DAG.getBitcast(FpVT, V1);
16950 V2 = DAG.getBitcast(FpVT, V2);
16951 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16952 }
16953
// Half-float / bfloat16 vectors have no shuffle instructions of their own;
// reuse the v16i16 lowering via bitcasts.
16954 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16955 V1 = DAG.getBitcast(MVT::v16i16, V1);
16956 V2 = DAG.getBitcast(MVT::v16i16, V2);
16957 return DAG.getBitcast(VT,
16958 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16959 }
16960
16961 switch (VT.SimpleTy) {
16962 case MVT::v4f64:
16963 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16964 case MVT::v4i64:
16965 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16966 case MVT::v8f32:
16967 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16968 case MVT::v8i32:
16969 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16970 case MVT::v16i16:
16971 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16972 case MVT::v32i8:
16973 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16974
16975 default:
16976 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16977 }
16978}
16979
16980/// Try to lower a vector shuffle as a 128-bit shuffles.
// NOTE(review): the function signature (original line 16981) was dropped by
// the HTML extraction; restore it from upstream before building.
// Operates on 512-bit vectors with 64-bit elements, widening the mask to
// four 128-bit "super-elements" and trying, in order: insert-into-zero,
// single 256-bit subvector insert, low-128-bit insert of V2 into V1, and
// finally an X86ISD::SHUF128 (vshuf64x2/vshuf32x4) node.
16982 const APInt &Zeroable, SDValue V1, SDValue V2,
16983 const X86Subtarget &Subtarget,
16984 SelectionDAG &DAG) {
16985 assert(VT.getScalarSizeInBits() == 64 &&
16986 "Unexpected element type size for 128bit shuffle.");
16987
16988 // To handle 256 bit vector requires VLX and most probably
16989 // function lowerV2X128VectorShuffle() is better solution.
16990 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16991
16992 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16993 SmallVector<int, 4> Widened128Mask;
16994 if (!canWidenShuffleElements(Mask, Widened128Mask))
16995 return SDValue();
16996 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16997
16998 // Try to use an insert into a zero vector.
16999 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17000 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17001 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17002 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17003 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17004 DAG.getVectorIdxConstant(0, DL));
17005 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17006 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17007 DAG.getVectorIdxConstant(0, DL));
17008 }
17009
17010 // Check for patterns which can be matched with a single insert of a 256-bit
17011 // subvector.
17012 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17013 if (OnlyUsesV1 ||
17014 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17015 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17016 SDValue SubVec =
17017 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17018 DAG.getVectorIdxConstant(0, DL));
17019 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17020 DAG.getVectorIdxConstant(4, DL));
17021 }
17022
17023 // See if this is an insertion of the lower 128-bits of V2 into V1.
17024 bool IsInsert = true;
17025 int V2Index = -1;
17026 for (int i = 0; i < 4; ++i) {
17027 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17028 if (Widened128Mask[i] < 0)
17029 continue;
17030
17031 // Make sure all V1 subvectors are in place.
17032 if (Widened128Mask[i] < 4) {
17033 if (Widened128Mask[i] != i) {
17034 IsInsert = false;
17035 break;
17036 }
17037 } else {
17038 // Make sure we only have a single V2 index and its the lowest 128-bits.
17039 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17040 IsInsert = false;
17041 break;
17042 }
17043 V2Index = i;
17044 }
17045 }
17046 if (IsInsert && V2Index >= 0) {
17047 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17048 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17049 DAG.getVectorIdxConstant(0, DL));
17050 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17051 }
17052
17053 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
17054 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17055 // possible we at least ensure the lanes stay sequential to help later
17056 // combines.
17057 SmallVector<int, 2> Widened256Mask;
17058 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17059 Widened128Mask.clear();
17060 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17061 }
17062
17063 // Try to lower to vshuf64x2/vshuf32x4.
17064 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17065 int PermMask[4] = {-1, -1, -1, -1};
17066 // Ensure elements came from the same Op.
17067 for (int i = 0; i < 4; ++i) {
17068 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17069 if (Widened128Mask[i] < 0)
17070 continue;
17071
17072 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17073 unsigned OpIndex = i / 2;
17074 if (Ops[OpIndex].isUndef())
17075 Ops[OpIndex] = Op;
17076 else if (Ops[OpIndex] != Op)
17077 return SDValue();
17078
17079 PermMask[i] = Widened128Mask[i] % 4;
17080 }
17081
17082 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17083 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17084}
17085
17086/// Handle lowering of 8-lane 64-bit floating point shuffles.
// NOTE(review): the function signature (original line 17087) was dropped by
// the HTML extraction; restore it from upstream before building.
17088 const APInt &Zeroable, SDValue V1, SDValue V2,
17089 const X86Subtarget &Subtarget,
17090 SelectionDAG &DAG) {
17091 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17092 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17093 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17094
17095 if (V2.isUndef()) {
17096 // Use low duplicate instructions for masks that match their pattern.
17097 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17098 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17099
17100 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17101 // Non-half-crossing single input shuffles can be lowered with an
17102 // interleaved permutation.
// Each mask bit selects the odd element of its 128-bit pair, so the imm8
// encodes "take high element" per position for VPERMILPD.
17103 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17104 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17105 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17106 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17107 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17108 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17109 }
17110
17111 SmallVector<int, 4> RepeatedMask;
17112 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17113 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17114 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17115 }
17116
17117 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17118 V2, Subtarget, DAG))
17119 return Shuf128;
17120
17121 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17122 return Unpck;
17123
17124 // Check if the blend happens to exactly fit that of SHUFPD.
17125 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17126 Zeroable, Subtarget, DAG))
17127 return Op;
17128
17129 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17130 Subtarget, DAG))
17131 return V;
17132
17133 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17134 Zeroable, Subtarget, DAG))
17135 return Blend;
17136
// Fully general fallback: variable permute (VPERMPD/VPERM2PD via PERMV).
17137 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17138}
17139
17140/// Handle lowering of 16-lane 32-bit floating point shuffles.
// NOTE(review): HTML-extraction artifact — original lines 17141 (function
// signature), 17181 and 17187 (hyperlinked callee names of the zext and
// in-lane-repeat guards) were dropped; restore from upstream before building.
17142 const APInt &Zeroable, SDValue V1, SDValue V2,
17143 const X86Subtarget &Subtarget,
17144 SelectionDAG &DAG) {
17145 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17146 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17147 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17148
17149 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17150 // options to efficiently lower the shuffle.
17151 SmallVector<int, 4> RepeatedMask;
17152 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17153 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17154
17155 // Use even/odd duplicate instructions for masks that match their pattern.
17156 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17157 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17158 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17159 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17160
17161 if (V2.isUndef())
17162 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17163 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17164
17165 // Use dedicated unpack instructions for masks that match their pattern.
17166 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17167 return V;
17168
17169 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17170 Zeroable, Subtarget, DAG))
17171 return Blend;
17172
17173 // Otherwise, fall back to a SHUFPS sequence.
17174 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17175 }
17176
17177 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17178 Zeroable, Subtarget, DAG))
17179 return Blend;
17180
// The zext lowering is done on the integer type and bitcast back to f32.
17182 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17183 return DAG.getBitcast(MVT::v16f32, ZExt);
17184
17185 // Try to create an in-lane repeating shuffle mask and then shuffle the
17186 // results into the target lanes.
17188 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17189 return V;
17190
17191 // If we have a single input shuffle with different shuffle patterns in the
17192 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17193 if (V2.isUndef() &&
17194 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17195 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17196 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17197 }
17198
17199 // If we have AVX512F support, we can use VEXPAND.
17200 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17201 Zeroable, Subtarget, DAG))
17202 return V;
17203
17204 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17205}
17206
17207/// Handle lowering of 8-lane 64-bit integer shuffles.
// NOTE(review): the function signature (original line 17208) was dropped by
// the HTML extraction; restore it from upstream before building.
17209 const APInt &Zeroable, SDValue V1, SDValue V2,
17210 const X86Subtarget &Subtarget,
17211 SelectionDAG &DAG) {
17212 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17213 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17214 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17215
17216 // Try to use shift instructions if fast.
17217 if (Subtarget.preferLowerShuffleAsShift())
17218 if (SDValue Shift =
17219 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17220 Subtarget, DAG, /*BitwiseOnly*/ true))
17221 return Shift;
17222
17223 if (V2.isUndef()) {
17224 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17225 // can use lower latency instructions that will operate on all four
17226 // 128-bit lanes.
17227 SmallVector<int, 2> Repeated128Mask;
17228 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17229 SmallVector<int, 4> PSHUFDMask;
// Convert the 2 x i64 repeated mask into a 4 x i32 mask so the shuffle can
// be emitted as a single PSHUFD on the v16i32 bitcast.
17230 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17231 return DAG.getBitcast(
17232 MVT::v8i64,
17233 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17234 DAG.getBitcast(MVT::v16i32, V1),
17235 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17236 }
17237
17238 SmallVector<int, 4> Repeated256Mask;
17239 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17240 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17241 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17242 }
17243
17244 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17245 V2, Subtarget, DAG))
17246 return Shuf128;
17247
17248 // Try to use shift instructions.
17249 if (SDValue Shift =
17250 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17251 DAG, /*BitwiseOnly*/ false))
17252 return Shift;
17253
17254 // Try to use VALIGN.
17255 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17256 Zeroable, Subtarget, DAG))
17257 return Rotate;
17258
17259 // Try to use PALIGNR.
17260 if (Subtarget.hasBWI())
17261 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17262 Subtarget, DAG))
17263 return Rotate;
17264
17265 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17266 return Unpck;
17267
17268 // If we have AVX512F support, we can use VEXPAND.
17269 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17270 Subtarget, DAG))
17271 return V;
17272
17273 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17274 Zeroable, Subtarget, DAG))
17275 return Blend;
17276
17277 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17278}
17279
17280/// Handle lowering of 16-lane 32-bit integer shuffles.
// NOTE(review): HTML-extraction artifact — original lines 17281 (function
// signature), 17294 and 17361 (hyperlinked callee names of the zext and
// in-lane-repeat guards) were dropped; restore from upstream before building.
17282 const APInt &Zeroable, SDValue V1, SDValue V2,
17283 const X86Subtarget &Subtarget,
17284 SelectionDAG &DAG) {
17285 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17286 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17287 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17288
17289 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17290
17291 // Whenever we can lower this as a zext, that instruction is strictly faster
17292 // than any alternative. It also allows us to fold memory operands into the
17293 // shuffle in many cases.
17295 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17296 return ZExt;
17297
17298 // Try to use shift instructions if fast.
17299 if (Subtarget.preferLowerShuffleAsShift()) {
17300 if (SDValue Shift =
17301 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17302 Subtarget, DAG, /*BitwiseOnly*/ true))
17303 return Shift;
17304 if (NumV2Elements == 0)
17305 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17306 Subtarget, DAG))
17307 return Rotate;
17308 }
17309
17310 // If the shuffle mask is repeated in each 128-bit lane we can use more
17311 // efficient instructions that mirror the shuffles across the four 128-bit
17312 // lanes.
17313 SmallVector<int, 4> RepeatedMask;
17314 bool Is128BitLaneRepeatedShuffle =
17315 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17316 if (Is128BitLaneRepeatedShuffle) {
17317 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17318 if (V2.isUndef())
17319 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17320 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17321
17322 // Use dedicated unpack instructions for masks that match their pattern.
17323 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17324 return V;
17325 }
17326
17327 // Try to use shift instructions.
17328 if (SDValue Shift =
17329 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17330 Subtarget, DAG, /*BitwiseOnly*/ false))
17331 return Shift;
17332
17333 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17334 if (SDValue Rotate =
17335 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17336 return Rotate;
17337
17338 // Try to use VALIGN.
17339 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17340 Zeroable, Subtarget, DAG))
17341 return Rotate;
17342
17343 // Try to use byte rotation instructions.
17344 if (Subtarget.hasBWI())
17345 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17346 Subtarget, DAG))
17347 return Rotate;
17348
17349 // Assume that a single SHUFPS is faster than using a permv shuffle.
17350 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17351 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17352 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17353 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17354 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17355 CastV1, CastV2, DAG);
17356 return DAG.getBitcast(MVT::v16i32, ShufPS);
17357 }
17358
17359 // Try to create an in-lane repeating shuffle mask and then shuffle the
17360 // results into the target lanes.
17362 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17363 return V;
17364
17365 // If we have AVX512F support, we can use VEXPAND.
17366 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17367 Zeroable, Subtarget, DAG))
17368 return V;
17369
17370 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17371 Zeroable, Subtarget, DAG))
17372 return Blend;
17373
17374 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17375}
17376
17377/// Handle lowering of 32-lane 16-bit integer shuffles.
// NOTE(review): HTML-extraction artifact — original lines 17378 (function
// signature), 17390 and 17441 (hyperlinked callee names of the zext and
// merge-lanes guards) were dropped; restore from upstream before building.
17379 const APInt &Zeroable, SDValue V1, SDValue V2,
17380 const X86Subtarget &Subtarget,
17381 SelectionDAG &DAG) {
17382 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17383 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17384 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17385 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17386
17387 // Whenever we can lower this as a zext, that instruction is strictly faster
17388 // than any alternative. It also allows us to fold memory operands into the
17389 // shuffle in many cases.
17391 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17392 return ZExt;
17393
17394 // Use dedicated unpack instructions for masks that match their pattern.
17395 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17396 return V;
17397
17398 // Use dedicated pack instructions for masks that match their pattern.
17399 if (SDValue V =
17400 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17401 return V;
17402
17403 // Try to use shift instructions.
17404 if (SDValue Shift =
17405 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17406 Subtarget, DAG, /*BitwiseOnly*/ false))
17407 return Shift;
17408
17409 // Try to use byte rotation instructions.
17410 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17411 Subtarget, DAG))
17412 return Rotate;
17413
17414 if (V2.isUndef()) {
17415 // Try to use bit rotation instructions.
17416 if (SDValue Rotate =
17417 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17418 return Rotate;
17419
17420 SmallVector<int, 8> RepeatedMask;
17421 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17422 // As this is a single-input shuffle, the repeated mask should be
17423 // a strictly valid v8i16 mask that we can pass through to the v8i16
17424 // lowering to handle even the v32 case.
17425 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17426 RepeatedMask, Subtarget, DAG);
17427 }
17428 }
17429
17430 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17431 Zeroable, Subtarget, DAG))
17432 return Blend;
17433
17434 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17435 Zeroable, Subtarget, DAG))
17436 return PSHUFB;
17437
17438 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17439 // shuffle.
17440 if (!V2.isUndef())
17442 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17443 return Result;
17444
17445 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17446}
17447
17448/// Handle lowering of 64-lane 8-bit integer shuffles.
// NOTE(review): HTML-extraction artifact — original lines 17449 (function
// signature), 17461, 17502, 17506 and 17531 (hyperlinked callee names of
// several guards) were dropped; restore from upstream before building.
17450 const APInt &Zeroable, SDValue V1, SDValue V2,
17451 const X86Subtarget &Subtarget,
17452 SelectionDAG &DAG) {
17453 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17454 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17455 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17456 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17457
17458 // Whenever we can lower this as a zext, that instruction is strictly faster
17459 // than any alternative. It also allows us to fold memory operands into the
17460 // shuffle in many cases.
17462 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17463 return ZExt;
17464
17465 // Use dedicated unpack instructions for masks that match their pattern.
17466 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17467 return V;
17468
17469 // Use dedicated pack instructions for masks that match their pattern.
17470 if (SDValue V =
17471 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17472 return V;
17473
17474 // Try to use shift instructions.
17475 if (SDValue Shift =
17476 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17477 DAG, /*BitwiseOnly*/ false))
17478 return Shift;
17479
17480 // Try to use byte rotation instructions.
17481 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17482 Subtarget, DAG))
17483 return Rotate;
17484
17485 // Try to use bit rotation instructions.
17486 if (V2.isUndef())
17487 if (SDValue Rotate =
17488 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17489 return Rotate;
17490
17491 // Lower as AND if possible.
17492 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17493 Zeroable, Subtarget, DAG))
17494 return Masked;
17495
17496 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17497 Zeroable, Subtarget, DAG))
17498 return PSHUFB;
17499
17500 // Try to create an in-lane repeating shuffle mask and then shuffle the
17501 // results into the target lanes.
17503 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17504 return V;
17505
17507 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17508 return Result;
17509
17510 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17511 Zeroable, Subtarget, DAG))
17512 return Blend;
17513
17514 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17515 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17516 // PALIGNR will be cheaper than the second PSHUFB+OR.
17517 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17518 Mask, Subtarget, DAG))
17519 return V;
17520
17521 // If we can't directly blend but can use PSHUFB, that will be better as it
17522 // can both shuffle and set up the inefficient blend.
17523 bool V1InUse, V2InUse;
17524 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17525 DAG, V1InUse, V2InUse);
17526 }
17527
17528 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17529 // shuffle.
17530 if (!V2.isUndef())
17532 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17533 return Result;
17534
17535 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17536 if (Subtarget.hasVBMI())
17537 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17538
// Without VBMI there is no full v64i8 variable permute: split into 256-bit
// halves and lower each recursively.
17539 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17540}
17541
17542/// High-level routine to lower various 512-bit x86 vector shuffles.
17543///
17544/// This routine either breaks down the specific type of a 512-bit x86 vector
17545/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17546/// together based on the available instructions.
// NOTE(review): HTML-extraction artifact — original lines 17547 (function
// signature) and 17561 (the insertion-helper callee) were dropped; restore
// from upstream before building.
17548 MVT VT, SDValue V1, SDValue V2,
17549 const APInt &Zeroable,
17550 const X86Subtarget &Subtarget,
17551 SelectionDAG &DAG) {
17552 assert(Subtarget.hasAVX512() &&
17553 "Cannot lower 512-bit vectors w/ basic ISA!");
17554
17555 // If we have a single input to the zero element, insert that into V1 if we
17556 // can do so cheaply.
17557 int NumElts = Mask.size();
17558 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17559
17560 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17562 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17563 return Insertion;
17564
17565 // Handle special cases where the lower or upper half is UNDEF.
17566 if (SDValue V =
17567 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17568 return V;
17569
17570 // Check for being able to broadcast a single element.
17571 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17572 Subtarget, DAG))
17573 return Broadcast;
17574
17575 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17576 // Try using bit ops for masking and blending before falling back to
17577 // splitting.
17578 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17579 Subtarget, DAG))
17580 return V;
17581 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17582 return V;
17583
17584 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17585 }
17586
// Half-float / bfloat16 vectors reuse the v32i16 lowering via bitcasts when
// BWI is available, otherwise they are split into 256-bit halves.
17587 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17588 if (!Subtarget.hasBWI())
17589 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17590 /*SimpleOnly*/ false);
17591
17592 V1 = DAG.getBitcast(MVT::v32i16, V1);
17593 V2 = DAG.getBitcast(MVT::v32i16, V2);
17594 return DAG.getBitcast(VT,
17595 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17596 }
17597
17598 // Dispatch to each element type for lowering. If we don't have support for
17599 // specific element type shuffles at 512 bits, immediately split them and
17600 // lower them. Each lowering routine of a given type is allowed to assume that
17601 // the requisite ISA extensions for that element type are available.
17602 switch (VT.SimpleTy) {
17603 case MVT::v8f64:
17604 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17605 case MVT::v16f32:
17606 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17607 case MVT::v8i64:
17608 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17609 case MVT::v16i32:
17610 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17611 case MVT::v32i16:
17612 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17613 case MVT::v64i8:
17614 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17615
17616 default:
17617 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17618 }
17619}
17620
17622 MVT VT, SDValue V1, SDValue V2,
17623 const X86Subtarget &Subtarget,
17624 SelectionDAG &DAG) {
  // Attempt to lower a unary vXi1 shuffle as a single KSHIFTR: every
  // non-undef mask element must read from the same positive offset
  // (Mask[i] == i + ShiftAmt), i.e. the whole mask is shifted right by a
  // constant. Returns an empty SDValue when the pattern does not match.
  // NOTE(review): the opening signature line is elided in this excerpt.
17625 // Shuffle should be unary.
17626 if (!V2.isUndef())
17627 return SDValue();
17628
17629 int ShiftAmt = -1;
17630 int NumElts = Mask.size();
17631 for (int i = 0; i != NumElts; ++i) {
17632 int M = Mask[i];
17633 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17634 "Unexpected mask index.");
17635 if (M < 0)
17636 continue;
17637
17638 // The first non-undef element determines our shift amount.
17639 if (ShiftAmt < 0) {
17640 ShiftAmt = M - i;
17641 // Need to be shifting right.
17642 if (ShiftAmt <= 0)
17643 return SDValue();
17644 }
17645 // All non-undef elements must shift by the same amount.
17646 if (ShiftAmt != M - i)
17647 return SDValue();
17648 }
17649 assert(ShiftAmt >= 0 && "All undef?")/* at least one mask element was defined */;
17650
17651 // Great we found a shift right.
  // Widen the i1 vector to a width with native KSHIFT support, shift, then
  // extract the original-width subvector at index 0. Bits shifted in from
  // the widened high lanes can only land in lanes the mask left undef (any
  // defined lane i requires i + ShiftAmt < NumElts), so no zeroing pass is
  // needed here — the caller tries the zero-filling variant separately.
17652 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17653 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17654 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17655 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17656 DAG.getVectorIdxConstant(0, DL));
17657 }
17658
17659 // Determine if this shuffle can be implemented with a KSHIFT instruction.
17660 // Returns the shift amount if possible or -1 if not. This is a simplified
17661 // version of matchShuffleAsShift.
// MaskOffset selects which input the mask indices are tested against
// (0 for V1, NumElts for V2 — see the caller in lower1BitShuffle).
// Opcode is an out-parameter; it is expected to receive the matched
// KSHIFTL/KSHIFTR opcode — the assignment line is elided in this excerpt,
// confirm against the full source.
17662 static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17663 int MaskOffset, const APInt &Zeroable) {
17664 int Size = Mask.size();
17665
  // For a left shift the low 'Shift' lanes must be zeroable; for a right
  // shift the high 'Shift' lanes must be — those are the lanes the shift
  // fills with zeroes.
17666 auto CheckZeros = [&](int Shift, bool Left) {
17667 for (int j = 0; j < Shift; ++j)
17668 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17669 return false;
17670
17671 return true;
17672 };
17673
  // The remaining Size-Shift lanes must form a sequential run (undefs
  // allowed) starting at Low + MaskOffset in the selected input.
17674 auto MatchShift = [&](int Shift, bool Left) {
17675 unsigned Pos = Left ? Shift : 0;
17676 unsigned Low = Left ? 0 : Shift;
17677 unsigned Len = Size - Shift;
17678 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17679 };
17680
  // Try every shift amount in both directions, smallest first.
17681 for (int Shift = 1; Shift != Size; ++Shift)
17682 for (bool Left : {true, false})
17683 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17685 return Shift;
17686 }
17687
17688 return -1;
17689 }
17690
17691
17692 // Lower vXi1 vector shuffles.
17693 // There is no a dedicated instruction on AVX-512 that shuffles the masks.
17694 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
17695 // vector, shuffle and then truncate it back.
// Before falling back to the extend/shuffle/truncate strategy, several
// cheaper patterns are tried in order: zero-padded subvector extraction,
// KSHIFTR with undef fill, general KSHIFTL/KSHIFTR with zero fill, and
// shuffling the operands of a one-use SETCC instead of its i1 result.
// NOTE(review): the opening signature line is elided in this excerpt.
17697 MVT VT, SDValue V1, SDValue V2,
17698 const APInt &Zeroable,
17699 const X86Subtarget &Subtarget,
17700 SelectionDAG &DAG) {
17701 assert(Subtarget.hasAVX512() &&
17702 "Cannot lower 512-bit vectors w/o basic ISA!");
17703
17704 int NumElts = Mask.size();
17705 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17706
17707 // Try to recognize shuffles that are just padding a subvector with zeros.
  // SubvecElts counts the leading run of identity-mapped (or undef)
  // elements all drawn from one source; Src records that source (0=V1,
  // 1=V2).
17708 int SubvecElts = 0;
17709 int Src = -1;
17710 for (int i = 0; i != NumElts; ++i) {
17711 if (Mask[i] >= 0) {
17712 // Grab the source from the first valid mask. All subsequent elements need
17713 // to use this same source.
17714 if (Src < 0)
17715 Src = Mask[i] / NumElts;
17716 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17717 break;
17718 }
17719
17720 ++SubvecElts;
17721 }
17722 assert(SubvecElts != NumElts && "Identity shuffle?");
17723
17724 // Clip to a power 2.
17725 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17726
17727 // Make sure the number of zeroable bits in the top at least covers the bits
17728 // not covered by the subvector.
17729 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17730 assert(Src >= 0 && "Expected a source!");
17731 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17732 SDValue Extract =
17733 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
17734 DAG.getVectorIdxConstant(0, DL));
  // Insert into an all-zero vector so the upper lanes are guaranteed zero.
17735 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17736 DAG.getConstant(0, DL, VT),
Extract,
17737 DAG.getVectorIdxConstant(0, DL));
17738 }
17739
17740 // Try a simple shift right with undef elements. Later we'll try with zeros.
17741 if (SDValue Shift =
17742 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
17743 return Shift;
17744
17745 // Try to match KSHIFTs.
  // Offset selects which input's indices the matcher tests (0 for V1,
  // NumElts for V2).
17746 unsigned Offset = 0;
17747 for (SDValue V : {V1, V2}) {
17748 unsigned Opcode;
17749 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17750 if (ShiftAmt >= 0) {
17751 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17752 MVT WideVT = Res.getSimpleValueType();
17753 // Widened right shifts need two shifts to ensure we shift in zeroes.
17754 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17755 int WideElts = WideVT.getVectorNumElements();
17756 // Shift left to put the original vector in the MSBs of the new size.
17757 Res =
17758 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17759 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17760 // Increase the shift amount to account for the left shift.
17761 ShiftAmt += WideElts - NumElts;
17762 }
17763
17764 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17765 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17766 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17767 DAG.getVectorIdxConstant(0, DL));
17768 }
17769 Offset += NumElts; // Increment for next iteration.
17770 }
17771
17772 // If we're performing an unary shuffle on a SETCC result, try to shuffle the
17773 // ops instead.
17774 // TODO: What other unary shuffles would benefit from this?
17775 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17776 SDValue Op0 = V1.getOperand(0);
17777 SDValue Op1 = V1.getOperand(1);
17778 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17779 EVT OpVT = Op0.getValueType();
  // Only profitable for >= 32-bit compare operands or broadcasts —
  // presumably because those shuffles are cheap on AVX-512; confirm.
17780 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17781 return DAG.getSetCC(
17782 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17783 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17784 }
17785
  // Fallback: pick a wide integer element type to sign-extend the i1
  // lanes into, shuffle in the SIMD domain, then convert back to a mask.
17786 MVT ExtVT;
17787 switch (VT.SimpleTy) {
17788 default:
17789 llvm_unreachable("Expected a vector of i1 elements");
17790 case MVT::v2i1:
17791 ExtVT = MVT::v2i64;
17792 break;
17793 case MVT::v4i1:
17794 ExtVT = MVT::v4i32;
17795 break;
17796 case MVT::v8i1:
17797 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17798 // shuffle.
17799 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17800 break;
17801 case MVT::v16i1:
17802 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17803 // 256-bit operation available.
17804 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17805 break;
17806 case MVT::v32i1:
17807 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17808 // 256-bit operation available.
17809 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17810 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17811 break;
17812 case MVT::v64i1:
17813 // Fall back to scalarization. FIXME: We can do better if the shuffle
17814 // can be partitioned cleanly.
17815 if (!Subtarget.useBWIRegs())
17816 return SDValue();
17817 ExtVT = MVT::v64i8;
17818 break;
17819 }
17820
17821 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17822 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17823
17824 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17825 // i1 was sign extended we can use X86ISD::CVT2MASK.
  // When a native vector-to-mask conversion exists for this width
  // (BWI for >=32 elements, DQI for <32), compare against zero (SETGT on
  // the sign-extended all-ones lanes); otherwise truncate back to vXi1.
17826 int NumElems = VT.getVectorNumElements();
17827 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17828 (Subtarget.hasDQI() && (NumElems < 32)))
17829 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17830 Shuffle, ISD::SETGT);
17831
17832 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17833 }
17834
17835 /// Helper function that returns true if the shuffle mask should be
17836 /// commuted to improve canonicalization.
/// The tie-break order is: (1) more elements must come from V1 than V2;
/// (2) on a tie, fewer V2 elements in the low half; (3) then a smaller sum
/// of V1 element positions than V2's; (4) then fewer V1 elements at odd
/// positions. Returns true when swapping V1/V2 (and adjusting the mask)
/// would improve any of these in order.
/// NOTE(review): the function signature line is elided in this excerpt.
17838 int NumElements = Mask.size();
17839
17840 int NumV1Elements = 0, NumV2Elements = 0;
17841 for (int M : Mask)
17842 if (M < 0)
17843 continue;
17844 else if (M < NumElements)
17845 ++NumV1Elements;
17846 else
17847 ++NumV2Elements;
17848
17849 // Commute the shuffle as needed such that more elements come from V1 than
17850 // V2. This allows us to match the shuffle pattern strictly on how many
17851 // elements come from V1 without handling the symmetric cases.
17852 if (NumV2Elements > NumV1Elements)
17853 return true;
17854
17855 assert(NumV1Elements > 0 && "No V1 indices");
17856
17857 if (NumV2Elements == 0)
17858 return false;
17859
17860 // When the number of V1 and V2 elements are the same, try to minimize the
17861 // number of uses of V2 in the low half of the vector. When that is tied,
17862 // ensure that the sum of indices for V1 is equal to or lower than the sum
17863 // indices for V2. When those are equal, try to ensure that the number of odd
17864 // indices for V1 is lower than the number of odd indices for V2.
17865 if (NumV1Elements == NumV2Elements) {
17866 int LowV1Elements = 0, LowV2Elements = 0;
17867 for (int M : Mask.slice(0, NumElements / 2))
17868 if (M >= NumElements)
17869 ++LowV2Elements;
17870 else if (M >= 0)
17871 ++LowV1Elements;
17872 if (LowV2Elements > LowV1Elements)
17873 return true;
17874 if (LowV2Elements == LowV1Elements) {
17875 int SumV1Indices = 0, SumV2Indices = 0;
17876 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17877 if (Mask[i] >= NumElements)
17878 SumV2Indices += i;
17879 else if (Mask[i] >= 0)
17880 SumV1Indices += i;
17881 if (SumV2Indices < SumV1Indices)
17882 return true;
17883 if (SumV2Indices == SumV1Indices) {
17884 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17885 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17886 if (Mask[i] >= NumElements)
17887 NumV2OddIndices += i % 2;
17888 else if (Mask[i] >= 0)
17889 NumV1OddIndices += i % 2;
17890 if (NumV2OddIndices < NumV1OddIndices)
17891 return true;
17892 }
17893 }
17894 }
17895
17896 return false;
17897 }
17898
17900 const X86Subtarget &Subtarget) {
  // Returns true when the shuffle input V is a one-use binary/unary op
  // from the allow-list below, making it profitable to keep the shuffle
  // unwidened so it can later fold into an AVX-512 masked operation.
  // Requires AVX-512; i8/i16 elements additionally require BWI and a
  // full 512-bit vector (smaller vectors prefer blendd/blendps/blendpd).
  // NOTE(review): the function signature line is elided in this excerpt.
17901 if (!Subtarget.hasAVX512())
17902 return false;
17903
17904 if (!V.getValueType().isSimple())
17905 return false;
17906
17907 MVT VT = V.getSimpleValueType().getScalarType();
17908 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17909 return false;
17910
17911 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17912 // are preferable to blendw/blendvb/masked-mov.
17913 if ((VT == MVT::i16 || VT == MVT::i8) &&
17914 V.getSimpleValueType().getSizeInBits() < 512)
17915 return false;
17916
17917 auto HasMaskOperation = [&](SDValue V) {
17918 // TODO: Currently we only check limited opcode. We probably extend
17919 // it to all binary operation by checking TLI.isBinOp().
17920 switch (V->getOpcode()) {
17921 default:
17922 return false;
17923 case ISD::ADD:
17924 case ISD::SUB:
17925 case ISD::AND:
17926 case ISD::XOR:
17927 case ISD::OR:
17928 case ISD::SMAX:
17929 case ISD::SMIN:
17930 case ISD::UMAX:
17931 case ISD::UMIN:
17932 case ISD::ABS:
17933 case ISD::SHL:
17934 case ISD::SRL:
17935 case ISD::SRA:
17936 case ISD::MUL:
17937 break;
17938 }
  // Multiple uses would force materializing the unmasked result anyway.
17939 if (!V->hasOneUse())
17940 return false;
17941
17942 return true;
17943 };
17944
17945 if (HasMaskOperation(V))
17946 return true;
17947
17948 return false;
17949 }
17950
17951 // Forward declaration.
// (canonicalizeShuffleMaskWithHorizOp — first lines of the declaration are
// elided in this excerpt.)
17954 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17955 const X86Subtarget &Subtarget);
17956
17957 /// Top-level lowering for x86 vector shuffles.
17958 ///
17959 /// This handles decomposition, canonicalization, and lowering of all x86
17960 /// vector shuffles. Most of the specific lowering strategies are encapsulated
17961 /// above in helper routines. The canonicalization attempts to widen shuffles
17962 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
17963 /// s.t. only one of the two inputs needs to be tested, etc.
/// NOTE(review): the function signature line is elided in this excerpt.
17965 SelectionDAG &DAG) {
17966 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17967 ArrayRef<int> OrigMask = SVOp->getMask();
17968 SDValue V1 = Op.getOperand(0);
17969 SDValue V2 = Op.getOperand(1);
17970 MVT VT = Op.getSimpleValueType();
17971 int NumElements = VT.getVectorNumElements();
17972 SDLoc DL(Op);
17973 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17974
17975 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17976 "Can't lower MMX shuffles");
17977
17978 bool V1IsUndef = V1.isUndef();
17979 bool V2IsUndef = V2.isUndef();
17980 if (V1IsUndef && V2IsUndef)
17981 return DAG.getUNDEF(VT);
17982
17983 // When we create a shuffle node we put the UNDEF node to second operand,
17984 // but in some cases the first operand may be transformed to UNDEF.
17985 // In this case we should just commute the node.
17986 if (V1IsUndef)
17987 return DAG.getCommutedVectorShuffle(*SVOp);
17988
17989 // Check for non-undef masks pointing at an undef vector and make the masks
17990 // undef as well. This makes it easier to match the shuffle based solely on
17991 // the mask.
17992 if (V2IsUndef &&
17993 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17994 SmallVector<int, 8> NewMask(OrigMask);
17995 for (int &M : NewMask)
17996 if (M >= NumElements)
17997 M = -1;
17998 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17999 }
18000
18001 // Check for illegal shuffle mask element index values.
18002 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18003 (void)MaskUpperLimit;
18004 assert(llvm::all_of(OrigMask,
18005 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18006 "Out of bounds shuffle index");
18007
18008 // We actually see shuffles that are entirely re-arrangements of a set of
18009 // zero inputs. This mostly happens while decomposing complex shuffles into
18010 // simple ones. Directly lower these as a buildvector of zeros.
18011 APInt KnownUndef, KnownZero;
18012 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18013
18014 APInt Zeroable = KnownUndef | KnownZero;
18015 if (Zeroable.isAllOnes())
18016 return getZeroVector(VT, Subtarget, DAG, DL);
18017
18018 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18019
18020 // Try to collapse shuffles into using a vector type with fewer elements but
18021 // wider element types. We cap this to not form integers or floating point
18022 // elements wider than 64 bits. It does not seem beneficial to form i128
18023 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
  // Skipped when either input could fold into an AVX-512 masked op
  // (canCombineAsMaskOperation), since widening would break that fold.
18024 SmallVector<int, 16> WidenedMask;
18025 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18026 !canCombineAsMaskOperation(V1, Subtarget) &&
18027 !canCombineAsMaskOperation(V2, Subtarget) &&
18028 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18029 // Shuffle mask widening should not interfere with a broadcast opportunity
18030 // by obfuscating the operands with bitcasts.
18031 // TODO: Avoid lowering directly from this top-level function: make this
18032 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18033 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18034 Subtarget, DAG))
18035 return Broadcast;
18036
  // Double the element width, halve the element count. (The ternary's
  // arm lines selecting the wider FP/integer element type are elided in
  // this excerpt.)
18037 MVT NewEltVT = VT.isFloatingPoint()
18040 int NewNumElts = NumElements / 2;
18041 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18042 // Make sure that the new vector type is legal. For example, v2f64 isn't
18043 // legal on SSE1.
18044 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18045 if (V2IsZero) {
18046 // Modify the new Mask to take all zeros from the all-zero vector.
18047 // Choose indices that are blend-friendly.
18048 bool UsedZeroVector = false;
18049 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18050 "V2's non-undef elements are used?!");
18051 for (int i = 0; i != NewNumElts; ++i)
18052 if (WidenedMask[i] == SM_SentinelZero) {
18053 WidenedMask[i] = i + NewNumElts;
18054 UsedZeroVector = true;
18055 }
18056 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18057 // some elements to be undef.
18058 if (UsedZeroVector)
18059 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18060 }
18061 V1 = DAG.getBitcast(NewVT, V1);
18062 V2 = DAG.getBitcast(NewVT, V2);
18063 return DAG.getBitcast(
18064 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18065 }
18066 }
18067
18068 SmallVector<SDValue> Ops = {V1, V2};
18069 SmallVector<int> Mask(OrigMask);
18070
18071 // Canonicalize the shuffle with any horizontal ops inputs.
18072 // NOTE: This may update Ops and Mask.
  // (The `if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(` line is
  // elided in this excerpt.)
18074 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18075 return DAG.getBitcast(VT, HOp);
18076
18077 V1 = DAG.getBitcast(VT, Ops[0]);
18078 V2 = DAG.getBitcast(VT, Ops[1]);
18079 assert(NumElements == (int)Mask.size() &&
18080 "canonicalizeShuffleMaskWithHorizOp "
18081 "shouldn't alter the shuffle mask size");
18082
18083 // Commute the shuffle if it will improve canonicalization.
  // (The commute-check call and mask commutation lines are elided in this
  // excerpt; on success the mask is commuted and the inputs swapped.)
18086 std::swap(V1, V2);
18087 }
18088
18089 // For each vector width, delegate to a specialized lowering routine.
18090 if (VT.is128BitVector())
18091 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18092
18093 if (VT.is256BitVector())
18094 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18095
18096 if (VT.is512BitVector())
18097 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18098
18099 if (Is1BitVector)
18100 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18101
18102 llvm_unreachable("Unimplemented!");
18103 }
18104
18105 // As legal vpcompress instructions depend on various AVX512 extensions, try to
18106 // convert illegal vector sizes to legal ones to avoid expansion.
// Strategy: 128/256-bit vectors with 32/64-bit elements are widened to a
// 512-bit vector (legal compress under plain AVX512F) and the result
// subvector extracted; small i8/i16 vectors are instead any-extended to
// wider elements, compressed, and truncated back.
// NOTE(review): the function signature line is elided in this excerpt.
18108 SelectionDAG &DAG) {
18109 assert(Subtarget.hasAVX512() &&
18110 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18111
18112 SDLoc DL(Op);
18113 SDValue Vec = Op.getOperand(0);
18114 SDValue Mask = Op.getOperand(1);
18115 SDValue Passthru = Op.getOperand(2);
18116
18117 EVT VecVT = Vec.getValueType();
18118 EVT ElementVT = VecVT.getVectorElementType();
18119 unsigned NumElements = VecVT.getVectorNumElements();
18120 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18121 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18122
18123 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18124 // compressed as 512-bit vectors in AVX512F.
18125 if (NumVecBits != 128 && NumVecBits != 256)
18126 return SDValue();
18127
18128 if (NumElementBits == 32 || NumElementBits == 64) {
18129 unsigned NumLargeElements = 512 / NumElementBits;
18130 MVT LargeVecVT =
18131 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18132 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18133
  // The mask's new lanes must be zero so no extra elements are selected;
  // the vector/passthru's new lanes are never read and can stay undef.
18134 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18135 DAG, DL);
18136 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18137 Subtarget, DAG, DL);
18138 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18139 : widenSubVector(LargeVecVT, Passthru,
18140 /*ZeroNewElements=*/false,
18141 Subtarget, DAG, DL);
18142
18143 SDValue Compressed =
18144 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18145 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18146 DAG.getConstant(0, DL, MVT::i64));
18147 }
18148
18149 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18150 VecVT == MVT::v16i16) {
  // NOTE(review): "LageElementVT" is a typo for "LargeElementVT" (used
  // consistently, so harmless).
18151 MVT LageElementVT = MVT::getIntegerVT(512 / NumElements);
18152 EVT LargeVecVT = MVT::getVectorVT(LageElementVT, NumElements);
18153
  // Any-extend is sufficient: the truncate below discards the undefined
  // high bits.
18154 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18155 Passthru = Passthru.isUndef()
18156 ? DAG.getUNDEF(LargeVecVT)
18157 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18158
18159 SDValue Compressed =
18160 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18161 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18162 }
18163
18164 return SDValue();
18165 }
18166
18167 /// Try to lower a VSELECT instruction to a vector shuffle.
/// Succeeds only when the condition can be turned into a constant shuffle
/// mask; returns an empty SDValue otherwise.
/// NOTE(review): the signature line and the lines that build `Mask` from
/// the condition (18178-18180) are elided in this excerpt.
18169 const X86Subtarget &Subtarget,
18170 SelectionDAG &DAG) {
18171 SDValue Cond = Op.getOperand(0);
18172 SDValue LHS = Op.getOperand(1);
18173 SDValue RHS = Op.getOperand(2);
18174 MVT VT = Op.getSimpleValueType();
18175
18176 // Only non-legal VSELECTs reach this lowering, convert those into generic
18177 // shuffles and re-use the shuffle lowering path for blends.
18181 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18182 }
18183
18184 return SDValue();
18185 }
18186
18187 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  // Lower an ISD::VSELECT: first try a constant-condition shuffle/blend,
  // then handle i1 masks, 512-bit mask conversion, condition-width
  // mismatches, and finally per-type blend legality.
18188 SDValue Cond = Op.getOperand(0);
18189 SDValue LHS = Op.getOperand(1);
18190 SDValue RHS = Op.getOperand(2);
18191
18192 SDLoc dl(Op);
18193 MVT VT = Op.getSimpleValueType();
18194 if (isSoftF16(VT, Subtarget)) {
  // Soft-f16: perform the select on the bit-identical integer type.
  // (The line computing NVT is elided in this excerpt.)
18196 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18197 DAG.getBitcast(NVT, LHS),
18198 DAG.getBitcast(NVT, RHS)));
18199 }
18200
18201 // A vselect where all conditions and data are constants can be optimized into
18202 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  // (The all-constant-operand checks, lines 18203-18205, are elided in this
  // excerpt.)
18206 return SDValue();
18207
18208 // Try to lower this to a blend-style vector shuffle. This can handle all
18209 // constant condition cases.
18210 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18211 return BlendOp;
18212
18213 // If this VSELECT has a vector if i1 as a mask, it will be directly matched
18214 // with patterns on the mask registers on AVX-512.
18215 MVT CondVT = Cond.getSimpleValueType();
18216 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18217 if (CondEltSize == 1)
18218 return Op;
18219
18220 // Variable blends are only legal from SSE4.1 onward.
18221 if (!Subtarget.hasSSE41())
18222 return SDValue();
18223
18224 unsigned EltSize = VT.getScalarSizeInBits();
18225 unsigned NumElts = VT.getVectorNumElements();
18226
18227 // Expand v32i16/v64i8 without BWI.
18228 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18229 return SDValue();
18230
18231 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18232 // into an i1 condition so that we can use the mask-based 512-bit blend
18233 // instructions.
18234 if (VT.getSizeInBits() == 512) {
18235 // Build a mask by testing the condition against zero.
18236 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18237 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18238 DAG.getConstant(0, dl, CondVT),
18239 ISD::SETNE);
18240 // Now return a new VSELECT using the mask.
18241 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18242 }
18243
18244 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18245 if (CondEltSize != EltSize) {
18246 // If we don't have a sign splat, rely on the expansion.
  // A full sign splat means every condition lane is all-0s/all-1s, so
  // resizing the lanes preserves the selection semantics.
18247 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18248 return SDValue();
18249
18250 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18251 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18252 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18253 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18254 }
18255
18256 // v16i16/v32i8 selects without AVX2, if the condition and another operand
18257 // are free to split, then better to split before expanding the
18258 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18259 // TODO: This is very similar to narrowVectorSelect.
18260 // TODO: Add Load splitting to isFreeToSplitVector ?
18261 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18262 !Subtarget.hasXOP()) {
18263 bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
18264 bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
18265 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18266 bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
18267 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18268 if (FreeCond && (FreeLHS || FreeRHS))
18269 return splitVectorOp(Op, DAG, dl);
18270 }
18271
18272 // Only some types will be legal on some subtargets. If we can emit a legal
18273 // VSELECT-matching blend, return Op, and but if we need to expand, return
18274 // a null value.
18275 switch (VT.SimpleTy) {
18276 default:
18277 // Most of the vector types have blends past SSE4.1.
18278 return Op;
18279
18280 case MVT::v32i8:
18281 // The byte blends for AVX vectors were introduced only in AVX2.
18282 if (Subtarget.hasAVX2())
18283 return Op;
18284
18285 return SDValue();
18286
18287 case MVT::v8i16:
18288 case MVT::v16i16: {
18289 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18290 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18291 Cond = DAG.getBitcast(CastVT, Cond);
18292 LHS = DAG.getBitcast(CastVT, LHS);
18293 RHS = DAG.getBitcast(CastVT, RHS);
18294 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18295 return DAG.getBitcast(VT, Select);
18296 }
18297 }
18298 }
18299
18301 MVT VT = Op.getSimpleValueType();
  // SSE4.1 constant-index EXTRACT_VECTOR_ELT lowering: use PEXTRB for i8,
  // EXTRACTPS-friendly forms for f32, and pass i32/i64 through as legal.
  // Returns an empty SDValue to fall back to the generic path.
  // NOTE(review): the function signature line (18300) and the guard
  // condition lines (18307, 18313-18314) are elided in this excerpt.
18302 SDValue Vec = Op.getOperand(0);
18303 SDValue Idx = Op.getOperand(1);
18304 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18305 SDLoc dl(Op);
18306
18308 return SDValue();
18309
18310 if (VT.getSizeInBits() == 8) {
18311 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18312 // we're going to zero extend the register or fold the store.
18315 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18316 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18317 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18318
18319 unsigned IdxVal = Idx->getAsZExtVal();
18320 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18321 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18322 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18323 }
18324
18325 if (VT == MVT::f32) {
18326 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18327 // the result back to FR32 register. It's only worth matching if the
18328 // result has a single use which is a store or a bitcast to i32. And in
18329 // the case of a store, it's not worth it if the index is a constant 0,
18330 // because a MOVSSmr can be used instead, which is smaller and faster.
18331 if (!Op.hasOneUse())
18332 return SDValue();
18333 SDNode *User = *Op.getNode()->user_begin();
18334 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18335 (User->getOpcode() != ISD::BITCAST ||
18336 User->getValueType(0) != MVT::i32))
18337 return SDValue();
18338 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18339 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18340 return DAG.getBitcast(MVT::f32, Extract);
18341 }
18342
  // i32/i64 element extracts are directly legal (PEXTRD/PEXTRQ).
18343 if (VT == MVT::i32 || VT == MVT::i64)
18344 return Op;
18345
18346 return SDValue();
18347 }
18348
18349 /// Extract one bit from mask vector, like v16i1 or v8i1.
18350 /// AVX-512 feature.
/// Constant index 0 is directly legal; other constant indices use a
/// widen + KSHIFTR to move the wanted bit into lane 0. Variable indices
/// force the mask into a SIMD vector first (mask registers cannot be
/// indexed dynamically).
/// NOTE(review): the function signature line is elided in this excerpt.
18352 SDValue Vec = Op.getOperand(0);
18353 SDLoc dl(Vec);
18354 MVT VecVT = Vec.getSimpleValueType();
18355 SDValue Idx = Op.getOperand(1);
18356 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18357 MVT EltVT = Op.getSimpleValueType();
18358
18359 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18360 "Unexpected vector type in ExtractBitFromMaskVector");
18361
18362 // variable index can't be handled in mask registers,
18363 // extend vector to VR512/128
18364 if (!IdxC) {
18365 unsigned NumElts = VecVT.getVectorNumElements();
18366 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
18367 // than extending to 128/256bit.
18368 if (NumElts == 1) {
  // Single-element mask: the "index" is irrelevant; widen and read the
  // whole mask through an integer bitcast.
  // (The line computing IntVT is elided in this excerpt.)
18369 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18372 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18373 }
18374 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18375 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18376 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18377 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18378 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18379 }
18380
18381 unsigned IdxVal = IdxC->getZExtValue();
18382 if (IdxVal == 0) // the operation is legal
18383 return Op;
18384
18385 // Extend to natively supported kshift.
18386 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18387
18388 // Use kshiftr instruction to move to the lower element.
18389 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18390 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18391
18392 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18393 DAG.getVectorIdxConstant(0, dl));
18394 }
18395
18396 // Helper to find all the extracted elements from a vector.
// Walks N's users and accumulates which lanes they demand: constant-index
// PEXTRB/PEXTRW extracts demand one lane; bitcasts recurse with the lane
// mask rescaled to N's element count; any other user (or a variable
// index) conservatively demands all lanes.
// NOTE(review): the function signature line (18397) and one case label
// (18405) are elided in this excerpt.
18398 MVT VT = N->getSimpleValueType(0);
18399 unsigned NumElts = VT.getVectorNumElements();
18400 APInt DemandedElts = APInt::getZero(NumElts);
18401 for (SDNode *User : N->users()) {
18402 switch (User->getOpcode()) {
18403 case X86ISD::PEXTRB:
18404 case X86ISD::PEXTRW:
18406 if (!isa<ConstantSDNode>(User->getOperand(1))) {
  // Variable extract index: give up, all lanes may be read.
18407 DemandedElts.setAllBits();
18408 return DemandedElts;
18409 }
18410 DemandedElts.setBit(User->getConstantOperandVal(1));
18411 break;
18412 case ISD::BITCAST: {
18413 if (!User->getValueType(0).isSimple() ||
18414 !User->getValueType(0).isVector()) {
18415 DemandedElts.setAllBits();
18416 return DemandedElts;
18417 }
  // Recurse through the bitcast and map its demanded lanes back onto
  // this node's element granularity.
18418 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18419 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18420 break;
18421 }
18422 default:
18423 DemandedElts.setAllBits();
18424 return DemandedElts;
18425 }
18426 }
18427 return DemandedElts;
18428 }
18429
18430SDValue
18431X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18432 SelectionDAG &DAG) const {
18433 SDLoc dl(Op);
18434 SDValue Vec = Op.getOperand(0);
18435 MVT VecVT = Vec.getSimpleValueType();
18436 SDValue Idx = Op.getOperand(1);
18437 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18438
18439 if (VecVT.getVectorElementType() == MVT::i1)
18440 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18441
18442 if (!IdxC) {
18443 // Its more profitable to go through memory (1 cycles throughput)
18444 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18445 // IACA tool was used to get performance estimation
18446 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18447 //
18448 // example : extractelement <16 x i8> %a, i32 %i
18449 //
18450 // Block Throughput: 3.00 Cycles
18451 // Throughput Bottleneck: Port5
18452 //
18453 // | Num Of | Ports pressure in cycles | |
18454 // | Uops | 0 - DV | 5 | 6 | 7 | |
18455 // ---------------------------------------------
18456 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18457 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18458 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18459 // Total Num Of Uops: 4
18460 //
18461 //
18462 // Block Throughput: 1.00 Cycles
18463 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18464 //
18465 // | | Ports pressure in cycles | |
18466 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18467 // ---------------------------------------------------------
18468 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18469 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18470 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18471 // Total Num Of Uops: 4
18472
18473 return SDValue();
18474 }
18475
18476 unsigned IdxVal = IdxC->getZExtValue();
18477
18478 // If this is a 256-bit vector result, first extract the 128-bit vector and
18479 // then extract the element from the 128-bit vector.
18480 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18481 // Get the 128-bit vector.
18482 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18483 MVT EltVT = VecVT.getVectorElementType();
18484
18485 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18486 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18487
18488 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18489 // this can be done with a mask.
18490 IdxVal &= ElemsPerChunk - 1;
18491 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18492 DAG.getVectorIdxConstant(IdxVal, dl));
18493 }
18494
18495 assert(VecVT.is128BitVector() && "Unexpected vector length");
18496
18497 MVT VT = Op.getSimpleValueType();
18498
18499 if (VT == MVT::i16) {
18500 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18501 // we're going to zero extend the register or fold the store (SSE41 only).
18502 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18503 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18504 if (Subtarget.hasFP16())
18505 return Op;
18506
18507 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18508 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18509 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18510 }
18511
18512 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18513 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18514 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18515 }
18516
18517 if (Subtarget.hasSSE41())
18518 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18519 return Res;
18520
18521 // Only extract a single element from a v16i8 source - determine the common
18522 // DWORD/WORD that all extractions share, and extract the sub-byte.
18523 // TODO: Add QWORD MOVQ extraction?
18524 if (VT == MVT::i8) {
18525 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18526 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18527
18528 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18529 int DWordIdx = IdxVal / 4;
18530 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18531 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18532 DAG.getBitcast(MVT::v4i32, Vec),
18533 DAG.getVectorIdxConstant(DWordIdx, dl));
18534 int ShiftVal = (IdxVal % 4) * 8;
18535 if (ShiftVal != 0)
18536 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18537 DAG.getConstant(ShiftVal, dl, MVT::i8));
18538 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18539 }
18540
18541 int WordIdx = IdxVal / 2;
18542 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18543 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18544 DAG.getBitcast(MVT::v8i16, Vec),
18545 DAG.getVectorIdxConstant(WordIdx, dl));
18546 int ShiftVal = (IdxVal % 2) * 8;
18547 if (ShiftVal != 0)
18548 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18549 DAG.getConstant(ShiftVal, dl, MVT::i8));
18550 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18551 }
18552 }
18553
18554 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18555 if (IdxVal == 0)
18556 return Op;
18557
18558 // Shuffle the element to the lowest element, then movss or movsh.
18560 Mask[0] = static_cast<int>(IdxVal);
18561 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18562 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18563 DAG.getVectorIdxConstant(0, dl));
18564 }
18565
18566 if (VT.getSizeInBits() == 64) {
18567 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18568 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18569 // to match extract_elt for f64.
18570 if (IdxVal == 0)
18571 return Op;
18572
18573 // UNPCKHPD the element to the lowest double word, then movsd.
18574 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18575 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18576 int Mask[2] = { 1, -1 };
18577 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18578 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18579 DAG.getVectorIdxConstant(0, dl));
18580 }
18581
18582 return SDValue();
18583}
18584
18585/// Insert one bit to mask vector, like v16i1 or v8i1.
18586/// AVX-512 feature.
18588 const X86Subtarget &Subtarget) {
18589 SDLoc dl(Op);
18590 SDValue Vec = Op.getOperand(0);
18591 SDValue Elt = Op.getOperand(1);
18592 SDValue Idx = Op.getOperand(2);
18593 MVT VecVT = Vec.getSimpleValueType();
18594
18595 if (!isa<ConstantSDNode>(Idx)) {
18596 // Non constant index. Extend source and destination,
18597 // insert element and then truncate the result.
18598 unsigned NumElts = VecVT.getVectorNumElements();
18599 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18600 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18601 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18602 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18603 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18604 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18605 }
18606
18607 // Copy into a k-register, extract to v1i1 and insert_subvector.
18608 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18609 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18610}
18611
18612SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18613 SelectionDAG &DAG) const {
18614 MVT VT = Op.getSimpleValueType();
18615 MVT EltVT = VT.getVectorElementType();
18616 unsigned NumElts = VT.getVectorNumElements();
18617 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18618
18619 if (EltVT == MVT::i1)
18620 return InsertBitToMaskVector(Op, DAG, Subtarget);
18621
18622 SDLoc dl(Op);
18623 SDValue N0 = Op.getOperand(0);
18624 SDValue N1 = Op.getOperand(1);
18625 SDValue N2 = Op.getOperand(2);
18626 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18627
18628 if (EltVT == MVT::bf16) {
18630 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18631 DAG.getBitcast(IVT, N0),
18632 DAG.getBitcast(MVT::i16, N1), N2);
18633 return DAG.getBitcast(VT, Res);
18634 }
18635
18636 if (!N2C) {
18637 // Variable insertion indices, usually we're better off spilling to stack,
18638 // but AVX512 can use a variable compare+select by comparing against all
18639 // possible vector indices, and FP insertion has less gpr->simd traffic.
18640 if (!(Subtarget.hasBWI() ||
18641 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18642 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18643 return SDValue();
18644
18645 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18646 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18647 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18648 return SDValue();
18649
18650 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18651 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18652 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18653
18654 SmallVector<SDValue, 16> RawIndices;
18655 for (unsigned I = 0; I != NumElts; ++I)
18656 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18657 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18658
18659 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18660 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18662 }
18663
18664 if (N2C->getAPIntValue().uge(NumElts))
18665 return SDValue();
18666 uint64_t IdxVal = N2C->getZExtValue();
18667
18668 bool IsZeroElt = X86::isZeroNode(N1);
18669 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18670
18671 if (IsZeroElt || IsAllOnesElt) {
18672 // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
18673 // We don't deal with i8 0 since it appears to be handled elsewhere.
18674 if (IsAllOnesElt &&
18675 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18676 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18677 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18678 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18679 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18680 CstVectorElts[IdxVal] = OnesCst;
18681 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18682 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18683 }
18684 // See if we can do this more efficiently with a blend shuffle with a
18685 // rematerializable vector.
18686 if (Subtarget.hasSSE41() &&
18687 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18688 SmallVector<int, 8> BlendMask;
18689 for (unsigned i = 0; i != NumElts; ++i)
18690 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18691 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18692 : getOnesVector(VT, DAG, dl);
18693 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18694 }
18695 }
18696
18697 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18698 // into that, and then insert the subvector back into the result.
18699 if (VT.is256BitVector() || VT.is512BitVector()) {
18700 // With a 256-bit vector, we can insert into the zero element efficiently
18701 // using a blend if we have AVX or AVX2 and the right data type.
18702 if (VT.is256BitVector() && IdxVal == 0) {
18703 // TODO: It is worthwhile to cast integer to floating point and back
18704 // and incur a domain crossing penalty if that's what we'll end up
18705 // doing anyway after extracting to a 128-bit vector.
18706 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18707 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18708 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18709 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18710 DAG.getTargetConstant(1, dl, MVT::i8));
18711 }
18712 }
18713
18714 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18715 assert(isPowerOf2_32(NumEltsIn128) &&
18716 "Vectors will always have power-of-two number of elements.");
18717
18718 // If we are not inserting into the low 128-bit vector chunk,
18719 // then prefer the broadcast+blend sequence.
18720 // FIXME: relax the profitability check iff all N1 uses are insertions.
18721 if (IdxVal >= NumEltsIn128 &&
18722 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18723 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18724 X86::mayFoldLoad(N1, Subtarget)))) {
18725 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18726 SmallVector<int, 8> BlendMask;
18727 for (unsigned i = 0; i != NumElts; ++i)
18728 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18729 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18730 }
18731
18732 // Get the desired 128-bit vector chunk.
18733 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18734
18735 // Insert the element into the desired chunk.
18736 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18737 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18738
18739 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18740 DAG.getVectorIdxConstant(IdxIn128, dl));
18741
18742 // Insert the changed part back into the bigger vector
18743 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18744 }
18745 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18746
18747 // This will be just movw/movd/movq/movsh/movss/movsd.
18748 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18749 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18750 EltVT == MVT::f16 || EltVT == MVT::i64) {
18751 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18752 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18753 }
18754
18755 // We can't directly insert an i8 or i16 into a vector, so zero extend
18756 // it to i32 first.
18757 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18758 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18759 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18760 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18761 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18762 return DAG.getBitcast(VT, N1);
18763 }
18764 }
18765
18766 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
18767 // argument. SSE41 required for pinsrb.
18768 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18769 unsigned Opc;
18770 if (VT == MVT::v8i16) {
18771 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18772 Opc = X86ISD::PINSRW;
18773 } else {
18774 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18775 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18776 Opc = X86ISD::PINSRB;
18777 }
18778
18779 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18780 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18781 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18782 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18783 }
18784
18785 if (Subtarget.hasSSE41()) {
18786 if (EltVT == MVT::f32) {
18787 // Bits [7:6] of the constant are the source select. This will always be
18788 // zero here. The DAG Combiner may combine an extract_elt index into
18789 // these bits. For example (insert (extract, 3), 2) could be matched by
18790 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18791 // Bits [5:4] of the constant are the destination select. This is the
18792 // value of the incoming immediate.
18793 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18794 // combine either bitwise AND or insert of float 0.0 to set these bits.
18795
18796 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18797 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18798 // If this is an insertion of 32-bits into the low 32-bits of
18799 // a vector, we prefer to generate a blend with immediate rather
18800 // than an insertps. Blends are simpler operations in hardware and so
18801 // will always have equal or better performance than insertps.
18802 // But if optimizing for size and there's a load folding opportunity,
18803 // generate insertps because blendps does not have a 32-bit memory
18804 // operand form.
18805 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18806 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18807 DAG.getTargetConstant(1, dl, MVT::i8));
18808 }
18809 // Create this as a scalar to vector..
18810 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18811 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18812 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18813 }
18814
18815 // PINSR* works with constant index.
18816 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18817 return Op;
18818 }
18819
18820 return SDValue();
18821}
18822
18824 SelectionDAG &DAG) {
18825 SDLoc dl(Op);
18826 MVT OpVT = Op.getSimpleValueType();
18827
18828 // It's always cheaper to replace a xor+movd with xorps and simplifies further
18829 // combines.
18830 if (X86::isZeroNode(Op.getOperand(0)))
18831 return getZeroVector(OpVT, Subtarget, DAG, dl);
18832
18833 // If this is a 256-bit vector result, first insert into a 128-bit
18834 // vector and then insert into the 256-bit vector.
18835 if (!OpVT.is128BitVector()) {
18836 // Insert into a 128-bit vector.
18837 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18839 OpVT.getVectorNumElements() / SizeFactor);
18840
18841 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18842
18843 // Insert the 128-bit vector.
18844 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18845 }
18846 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18847 "Expected an SSE type!");
18848
18849 // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in
18850 // tblgen.
18851 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18852 return Op;
18853
18854 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18855 return DAG.getBitcast(
18856 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18857}
18858
18859// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18860// simple superregister reference or explicit instructions to insert
18861// the upper bits of a vector.
18863 SelectionDAG &DAG) {
18864 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18865
18866 return insert1BitVector(Op, DAG, Subtarget);
18867}
18868
18870 SelectionDAG &DAG) {
18871 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18872 "Only vXi1 extract_subvectors need custom lowering");
18873
18874 SDLoc dl(Op);
18875 SDValue Vec = Op.getOperand(0);
18876 uint64_t IdxVal = Op.getConstantOperandVal(1);
18877
18878 if (IdxVal == 0) // the operation is legal
18879 return Op;
18880
18881 // Extend to natively supported kshift.
18882 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18883
18884 // Shift to the LSB.
18885 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18886 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18887
18888 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18889 DAG.getVectorIdxConstant(0, dl));
18890}
18891
18892// Returns the appropriate wrapper opcode for a global reference.
18893unsigned X86TargetLowering::getGlobalWrapperKind(
18894 const GlobalValue *GV, const unsigned char OpFlags) const {
18895 // References to absolute symbols are never PC-relative.
18896 if (GV && GV->isAbsoluteSymbolRef())
18897 return X86ISD::Wrapper;
18898
18899 // The following OpFlags under RIP-rel PIC use RIP.
18900 if (Subtarget.isPICStyleRIPRel() &&
18901 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18902 OpFlags == X86II::MO_DLLIMPORT))
18903 return X86ISD::WrapperRIP;
18904
18905 // GOTPCREL references must always use RIP.
18906 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18907 return X86ISD::WrapperRIP;
18908
18909 return X86ISD::Wrapper;
18910}
18911
18912// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18913// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18914// one of the above mentioned nodes. It has to be wrapped because otherwise
18915// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18916// be used to form addressing mode. These wrapped nodes will be selected
18917// into MOV32ri.
18918SDValue
18919X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18920 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18921
18922 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18923 // global base reg.
18924 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18925
18926 auto PtrVT = getPointerTy(DAG.getDataLayout());
18928 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18929 SDLoc DL(CP);
18930 Result =
18931 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18932 // With PIC, the address is actually $g + Offset.
18933 if (OpFlag) {
18934 Result =
18935 DAG.getNode(ISD::ADD, DL, PtrVT,
18936 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18937 }
18938
18939 return Result;
18940}
18941
18942SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18943 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18944
18945 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18946 // global base reg.
18947 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18948
18949 auto PtrVT = getPointerTy(DAG.getDataLayout());
18950 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18951 SDLoc DL(JT);
18952 Result =
18953 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18954
18955 // With PIC, the address is actually $g + Offset.
18956 if (OpFlag)
18957 Result =
18958 DAG.getNode(ISD::ADD, DL, PtrVT,
18959 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18960
18961 return Result;
18962}
18963
// External symbols share the same lowering as global addresses; this is a
// plain (non-call) reference, so wrapper/PIC/stub handling all applies.
SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
                                               SelectionDAG &DAG) const {
  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
18968
18969SDValue
18970X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18971 // Create the TargetBlockAddressAddress node.
18972 unsigned char OpFlags =
18974 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18975 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18976 SDLoc dl(Op);
18977 auto PtrVT = getPointerTy(DAG.getDataLayout());
18978 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18979 Result =
18980 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18981
18982 // With PIC, the address is actually $g + Offset.
18983 if (isGlobalRelativeToPICBase(OpFlags)) {
18984 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18985 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18986 }
18987
18988 return Result;
18989}
18990
18991/// Creates target global address or external symbol nodes for calls or
18992/// other uses.
18993SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18994 bool ForCall) const {
18995 // Unpack the global address or external symbol.
18996 SDLoc dl(Op);
18997 const GlobalValue *GV = nullptr;
18998 int64_t Offset = 0;
18999 const char *ExternalSym = nullptr;
19000 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19001 GV = G->getGlobal();
19002 Offset = G->getOffset();
19003 } else {
19004 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19005 ExternalSym = ES->getSymbol();
19006 }
19007
19008 // Calculate some flags for address lowering.
19010 unsigned char OpFlags;
19011 if (ForCall)
19012 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19013 else
19014 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19015 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19016 bool NeedsLoad = isGlobalStubReference(OpFlags);
19017
19019 auto PtrVT = getPointerTy(DAG.getDataLayout());
19021
19022 if (GV) {
19023 // Create a target global address if this is a global. If possible, fold the
19024 // offset into the global address reference. Otherwise, ADD it on later.
19025 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19026 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19027 // relocation will compute to a negative value, which is invalid.
19028 int64_t GlobalOffset = 0;
19029 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19031 std::swap(GlobalOffset, Offset);
19032 }
19033 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19034 } else {
19035 // If this is not a global address, this must be an external symbol.
19036 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19037 }
19038
19039 // If this is a direct call, avoid the wrapper if we don't need to do any
19040 // loads or adds. This allows SDAG ISel to match direct calls.
19041 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19042 return Result;
19043
19044 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19045
19046 // With PIC, the address is actually $g + Offset.
19047 if (HasPICReg) {
19048 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19049 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19050 }
19051
19052 // For globals that require a load from a stub to get the address, emit the
19053 // load.
19054 if (NeedsLoad)
19055 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19057
19058 // If there was a non-zero offset that we didn't fold, create an explicit
19059 // addition for it.
19060 if (Offset != 0)
19061 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19062 DAG.getSignedConstant(Offset, dl, PtrVT));
19063
19064 return Result;
19065}
19066
// Plain (non-call) global-address reference; shares the full lowering path
// with external symbols.
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
19071
19073 const EVT PtrVT, unsigned ReturnReg,
19074 unsigned char OperandFlags,
19075 bool LoadGlobalBaseReg = false,
19076 bool LocalDynamic = false) {
19078 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19079 SDLoc dl(GA);
19080 SDValue TGA;
19081 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19082 SDValue Chain = DAG.getEntryNode();
19083 SDValue Ret;
19084 if (LocalDynamic && UseTLSDESC) {
19085 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19086 // Reuse existing GetTLSADDR node if we can find it.
19087 if (TGA->hasOneUse()) {
19088 // TLSDESC uses TGA.
19089 SDNode *TLSDescOp = *TGA->user_begin();
19090 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19091 "Unexpected TLSDESC DAG");
19092 // CALLSEQ_END uses TGA via a chain and glue.
19093 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19094 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19095 "Unexpected TLSDESC DAG");
19096 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19097 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19098 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19099 "Unexpected TLSDESC DAG");
19100 Ret = SDValue(CopyFromRegOp, 0);
19101 }
19102 } else {
19103 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19104 GA->getOffset(), OperandFlags);
19105 }
19106
19107 if (!Ret) {
19108 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19109 : LocalDynamic ? X86ISD::TLSBASEADDR
19111
19112 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19113 if (LoadGlobalBaseReg) {
19114 SDValue InGlue;
19115 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19116 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19117 InGlue);
19118 InGlue = Chain.getValue(1);
19119 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19120 } else {
19121 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19122 }
19123 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19124
19125 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19126 MFI.setHasCalls(true);
19127
19128 SDValue Glue = Chain.getValue(1);
19129 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19130 }
19131
19132 if (!UseTLSDESC)
19133 return Ret;
19134
19135 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19136 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19137
19139 SDValue Offset =
19140 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19142 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19143}
19144
19145// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19146static SDValue
19148 const EVT PtrVT) {
19149 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19150 /*LoadGlobalBaseReg=*/true);
19151}
19152
19153// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19154static SDValue
19156 const EVT PtrVT) {
19157 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19158}
19159
19160// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19161static SDValue
19163 const EVT PtrVT) {
19164 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19165}
19166
19168 SelectionDAG &DAG, const EVT PtrVT,
19169 bool Is64Bit, bool Is64BitLP64) {
19170 SDLoc dl(GA);
19171
19172 // Get the start address of the TLS block for this module.
19176
19177 SDValue Base;
19178 if (Is64Bit) {
19179 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19180 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19181 /*LoadGlobalBaseReg=*/false,
19182 /*LocalDynamic=*/true);
19183 } else {
19184 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19185 /*LoadGlobalBaseReg=*/true,
19186 /*LocalDynamic=*/true);
19187 }
19188
19189 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19190 // of Base.
19191
19192 // Build x@dtpoff.
19193 unsigned char OperandFlags = X86II::MO_DTPOFF;
19194 unsigned WrapperKind = X86ISD::Wrapper;
19195 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19196 GA->getValueType(0),
19197 GA->getOffset(), OperandFlags);
19198 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19199
19200 // Add x@dtpoff with the base.
19201 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19202}
19203
19204// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19206 const EVT PtrVT, TLSModel::Model model,
19207 bool is64Bit, bool isPIC) {
19208 SDLoc dl(GA);
19209
19210 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19213
19214 SDValue ThreadPointer =
19215 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19217
19218 unsigned char OperandFlags = 0;
19219 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19220 // initialexec.
19221 unsigned WrapperKind = X86ISD::Wrapper;
19222 if (model == TLSModel::LocalExec) {
19223 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19224 } else if (model == TLSModel::InitialExec) {
19225 if (is64Bit) {
19226 OperandFlags = X86II::MO_GOTTPOFF;
19227 WrapperKind = X86ISD::WrapperRIP;
19228 } else {
19229 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19230 }
19231 } else {
19232 llvm_unreachable("Unexpected model");
19233 }
19234
19235 // emit "addl x@ntpoff,%eax" (local exec)
19236 // or "addl x@indntpoff,%eax" (initial exec)
19237 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19238 SDValue TGA =
19239 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19240 GA->getOffset(), OperandFlags);
19241 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19242
19243 if (model == TLSModel::InitialExec) {
19244 if (isPIC && !is64Bit) {
19245 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19246 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19247 Offset);
19248 }
19249
19250 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19252 }
19253
19254 // The address of the thread local variable is the add of the thread
19255 // pointer with the offset of the variable.
19256 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19257}
19258
// Lower an LLVM global TLS address to the target-specific access sequence.
// Dispatches on OS: ELF (per-TLS-model helpers), Darwin (single TLVP model,
// lowered as a TLSCALL), and Windows (implicit TLS via TEB + _tls_index).
// NOTE(review): this rendered chunk dropped several original lines (the ELF
// switch case labels and a few declarations such as Result/MFI/ThreadPointer);
// the surrounding code is annotated where a line is missing.
19259SDValue
19260X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19261
19262  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19263
  // Emulated TLS (e.g. for some sanitizer/embedded configs) bypasses all of
  // the native lowering below.
19264  if (DAG.getTarget().useEmulatedTLS())
19265    return LowerToTLSEmulatedModel(GA, DAG);
19266
19267  const GlobalValue *GV = GA->getGlobal();
19268  auto PtrVT = getPointerTy(DAG.getDataLayout());
19269  bool PositionIndependent = isPositionIndependent();
19270
19271  if (Subtarget.isTargetELF()) {
19272    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    // NOTE(review): the case labels (GeneralDynamic/LocalDynamic/
    // InitialExec/LocalExec) were dropped by extraction; each return below
    // belongs to its own TLSModel case.
19273    switch (model) {
19275      if (Subtarget.is64Bit()) {
19276        if (Subtarget.isTarget64BitLP64())
19277          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19278        return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19279      }
19280      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19282      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19283                                         Subtarget.isTarget64BitLP64());
19286      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19287                                 PositionIndependent);
19288    }
19289    llvm_unreachable("Unknown TLS model.");
19290  }
19291
19292  if (Subtarget.isTargetDarwin()) {
19293    // Darwin only has one model of TLS. Lower to that.
19294    unsigned char OpFlag = 0;
19295    unsigned WrapperKind = 0;
19296
19297    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19298    // global base reg.
19299    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19300    if (PIC32) {
19301      OpFlag = X86II::MO_TLVP_PIC_BASE;
19302      WrapperKind = X86ISD::Wrapper;
19303    } else {
19304      OpFlag = X86II::MO_TLVP;
19305      WrapperKind = X86ISD::WrapperRIP;
19306    }
19307    SDLoc DL(Op);
    // NOTE(review): the line declaring `SDValue Result =
    // DAG.getTargetGlobalAddress(GV, DL, ...` was dropped by extraction; the
    // two lines below are its continuation arguments.
19309                                               GA->getValueType(0),
19310                                               GA->getOffset(), OpFlag);
19311    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19312
19313    // With PIC32, the address is actually $g + Offset.
19314    if (PIC32)
19315      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19316                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19317                           Offset);
19318
19319    // Lowering the machine isd will make sure everything is in the right
19320    // location.
19321    SDValue Chain = DAG.getEntryNode();
19322    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    // Bracket the TLVP call with callseq markers so the call sequence is
    // well-formed for the register allocator / frame lowering.
19323    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19324    SDValue Args[] = { Chain, Offset };
19325    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19326    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19327
19328    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    // NOTE(review): the declaration of `MFI` (from
    // DAG.getMachineFunction().getFrameInfo()) was dropped by extraction.
19330    MFI.setAdjustsStack(true);
19331
19332    // And our return value (tls address) is in the standard call return value
19333    // location.
19334    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19335    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19336  }
19337
19338  if (Subtarget.isOSWindows()) {
19339    // Just use the implicit TLS architecture
19340    // Need to generate something similar to:
19341    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19342    //                                ; from TEB
19343    //   mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19344    //   mov rcx, qword [rdx+rcx*8]
19345    //   mov eax, .tls$:tlsvar
19346    //   [rax+rcx] contains the address
19347    // Windows 64bit: gs:0x58
19348    // Windows 32bit: fs:__tls_array
19349
19350    SDLoc dl(GA);
19351    SDValue Chain = DAG.getEntryNode();
19352
19353    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19354    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19355    // use its literal value of 0x2C.
    // NOTE(review): the `Value *Ptr = ...` declaration and the FS-branch
    // continuation line were dropped by extraction here.
19357        Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19359
19360    SDValue TlsArray = Subtarget.is64Bit()
19361                           ? DAG.getIntPtrConstant(0x58, dl)
19362                           : (Subtarget.isTargetWindowsGNU()
19363                                  ? DAG.getIntPtrConstant(0x2C, dl)
19364                                  : DAG.getExternalSymbol("_tls_array", PtrVT));
19365
    // NOTE(review): `SDValue ThreadPointer =` was dropped by extraction; the
    // line below is that initializer.
19367        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19368
19369    SDValue res;
    // NOTE(review): the local-exec `if (...)` condition line was dropped by
    // extraction; local-exec can use the thread pointer directly.
19371      res = ThreadPointer;
19372    } else {
19373      // Load the _tls_index variable
19374      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19375      if (Subtarget.is64Bit())
19376        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19377                             MachinePointerInfo(), MVT::i32);
19378      else
19379        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19380
      // Scale the index by the pointer size (rdx + rcx*8 in the sketch
      // above) before adding to the TLS-array base.
19381      const DataLayout &DL = DAG.getDataLayout();
19382      SDValue Scale =
19383          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19384      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19385
19386      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19387    }
19388
19389    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19390
19391    // Get the offset of start of .tls section
19392    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19393                                             GA->getValueType(0),
    // NOTE(review): the final argument line (offset + X86II::MO_SECREL flag)
    // was dropped by extraction.
19395    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19396
19397    // The address of the thread local variable is the add of the thread
19398    // pointer with the offset of the variable.
19399    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19400  }
19401
19402  llvm_unreachable("TLS not implemented for this target.");
19403}
19404
// Returns true when a TLS access for this global can fold the %fs segment
// register into an addressing mode (64-bit ELF, exec-style TLS models only).
// NOTE(review): the function signature line and the switch case labels were
// dropped by extraction in this rendered chunk.
19406  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19407    const TargetMachine &TM = getTargetMachine();
19408    TLSModel::Model Model = TM.getTLSModel(&GV);
19409    switch (Model) {
      // (LocalExec / InitialExec cases)
19412      // We can include the %fs segment register in addressing modes.
19413      return true;
      // (GeneralDynamic / LocalDynamic cases)
19416      // These models do not result in %fs relative addresses unless
19417      // TLS descriptors are used.
19418      //
19419      // Even in the case of TLS descriptors we currently have no way to model
19420      // the difference between %fs access and the computations needed for the
19421      // offset and returning `true` for TLS-desc currently duplicates both
19422      // which is detrimental :-/
19423      return false;
19424    }
19425  }
  // All non-64-bit-ELF configurations: no %fs-relative addressing.
19426  return false;
19427}
19428
19429/// Lower SRA_PARTS and friends, which return two i32 values
19430/// and take a 2 x i32 value to shift plus a shift amount.
19431/// TODO: Can this be moved to general expansion code?
// NOTE(review): the `static SDValue LowerShiftParts(...)` signature line was
// dropped by extraction. Delegates entirely to the target-independent
// expandShiftParts helper and repackages the halves as a merged pair.
19433  SDValue Lo, Hi;
19434  DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19435  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19436}
19437
19438// Try to use a packed vector operation to handle i64 on 32-bit targets when
19439// AVX512DQ is enabled.
// NOTE(review): the first signature line (`static SDValue
// LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,`) was dropped by
// extraction. Returns SDValue() when the pattern does not apply.
19441                                        SelectionDAG &DAG,
19442                                        const X86Subtarget &Subtarget) {
19443  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19444          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19445          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19446          Op.getOpcode() == ISD::UINT_TO_FP) &&
19447         "Unexpected opcode!");
19448  bool IsStrict = Op->isStrictFPOpcode();
  // Strict nodes carry the chain as operand 0; the value input follows it.
19449  unsigned OpNo = IsStrict ? 1 : 0;
19450  SDValue Src = Op.getOperand(OpNo);
19451  MVT SrcVT = Src.getSimpleValueType();
19452  MVT VT = Op.getSimpleValueType();
19453
  // Only profitable for scalar i64 -> f32/f64 on 32-bit targets with DQI.
19454  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19455      (VT != MVT::f32 && VT != MVT::f64))
19456    return SDValue();
19457
19458  // Pack the i64 into a vector, do the operation and extract.
19459
19460  // Using 256-bit to ensure result is 128-bits for f32 case.
  // Without VLX the conversion must run at 512 bits (8 x i64).
19461  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19462  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19463  MVT VecVT = MVT::getVectorVT(VT, NumElts);
19464
19465  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19466  if (IsStrict) {
    // Strict path: thread the chain through the vector convert and return
    // {value, chain}.
19467    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19468                                 {Op.getOperand(0), InVec});
19469    SDValue Chain = CvtVec.getValue(1);
19470    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19471                                DAG.getVectorIdxConstant(0, dl));
19472    return DAG.getMergeValues({Value, Chain}, dl);
19473  }
19474
19475  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19476
19477  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19478                     DAG.getVectorIdxConstant(0, dl));
19479}
19480
19481// Try to use a packed vector operation to handle i64 on 32-bit targets.
// Handles scalar i64 -> f16 by packing into v2i64, converting to v2f16, and
// extracting element 0. Requires FP16 support.
// NOTE(review): the first signature line (`static SDValue
// LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,`) was
// dropped by extraction.
19483                                 const X86Subtarget &Subtarget) {
19484  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19485          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19486          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19487          Op.getOpcode() == ISD::UINT_TO_FP) &&
19488         "Unexpected opcode!");
19489  bool IsStrict = Op->isStrictFPOpcode();
19490  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19491  MVT SrcVT = Src.getSimpleValueType();
19492  MVT VT = Op.getSimpleValueType();
19493
  // Only for scalar i64 -> f16 on 32-bit targets.
19494  if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19495    return SDValue();
19496
19497  // Pack the i64 into a vector, do the operation and extract.
19498
19499  assert(Subtarget.hasFP16() && "Expected FP16");
19500
19501  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19502  if (IsStrict) {
    // Strict path: keep the chain threaded through the conversion.
19503    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19504                                 {Op.getOperand(0), InVec});
19505    SDValue Chain = CvtVec.getValue(1);
19506    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19507                                DAG.getVectorIdxConstant(0, dl));
19508    return DAG.getMergeValues({Value, Chain}, dl);
19509  }
19510
19511  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19512
19513  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19514                     DAG.getVectorIdxConstant(0, dl));
19515}
19516
19517static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19518 const X86Subtarget &Subtarget) {
19519 switch (Opcode) {
19520 case ISD::SINT_TO_FP:
19521 // TODO: Handle wider types with AVX/AVX512.
19522 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19523 return false;
19524 // CVTDQ2PS or (V)CVTDQ2PD
19525 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19526
19527 case ISD::UINT_TO_FP:
19528 // TODO: Handle wider types and i64 elements.
19529 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19530 return false;
19531 // VCVTUDQ2PS or VCVTUDQ2PD
19532 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19533
19534 default:
19535 return false;
19536 }
19537}
19538
19539/// Given a scalar cast operation that is extracted from a vector, try to
19540/// vectorize the cast op followed by extraction. This will avoid an expensive
19541/// round-trip between XMM and GPR.
// NOTE(review): the first signature line (`static SDValue
// vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,`) was dropped by
// extraction. Returns SDValue() when the transform does not apply.
19543                                      SelectionDAG &DAG,
19544                                      const X86Subtarget &Subtarget) {
19545  // TODO: This could be enhanced to handle smaller integer types by peeking
19546  // through an extend.
19547  SDValue Extract = Cast.getOperand(0);
19548  MVT DestVT = Cast.getSimpleValueType();
  // Only handle casts of an extract with a constant lane index.
19549  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19550      !isa<ConstantSDNode>(Extract.getOperand(1)))
19551    return SDValue();
19552
19553  // See if we have a 128-bit vector cast op for this type of cast.
19554  SDValue VecOp = Extract.getOperand(0);
19555  MVT FromVT = VecOp.getSimpleValueType();
19556  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19557  MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19558  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19559  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19560    return SDValue();
19561
19562  // If we are extracting from a non-zero element, first shuffle the source
19563  // vector to allow extracting from element zero.
19564  if (!isNullConstant(Extract.getOperand(1))) {
    // Move the wanted lane to position 0; remaining lanes are don't-care.
19565    SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19566    Mask[0] = Extract.getConstantOperandVal(1);
19567    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19568  }
19569  // If the source vector is wider than 128-bits, extract the low part. Do not
19570  // create an unnecessarily wide vector cast op.
19571  if (FromVT != Vec128VT)
19572    VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19573
19574  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19575  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19576  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19577  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19578                     DAG.getVectorIdxConstant(0, DL));
19579}
19580
19581/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19582/// try to vectorize the cast ops. This will avoid an expensive round-trip
19583/// between XMM and GPR.
19584static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19585 SelectionDAG &DAG,
19586 const X86Subtarget &Subtarget) {
19587 // TODO: Allow FP_TO_UINT.
19588 SDValue CastToInt = CastToFP.getOperand(0);
19589 MVT VT = CastToFP.getSimpleValueType();
19590 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19591 return SDValue();
19592
19593 MVT IntVT = CastToInt.getSimpleValueType();
19594 SDValue X = CastToInt.getOperand(0);
19595 MVT SrcVT = X.getSimpleValueType();
19596 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19597 return SDValue();
19598
19599 // See if we have 128-bit vector cast instructions for this type of cast.
19600 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19601 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19602 IntVT != MVT::i32)
19603 return SDValue();
19604
19605 unsigned SrcSize = SrcVT.getSizeInBits();
19606 unsigned IntSize = IntVT.getSizeInBits();
19607 unsigned VTSize = VT.getSizeInBits();
19608 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19609 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19610 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19611
19612 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19613 unsigned ToIntOpcode =
19614 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19615 unsigned ToFPOpcode =
19616 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19617
19618 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19619 //
19620 // We are not defining the high elements (for example, zero them) because
19621 // that could nullify any performance advantage that we hoped to gain from
19622 // this vector op hack. We do not expect any adverse effects (like denorm
19623 // penalties) with cast ops.
19624 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19625 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19626 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19627 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19628 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19629}
19630
// Lower v2i64/v4i64 -> FP conversions. With AVX512DQ (no VLX) the input is
// widened to 512 bits; otherwise an unsigned v4i64 -> v4f32 conversion is
// emulated via per-element signed converts plus a sign fixup.
// NOTE(review): the first signature line (`static SDValue
// lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,`) was dropped by
// extraction.
19632                                     SelectionDAG &DAG,
19633                                     const X86Subtarget &Subtarget) {
19634  bool IsStrict = Op->isStrictFPOpcode();
19635  MVT VT = Op->getSimpleValueType(0);
19636  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19637
19638  if (Subtarget.hasDQI()) {
19639    assert(!Subtarget.hasVLX() && "Unexpected features");
19640
19641    assert((Src.getSimpleValueType() == MVT::v2i64 ||
19642            Src.getSimpleValueType() == MVT::v4i64) &&
19643           "Unsupported custom type");
19644
19645    // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19646    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19647           "Unexpected VT!");
19648    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19649
19650    // Need to concat with zero vector for strict fp to avoid spurious
19651    // exceptions.
19652    SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19653                           : DAG.getUNDEF(MVT::v8i64);
19654    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19655                      DAG.getVectorIdxConstant(0, DL));
19656    SDValue Res, Chain;
19657    if (IsStrict) {
19658      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19659                        {Op->getOperand(0), Src});
19660      Chain = Res.getValue(1);
19661    } else {
19662      Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19663    }
19664
    // Narrow the widened result back to the requested type.
19665    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19666                      DAG.getVectorIdxConstant(0, DL));
19667
19668    if (IsStrict)
19669      return DAG.getMergeValues({Res, Chain}, DL);
19670    return Res;
19671  }
19672
19673  bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19674                  Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
  // Only the unsigned v4i64 -> v4f32 case is emulated below.
19675  if (VT != MVT::v4f32 || IsSigned)
19676    return SDValue();
19677
  // Classic uint64->float trick: for negative (i.e. high-bit-set) inputs,
  // halve the value while preserving the sticky low bit ((x >> 1) | (x & 1)),
  // convert signed, then double the result with an FADD.
19678  SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19679  SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19680  SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19681                             DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19682                             DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19683  SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19684  SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19685  SmallVector<SDValue, 4> SignCvts(4);
19686  SmallVector<SDValue, 4> Chains(4);
  // No packed i64->f32 instruction here, so convert each element as a scalar.
19687  for (int i = 0; i != 4; ++i) {
19688    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19689                              DAG.getVectorIdxConstant(i, DL));
19690    if (IsStrict) {
19691      SignCvts[i] =
19692          DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19693                      {Op.getOperand(0), Elt});
19694      Chains[i] = SignCvts[i].getValue(1);
19695    } else {
19696      SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19697    }
19698  }
19699  SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19700
19701  SDValue Slow, Chain;
19702  if (IsStrict) {
19703    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19704    Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19705                       {Chain, SignCvt, SignCvt});
19706    Chain = Slow.getValue(1);
19707  } else {
19708    Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19709  }
19710
  // Select per lane: doubled "slow" result for inputs with the high bit set,
  // direct conversion otherwise.
19711  IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19712  SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19713
19714  if (IsStrict)
19715    return DAG.getMergeValues({Cvt, Chain}, DL);
19716
19717  return Cvt;
19718}
19719
// Promote an int-to-FP conversion whose result type is a "soft" half type:
// convert to f32 (or vector-of-f32) first, then FP_ROUND to the final type.
// NOTE(review): the first signature line (`static SDValue
// promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,`) was dropped by extraction.
19721                                  SelectionDAG &DAG) {
19722  bool IsStrict = Op->isStrictFPOpcode();
19723  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19724  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19725  MVT VT = Op.getSimpleValueType();
  // Intermediate type: same shape as the result but with f32 elements.
19726  MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19727
  // Truncating round (isTarget flag selects the target FP_ROUND form).
19728  SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
19729  if (IsStrict)
19730    return DAG.getNode(
19731        ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19732        {Chain,
19733         DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19734         Rnd});
19735  return DAG.getNode(ISD::FP_ROUND, dl, VT,
19736                     DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19737}
19738
19739static bool isLegalConversion(MVT VT, bool IsSigned,
19740 const X86Subtarget &Subtarget) {
19741 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19742 return true;
19743 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19744 return true;
19745 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19746 return true;
19747 if (Subtarget.useAVX512Regs()) {
19748 if (VT == MVT::v16i32)
19749 return true;
19750 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19751 return true;
19752 }
19753 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19754 (VT == MVT::v2i64 || VT == MVT::v4i64))
19755 return true;
19756 return false;
19757}
19758
// Custom-lower (STRICT_)SINT_TO_FP. Tries, in order: soft-f16 promotion,
// already-legal conversions, Win64 i128 libcall, vectorized extract/round-trip
// rewrites, vector-specific lowering, AVX512DQ/FP16 packed tricks, i16
// promotion, and finally an x87 FILD through a stack slot.
// NOTE(review): two interior lines were dropped by extraction (the
// `MachineFunction &MF = ...` declaration and the
// `MachinePointerInfo::getFixedStack(...)` initializer near the end).
19759SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19760                                           SelectionDAG &DAG) const {
19761  bool IsStrict = Op->isStrictFPOpcode();
19762  unsigned OpNo = IsStrict ? 1 : 0;
19763  SDValue Src = Op.getOperand(OpNo);
19764  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19765  MVT SrcVT = Src.getSimpleValueType();
19766  MVT VT = Op.getSimpleValueType();
19767  SDLoc dl(Op);
19768
19769  if (isSoftF16(VT, Subtarget))
19770    return promoteXINT_TO_FP(Op, dl, DAG);
19771  else if (isLegalConversion(SrcVT, true, Subtarget))
19772    return Op;
19773
19774  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19775    return LowerWin64_INT128_TO_FP(Op, DAG);
19776
19777  if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19778    return Extract;
19779
19780  if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19781    return R;
19782
19783  if (SrcVT.isVector()) {
19784    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19785      // Note: Since v2f64 is a legal type. We don't need to zero extend the
19786      // source for strict FP.
19787      if (IsStrict)
19788        return DAG.getNode(
19789            X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19790            {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19791                                DAG.getUNDEF(SrcVT))});
19792      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19793                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19794                                     DAG.getUNDEF(SrcVT)));
19795    }
19796    if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19797      return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19798
19799    return SDValue();
19800  }
19801
19802  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19803         "Unknown SINT_TO_FP to lower!");
19804
19805  bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19806
19807  // These are really Legal; return the operand so the caller accepts it as
19808  // Legal.
19809  if (SrcVT == MVT::i32 && UseSSEReg)
19810    return Op;
19811  if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19812    return Op;
19813
19814  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19815    return V;
19816  if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19817    return V;
19818
19819  // SSE doesn't have an i16 conversion so we need to promote.
19820  if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19821    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19822    if (IsStrict)
19823      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19824                         {Chain, Ext});
19825
19826    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19827  }
19828
  // f128 and non-x87 targets have no remaining lowering here; let the
  // generic legalizer expand.
19829  if (VT == MVT::f128 || !Subtarget.hasX87())
19830    return SDValue();
19831
19832  SDValue ValueToStore = Src;
19833  if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19834    // Bitcasting to f64 here allows us to do a single 64-bit store from
19835    // an SSE register, avoiding the store forwarding penalty that would come
19836    // with two 32-bit stores.
19837    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19838
  // Spill the integer to a stack slot and FILD it (x87 path).
19839  unsigned Size = SrcVT.getStoreSize();
19840  Align Alignment(Size);
  // NOTE(review): `MachineFunction &MF = DAG.getMachineFunction();` was
  // dropped by extraction just above this line.
19842  auto PtrVT = getPointerTy(MF.getDataLayout());
19843  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19844  MachinePointerInfo MPI =
  // NOTE(review): the `MachinePointerInfo::getFixedStack(MF, SSFI)`
  // initializer line was dropped by extraction.
19846  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19847  Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19848  std::pair<SDValue, SDValue> Tmp =
19849      BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19850
19851  if (IsStrict)
19852    return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19853
19854  return Tmp.first;
19855}
19856
// Build an x87 FILD (integer load) of SrcVT from memory at Pointer, producing
// a value of DstVT plus the output chain. If the destination is kept in SSE
// registers, the f80 FILD result is stored back to a stack slot and reloaded
// as DstVT to move it from the x87 stack into an XMM register.
// NOTE(review): two interior lines were dropped by extraction (the
// `MachineFunction &MF = ...` declaration and the `StoreMMO` declaration).
19857std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19858    EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19859    MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19860  // Build the FILD
19861  SDVTList Tys;
19862  bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19863  if (useSSE)
    // Produce the full-precision f80 result; it is narrowed via the
    // store/reload below.
19864    Tys = DAG.getVTList(MVT::f80, MVT::Other);
19865  else
19866    Tys = DAG.getVTList(DstVT, MVT::Other);
19867
19868  SDValue FILDOps[] = {Chain, Pointer};
19869  SDValue Result =
19870      DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19871                              Alignment, MachineMemOperand::MOLoad);
19872  Chain = Result.getValue(1);
19873
19874  if (useSSE) {
    // NOTE(review): `MachineFunction &MF = DAG.getMachineFunction();` was
    // dropped by extraction just above this line.
19876    unsigned SSFISize = DstVT.getStoreSize();
19877    int SSFI =
19878        MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19879    auto PtrVT = getPointerTy(MF.getDataLayout());
19880    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19881    Tys = DAG.getVTList(MVT::Other);
19882    SDValue FSTOps[] = {Chain, Result, StackSlot};
    // NOTE(review): the `MachineMemOperand *StoreMMO =
    // MF.getMachineMemOperand(...)` lines were dropped by extraction; the
    // line below is their continuation.
19885        MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19886
    // FST truncates f80 to DstVT in memory; the reload brings it into SSE.
19887    Chain =
19888        DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19889    Result = DAG.getLoad(
19890        DstVT, DL, Chain, StackSlot,
    // NOTE(review): the MachinePointerInfo argument line was dropped here.
19892    Chain = Result.getValue(1);
19893  }
19894
19895  return { Result, Chain };
19896}
19897
19898/// Horizontal vector math instructions may be slower than normal math with
19899/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19900/// implementation, and likely shuffle complexity of the alternate sequence.
19901static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19902 const X86Subtarget &Subtarget) {
19903 bool IsOptimizingSize = DAG.shouldOptForSize();
19904 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19905 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19906}
19907
19908/// 64-bit unsigned integer to double expansion.
// Splits the u64 into two halves biased by 2^52 / (2^52 * 2^32), subtracts the
// biases as doubles, and horizontally adds the two partial results.
// NOTE(review): the first signature line (`static SDValue
// LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,`) and the
// `SmallVector<Constant *, 2> CV1;` declaration were dropped by extraction.
19910                                   SelectionDAG &DAG,
19911                                   const X86Subtarget &Subtarget) {
19912  // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19913  // when converting 0 when rounding toward negative infinity. Caller will
19914  // fall back to Expand for when i64 or is legal or use FILD in 32-bit mode.
19915  assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19916  // This algorithm is not obvious. Here it is what we're trying to output:
19917  /*
19918     movq       %rax,  %xmm0
19919     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19920     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19921     #ifdef __SSE3__
19922       haddpd   %xmm0, %xmm0
19923     #else
19924       pshufd   $0x4e, %xmm0, %xmm1
19925       addpd    %xmm1, %xmm0
19926     #endif
19927  */
19928
19929  LLVMContext *Context = DAG.getContext();
19930
19931  // Build some magic constants.
19932  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19933  Constant *C0 = ConstantDataVector::get(*Context, CV0);
19934  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19935  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19936
  // NOTE(review): `SmallVector<Constant *, 2> CV1;` was dropped by extraction
  // just above this line.
19938  CV1.push_back(
19939      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19940                                        APInt(64, 0x4330000000000000ULL))));
19941  CV1.push_back(
19942      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19943                                        APInt(64, 0x4530000000000000ULL))));
19944  Constant *C1 = ConstantVector::get(CV1);
19945  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19946
19947  // Load the 64-bit value into an XMM register.
19948  SDValue XR1 =
19949      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19950  SDValue CLod0 = DAG.getLoad(
19951      MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
  // NOTE(review): the MachinePointerInfo::getConstantPool(...) argument line
  // was dropped by extraction here.
  // Interleave the two u32 halves with the exponent words of c0 (punpckldq).
19953  SDValue Unpck1 =
19954      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19955
19956  SDValue CLod1 = DAG.getLoad(
19957      MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
  // NOTE(review): the MachinePointerInfo argument line was dropped here too.
19959  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19960  // TODO: Are there any fast-math-flags to propagate here?
19961  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19962  SDValue Result;
19963
19964  if (Subtarget.hasSSE3() &&
19965      shouldUseHorizontalOp(true, DAG, Subtarget)) {
19966    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19967  } else {
    // No (profitable) haddpd: swap the two lanes and add.
19968    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19969    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19970  }
19971  Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19972                       DAG.getVectorIdxConstant(0, dl));
19973  return Result;
19974}
19975
19976/// 32-bit unsigned integer to float expansion.
// Treats the u32 as the mantissa of the double 2^52 (by OR-ing in the 2^52
// bias pattern), subtracts the bias, then rounds to the result type.
// NOTE(review): the first signature line (`static SDValue
// LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,`) was dropped by
// extraction.
19978                                   SelectionDAG &DAG,
19979                                   const X86Subtarget &Subtarget) {
19980  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19981  // FP constant to bias correct the final result.
19982  SDValue Bias = DAG.getConstantFP(
19983      llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19984
19985  // Load the 32-bit value into an XMM register.
19986  SDValue Load =
19987      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19988
19989  // Zero out the upper parts of the register.
19990  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19991
19992  // Or the load with the bias.
19993  SDValue Or = DAG.getNode(
19994      ISD::OR, dl, MVT::v2i64,
19995      DAG.getBitcast(MVT::v2i64, Load),
19996      DAG.getBitcast(MVT::v2i64,
19997                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19998  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19999                   DAG.getBitcast(MVT::v2f64, Or),
20000                   DAG.getVectorIdxConstant(0, dl));
20001
20002  if (Op.getNode()->isStrictFPOpcode()) {
20003    // Subtract the bias.
20004    // TODO: Are there any fast-math-flags to propagate here?
20005    SDValue Chain = Op.getOperand(0);
20006    SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20007                              {Chain, Or, Bias});
20008
    // Already the requested type (f64): no rounding step needed.
20009    if (Op.getValueType() == Sub.getValueType())
20010      return Sub;
20011
20012    // Handle final rounding.
20013    std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20014        Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20015
20016    return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20017  }
20018
20019  // Subtract the bias.
20020  // TODO: Are there any fast-math-flags to propagate here?
20021  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20022
20023  // Handle final rounding.
20024  return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20025}
20026
// Lower (STRICT_)UINT_TO_FP of v2i32 -> v2f64. AVX512 uses (widened)
// VCVTUDQ2PD; otherwise the 2^52 mantissa-bias trick is applied per lane.
// NOTE(review): the first signature line (`static SDValue
// lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,`) was dropped by
// extraction.
20028                                     SelectionDAG &DAG,
20029                                     const X86Subtarget &Subtarget) {
20030  if (Op.getSimpleValueType() != MVT::v2f64)
20031    return SDValue();
20032
20033  bool IsStrict = Op->isStrictFPOpcode();
20034
20035  SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20036  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20037
20038  if (Subtarget.hasAVX512()) {
20039    if (!Subtarget.hasVLX()) {
20040      // Let generic type legalization widen this.
20041      if (!IsStrict)
20042        return SDValue();
20043      // Otherwise pad the integer input with 0s and widen the operation.
      // Zero (not undef) padding avoids spurious FP exceptions in strict mode.
20044      N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20045                       DAG.getConstant(0, DL, MVT::v2i32));
20046      SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20047                                {Op.getOperand(0), N0});
20048      SDValue Chain = Res.getValue(1);
20049      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20050                        DAG.getVectorIdxConstant(0, DL));
20051      return DAG.getMergeValues({Res, Chain}, DL);
20052    }
20053
20054    // Legalize to v4i32 type.
20055    N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20056                     DAG.getUNDEF(MVT::v2i32));
20057    if (IsStrict)
20058      return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20059                         {Op.getOperand(0), N0});
20060    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20061  }
20062
20063  // Zero extend to 2i64, OR with the floating point representation of 2^52.
20064  // This gives us the floating point equivalent of 2^52 + the i32 integer
20065  // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20066  // point leaving just our i32 integers in double format.
20067  SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20068  SDValue VBias = DAG.getConstantFP(
20069      llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20070  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20071                           DAG.getBitcast(MVT::v2i64, VBias));
20072  Or = DAG.getBitcast(MVT::v2f64, Or);
20073
20074  if (IsStrict)
20075    return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20076                       {Op.getOperand(0), Or, VBias});
20077  return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20078}
20079
20081 SelectionDAG &DAG,
20082 const X86Subtarget &Subtarget) {
20083 bool IsStrict = Op->isStrictFPOpcode();
20084 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20085 MVT VecIntVT = V.getSimpleValueType();
20086 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20087 "Unsupported custom type");
20088
20089 if (Subtarget.hasAVX512()) {
20090 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20091 assert(!Subtarget.hasVLX() && "Unexpected features");
20092 MVT VT = Op->getSimpleValueType(0);
20093
20094 // v8i32->v8f64 is legal with AVX512 so just return it.
20095 if (VT == MVT::v8f64)
20096 return Op;
20097
20098 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20099 "Unexpected VT!");
20100 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20101 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20102 // Need to concat with zero vector for strict fp to avoid spurious
20103 // exceptions.
20104 SDValue Tmp =
20105 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20106 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20107 DAG.getVectorIdxConstant(0, DL));
20108 SDValue Res, Chain;
20109 if (IsStrict) {
20110 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20111 {Op->getOperand(0), V});
20112 Chain = Res.getValue(1);
20113 } else {
20114 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20115 }
20116
20117 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20118 DAG.getVectorIdxConstant(0, DL));
20119
20120 if (IsStrict)
20121 return DAG.getMergeValues({Res, Chain}, DL);
20122 return Res;
20123 }
20124
20125 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20126 Op->getSimpleValueType(0) == MVT::v4f64) {
20127 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20128 Constant *Bias = ConstantFP::get(
20129 *DAG.getContext(),
20130 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20131 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20132 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20133 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20134 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20135 SDValue VBias = DAG.getMemIntrinsicNode(
20136 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20139
20140 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20141 DAG.getBitcast(MVT::v4i64, VBias));
20142 Or = DAG.getBitcast(MVT::v4f64, Or);
20143
20144 if (IsStrict)
20145 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20146 {Op.getOperand(0), Or, VBias});
20147 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20148 }
20149
20150 // The algorithm is the following:
20151 // #ifdef __SSE4_1__
20152 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20153 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20154 // (uint4) 0x53000000, 0xaa);
20155 // #else
20156 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20157 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20158 // #endif
20159 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20160 // return (float4) lo + fhi;
20161
20162 bool Is128 = VecIntVT == MVT::v4i32;
20163 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20164 // If we convert to something else than the supported type, e.g., to v4f64,
20165 // abort early.
20166 if (VecFloatVT != Op->getSimpleValueType(0))
20167 return SDValue();
20168
  // In the #ifdef/#else code, we have in common:
20170 // - The vector of constants:
20171 // -- 0x4b000000
20172 // -- 0x53000000
20173 // - A shift:
20174 // -- v >> 16
20175
20176 // Create the splat vector for 0x4b000000.
20177 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20178 // Create the splat vector for 0x53000000.
20179 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20180
20181 // Create the right shift.
20182 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20183 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20184
20185 SDValue Low, High;
20186 if (Subtarget.hasSSE41()) {
20187 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20188 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20189 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20190 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20191 // Low will be bitcasted right away, so do not bother bitcasting back to its
20192 // original type.
20193 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20194 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20195 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20196 // (uint4) 0x53000000, 0xaa);
20197 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20198 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20199 // High will be bitcasted right away, so do not bother bitcasting back to
20200 // its original type.
20201 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20202 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20203 } else {
20204 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20205 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20206 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20207 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20208
20209 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20210 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20211 }
20212
20213 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20214 SDValue VecCstFSub = DAG.getConstantFP(
20215 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20216
20217 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20218 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20219 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20220 // enabled. See PR24512.
20221 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20222 // TODO: Are there any fast-math-flags to propagate here?
20223 // (float4) lo;
20224 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20225 // return (float4) lo + fhi;
20226 if (IsStrict) {
20227 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20228 {Op.getOperand(0), HighBitcast, VecCstFSub});
20229 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20230 {FHigh.getValue(1), LowBitcast, FHigh});
20231 }
20232
20233 SDValue FHigh =
20234 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20235 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20236}
20237
                                   const X86Subtarget &Subtarget) {
  // Strict FP nodes carry the chain in operand 0; the value operand follows.
  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
  SDValue N0 = Op.getOperand(OpNo);
  MVT SrcVT = N0.getSimpleValueType();

  // Dispatch on the integer source type to the matching custom expansion.
  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
  case MVT::v2i64:
  case MVT::v4i64:
    // i64-element sources share one helper for signed/unsigned conversions.
    return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
  }
}
20257
// Custom lowering for (STRICT_)UINT_TO_FP. Tries progressively cheaper
// strategies based on source/destination types and subtarget features,
// ultimately falling back to an x87 FILD of the raw 64-bit value plus a
// "fudge factor" addition that corrects the signed interpretation.
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  // Strict FP nodes carry the chain in operand 0; the value operand follows.
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstVT = Op->getSimpleValueType(0);
  SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

  // Bail out when we don't have native conversion instructions.
  if (DstVT == MVT::f128)
    return SDValue();

  if (isSoftF16(DstVT, Subtarget))
    return promoteXINT_TO_FP(Op, dl, DAG);
  else if (isLegalConversion(SrcVT, false, Subtarget))
    return Op;

  // Vector conversions have their own dispatch helper.
  if (DstVT.isVector())
    return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);

  // Win64 i128 conversions go through a runtime library call.
  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
    return LowerWin64_INT128_TO_FP(Op, DAG);

  if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
    return Extract;

  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
    // Conversions from unsigned i32 to f32/f64 are legal,
    // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
    return Op;
  }

  // Promote i32 to i64 and use a signed conversion on 64-bit targets.
  if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
                         {Chain, Src});
    return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
  }

  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
    return V;
  if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
    return V;

  // The transform for i64->f64 isn't correct for 0 when rounding to negative
  // infinity. It produces -0.0, so disable under strictfp.
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
      !IsStrict)
    return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
  // The transform for i32->f64/f32 isn't correct for 0 when rounding to
  // negative infinity. So disable under strictfp. Using FILD instead.
  if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
      !IsStrict)
    return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
      (DstVT == MVT::f32 || DstVT == MVT::f64))
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  Align SlotAlign(8);
  MachinePointerInfo MPI =
  if (SrcVT == MVT::i32) {
    // Store the i32 in the low half and zero the high half so the slot holds
    // the zero-extended 64-bit value for FILD.
    SDValue OffsetSlot =
        DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
    SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
                                  OffsetSlot, MPI.getWithOffset(4), SlotAlign);
    std::pair<SDValue, SDValue> Tmp =
        BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
    if (IsStrict)
      return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);

    return Tmp.first;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue ValueToStore = Src;
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  }
  SDValue Store =
      DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. We must be careful to do the computation in x87 extended
  // precision, not in SSE.
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = {Store, StackSlot};
  SDValue Fild =
      DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
                              SlotAlign, MachineMemOperand::MOLoad);
  Chain = Fild.getValue(1);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(
      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
      Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

  // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
  // 0x5F800000 is 2^64 as an f32; adding it corrects the signed FILD result
  // when the input's sign bit was set.
  APInt FF(64, 0x5F80000000000000ULL);
  SDValue FudgePtr =
      DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
  Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0, dl);
  SDValue Four = DAG.getIntPtrConstant(4, dl);
  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  SDValue Fudge = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
      CPAlignment);
  Chain = Fudge.getValue(1);
  // Extend everything to 80 bits to force it to be done on x87.
  // TODO: Are there any fast-math-flags to propagate here?
  if (IsStrict) {
    unsigned Opc = ISD::STRICT_FADD;
    // Windows needs the precision control changed to 80bits around this add.
    if (Subtarget.isOSWindows() && DstVT == MVT::f32)

    SDValue Add =
        DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
    // STRICT_FP_ROUND can't handle equal types.
    if (DstVT == MVT::f80)
      return Add;
    return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
                       {Add.getValue(1), Add,
                        DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
  }
  unsigned Opc = ISD::FADD;
  // Windows needs the precision control changed to 80bits around this add.
  if (Subtarget.isOSWindows() && DstVT == MVT::f32)
    Opc = X86ISD::FP80_ADD;

  SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                     DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
}
20411
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence (x87 FIST into a
// stack slot plus a reload) and return the result; \p Chain receives the
// chain of the emitted memory operations.
SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                           bool IsSigned,
                                           SDValue &Chain) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  // Strict FP nodes carry the chain in operand 0; the value operand follows.
  SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
  EVT TheVT = Value.getValueType();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return SDValue();
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64. A FIST is always
  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;

  // FIXME: This does not generate an invalid exception if the input does not
  // fit in i32. PR44019
  if (!IsSigned && DstTy != MVT::i64) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    // The low 32 bits of the fist result will have the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  unsigned MemSize = DstTy.getStoreSize();
  int SSFI =
      MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

  if (UnsignedFixup) {
    //
    // Conversion to unsigned i64 is implemented with a select,
    // depending on whether the source value fits in the range
    // of a signed i64. Let Thresh be the FP equivalent of
    // 0x8000000000000000ULL.
    //
    // Adjust = (Value >= Thresh) ? 0x8000000000000000ULL : 0;
    // FltOfs = (Value >= Thresh) ? Thresh : 0.0;
    // FistSrc = (Value - FltOfs);
    // Fist-to-mem64 FistSrc
    // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
    // to XOR'ing the high 32 bits with Adjust.
    //
    // Being a power of 2, Thresh is exactly representable in all FP formats.
    // For X87 we'd like to use the smallest FP type for this constant, but
    // for DAG type consistency we have to match the FP operand type.

    // 0x5f000000 is the f32 bit pattern of 2^63 (i.e. Thresh).
    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
                     &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);

    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");

    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

    EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
                                   *DAG.getContext(), TheVT);
    SDValue Cmp;
    if (IsStrict) {
      // Use a signaling compare so an SNaN raises invalid under strict FP.
      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
                         /*IsSignaling*/ true);
      Chain = Cmp.getValue(1);
    } else {
      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
    }

    // Our preferred lowering of
    //
    // (Value >= Thresh) ? 0x8000000000000000ULL : 0
    //
    // is
    //
    // (Value >= Thresh) << 63
    //
    // but since we can get here after LegalOperations, DAGCombine might do the
    // wrong thing if we create a select. So, directly create the preferred
    // version.
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
    SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
    Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);

    // FP offset subtracted from Value so the FIST input fits in signed i64.
    SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
                                   DAG.getConstantFP(0.0, DL, TheVT));

    if (IsStrict) {
      Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
                          { Chain, Value, FltOfs });
      Chain = Value.getValue(1);
    } else
      Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
  }


  // FIXME This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    // Spill the SSE value and reload it onto the x87 stack (FLD) so the
    // FIST below can consume it.
    Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
    SDValue Ops[] = { Chain, StackSlot };

    unsigned FLDSize = TheVT.getStoreSize();
    assert(FLDSize <= MemSize && "Stack slot not big enough");
        MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
    Chain = Value.getValue(1);
  }

  // Build the FP_TO_INT*_IN_MEM
      MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
  SDValue Ops[] = { Chain, Value, StackSlot };
                                         DAG.getVTList(MVT::Other),
                                         Ops, DstTy, MMO);

  SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
  Chain = Res.getValue(1);

  // If we need an unsigned fixup, XOR the result with adjust.
  if (UnsignedFixup)
    Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);

  return Res;
}
20571
                              const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  unsigned Opc = Op.getOpcode();

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
         "Unexpected extension opcode");
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  // *_EXTEND_VECTOR_INREG form of the extension opcode, used for the low half.
  unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);

  // Without BWI a v32i16 result must be produced as two v16i16 halves.
  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
    assert(InVT == MVT::v32i8 && "Unexpected VT!");
    return splitVectorIntUnary(Op, DAG, dl);
  }

  // AVX2 handles these extensions natively.
  if (Subtarget.hasInt256())
    return Op;

  // Optimize vectors in AVX mode:
  //
  // v8i16 -> v8i32
  // Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
  // Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  // Concat upper and lower parts.
  //
  // v4i32 -> v4i64
  // Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
  // Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  // Concat upper and lower parts.
  //
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);

  // Short-circuit if we can determine that each 128-bit half is the same value.
  // Otherwise, this is difficult to match and optimize.
  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
    if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);

  // High half: unpack against zero (zext) or undef (anyext) to widen elements.
  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Opc == ISD::ZERO_EXTEND;
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  OpHi = DAG.getBitcast(HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
20632
20633// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20634static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20635 const SDLoc &dl, SelectionDAG &DAG) {
20636 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20637 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20638 DAG.getVectorIdxConstant(0, dl));
20639 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20640 DAG.getVectorIdxConstant(8, dl));
20641 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20642 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20643 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20644 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20645}
20646
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  unsigned NumElts = VT.getVectorNumElements();

  // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
  // avoids a constant pool load.
  if (VT.getVectorElementType() != MVT::i8) {
    SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
    // Logical shift right by (bits-1) turns the all-ones sext lanes into 1.
    return DAG.getNode(ISD::SRL, DL, VT, Extend,
                       DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
  }

  // Extend VT if BWI is not supported.
  MVT ExtVT = VT;
  if (!Subtarget.hasBWI()) {
    // If v16i32 is to be avoided, we'll need to split and concatenate.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
      return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);

    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
  }

  // Widen to 512-bits if VLX is not supported.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    NumElts *= 512 / ExtVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, NumElts);
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
                     DAG.getVectorIdxConstant(0, DL));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
  }

  // Materialize the zero-extension as a per-lane select of 1 or 0.
  SDValue One = DAG.getConstant(1, DL, WideVT);
  SDValue Zero = DAG.getConstant(0, DL, WideVT);

  SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);

  // Truncate if we had to extend above.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(MVT::i8, NumElts);
    SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
  }

  // Extract back to 128/256-bit if we widened.
  if (WideVT != VT)
    SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
                              DAG.getVectorIdxConstant(0, DL));

  return SelectedVal;
}
20702
                                SelectionDAG &DAG) {
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();
  SDLoc DL(Op);

  // vXi1 mask sources get a dedicated select-based lowering.
  if (SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);

  // Everything else reaching this custom hook is an AVX extension.
  assert(Subtarget.hasAVX() && "Expected AVX support");
  return LowerAVXExtend(Op, DL, DAG, Subtarget);
}
20715
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
/// Returns SDValue() when the truncation cannot be performed this way.
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
                                      const SDLoc &DL, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
         "Unexpected PACK opcode");
  assert(DstVT.isVector() && "VT not a vector?");

  // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;

  unsigned NumElems = SrcVT.getVectorNumElements();
  if (NumElems < 2 || !isPowerOf2_32(NumElems) )
    return SDValue();

  unsigned DstSizeInBits = DstVT.getSizeInBits();
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

  LLVMContext &Ctx = *DAG.getContext();
  // One PACK stage halves the element width; PackedVT is the post-stage type.
  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  EVT InVT = MVT::i16, OutVT = MVT::i8;
  if (SrcVT.getScalarSizeInBits() > 16 &&
      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
    InVT = MVT::i32;
    OutVT = MVT::i16;
  }

  // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
  // On pre-AVX512, pack the src in both halves to help value tracking.
  if (SrcSizeInBits <= 128) {
    InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
    OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
    In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
    SDValue LHS = DAG.getBitcast(InVT, In);
    SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
    Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  // Split lower/upper subvectors.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = splitVector(In, DAG, DL);

  // If Hi is undef, then don't bother packing it and widen the result instead.
  if (Hi.isUndef()) {
    EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
    if (SDValue Res =
            truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
      return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
  }

  unsigned SubSizeInBits = SrcSizeInBits / 2;
  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }

  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
    int Scale = 64 / OutVT.getScalarSizeInBits();
    narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
    Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);

    // If 512bit -> 128bit truncate another stage.
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");

  if (PackedVT.is128BitVector()) {
    // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
    // type legalization.
    SDValue Res =
        truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
  Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
20838
/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// MaskX = X & 0xffff (clear high bits to prevent saturation)
/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // Zero the to-be-discarded upper bits first so PACKUS cannot saturate.
  In = DAG.getZeroExtendInReg(In, DL, DstVT);
  return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
}
20849
/// Truncate using inreg sign extension and X86ISD::PACKSS.
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  EVT SrcVT = In.getValueType();
  // Replicate the destination's sign bit through the source's upper bits so
  // PACKSS saturation reproduces a plain truncation.
  In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
                   DAG.getValueType(DstVT));
  return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
}
20859
/// Helper to determine if \p In truncated to \p DstVT has the necessary
/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
/// possibly by converting a SRL node to SRA for sign extension.
/// On success, sets \p PackOpcode to PACKSS or PACKUS and returns the
/// (possibly rewritten) source value; otherwise returns SDValue().
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
                                     SDValue In, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     const SDNodeFlags Flags = SDNodeFlags()) {
  // Requires SSE2.
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT SrcVT = In.getValueType();
  EVT DstSVT = DstVT.getVectorElementType();
  EVT SrcSVT = SrcVT.getVectorElementType();
  unsigned NumDstEltBits = DstSVT.getSizeInBits();
  unsigned NumSrcEltBits = SrcSVT.getSizeInBits();

  // Check we have a truncation suited for PACKSS/PACKUS.
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
    return SDValue();

  assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
  // Number of halving PACK stages needed for this element-width reduction.
  unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);

  // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
  // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
  // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
  if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
      (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
      (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
    return SDValue();

  // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
  // split this for packing.
  if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
      !isFreeToSplitVector(In.getNode(), DAG) &&
      (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
    return SDValue();

  // Don't truncate AVX512 targets as multiple PACK nodes stages.
  if (Subtarget.hasAVX512() && NumStages > 1)
    return SDValue();

  // PACK*WB packs to i8 (8 sign/zero bits used); PACK*DW packs to i16.
  unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

  // Truncate with PACKUS if we are truncating a vector with leading zero
  // bits that extend all the way to the packed/truncated value.
  // e.g. Masks, zext_in_reg, etc.
  // Pre-SSE41 we can only use PACKUSWB.
  KnownBits Known = DAG.computeKnownBits(In);
  if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
      (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
    PackOpcode = X86ISD::PACKUS;
    return In;
  }

  // Truncate with PACKSS if we are truncating a vector with sign-bits
  // that extend all the way to the packed/truncated value.
  // e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);

  // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
  // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
  // see through BITCASTs later on and combines/simplifications can't then use
  // it.
  if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
      !Subtarget.hasAVX512())
    return SDValue();

  unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
  if (Flags.hasNoSignedWrap() || MinSignBits < NumSignBits) {
    PackOpcode = X86ISD::PACKSS;
    return In;
  }

  // If we have a srl that only generates signbits that we will discard in
  // the truncation then we can use PACKSS by converting the srl to a sra.
  // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
  if (In.getOpcode() == ISD::SRL && In->hasOneUse())
    if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
      if (*ShAmt == MinSignBits) {
        PackOpcode = X86ISD::PACKSS;
        return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
      }
    }

  return SDValue();
}
20951
20952/// This function lowers a vector truncation of 'extended sign-bits' or
20953/// 'extended zero-bits' values.
20954/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20956 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
20957 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
20958 MVT SrcVT = In.getSimpleValueType();
20959 MVT DstSVT = DstVT.getVectorElementType();
20960 MVT SrcSVT = SrcVT.getVectorElementType();
20961 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20962 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20963 return SDValue();
20964
20965 // If the upper half of the source is undef, then attempt to split and
20966 // only truncate the lower half.
20967 if (DstVT.getSizeInBits() >= 128) {
20968 SmallVector<SDValue> LowerOps;
20969 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20970 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20971 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20972 Subtarget, DAG))
20973 return widenSubVector(Res, false, Subtarget, DAG, DL,
20974 DstVT.getSizeInBits());
20975 }
20976 }
20977
20978 unsigned PackOpcode;
20979 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
20980 Subtarget, Flags))
20981 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20982
20983 return SDValue();
20984}
20985
20986/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20987/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20989 const X86Subtarget &Subtarget,
20990 SelectionDAG &DAG) {
20991 MVT SrcVT = In.getSimpleValueType();
20992 MVT DstSVT = DstVT.getVectorElementType();
20993 MVT SrcSVT = SrcVT.getVectorElementType();
20994 unsigned NumElems = DstVT.getVectorNumElements();
20995 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20996 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20997 NumElems >= 8))
20998 return SDValue();
20999
21000 // SSSE3's pshufb results in less instructions in the cases below.
21001 if (Subtarget.hasSSSE3() && NumElems == 8) {
21002 if (SrcSVT == MVT::i16)
21003 return SDValue();
21004 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21005 return SDValue();
21006 }
21007
21008 // If the upper half of the source is undef, then attempt to split and
21009 // only truncate the lower half.
21010 if (DstVT.getSizeInBits() >= 128) {
21011 SmallVector<SDValue> LowerOps;
21012 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21013 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21014 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21015 return widenSubVector(Res, false, Subtarget, DAG, DL,
21016 DstVT.getSizeInBits());
21017 }
21018 }
21019
21020 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21021 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21022 // truncate 2 x v4i32 to v8i16.
21023 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21024 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21025
21026 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21027 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21028
21029 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21030 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21031 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21032 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21033 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21034 }
21035
21036 return SDValue();
21037}
21038
21040 SelectionDAG &DAG,
21041 const X86Subtarget &Subtarget) {
21042 MVT VT = Op.getSimpleValueType();
21043 SDValue In = Op.getOperand(0);
21044 MVT InVT = In.getSimpleValueType();
21045 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21046
21047 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21048 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21049 if (InVT.getScalarSizeInBits() <= 16) {
21050 if (Subtarget.hasBWI()) {
21051 // legal, will go to VPMOVB2M, VPMOVW2M
21052 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21053 // We need to shift to get the lsb into sign position.
21054 // Shift packed bytes not supported natively, bitcast to word
21055 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21056 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21057 DAG.getBitcast(ExtVT, In),
21058 DAG.getConstant(ShiftInx, DL, ExtVT));
21059 In = DAG.getBitcast(InVT, In);
21060 }
21061 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21062 In, ISD::SETGT);
21063 }
21064 // Use TESTD/Q, extended vector to packed dword/qword.
21065 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21066 "Unexpected vector type.");
21067 unsigned NumElts = InVT.getVectorNumElements();
21068 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21069 // We need to change to a wider element type that we have support for.
21070 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21071 // For 16 element vectors we extend to v16i32 unless we are explicitly
21072 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21073 // we need to split into two 8 element vectors which we can extend to v8i32,
21074 // truncate and concat the results. There's an additional complication if
21075 // the original type is v16i8. In that case we can't split the v16i8
21076 // directly, so we need to shuffle high elements to low and use
21077 // sign_extend_vector_inreg.
21078 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21079 SDValue Lo, Hi;
21080 if (InVT == MVT::v16i8) {
21081 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21082 Hi = DAG.getVectorShuffle(
21083 InVT, DL, In, In,
21084 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21085 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21086 } else {
21087 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21088 Lo = extract128BitVector(In, 0, DAG, DL);
21089 Hi = extract128BitVector(In, 8, DAG, DL);
21090 }
21091 // We're split now, just emit two truncates and a concat. The two
21092 // truncates will trigger legalization to come back to this function.
21093 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21094 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21095 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21096 }
21097 // We either have 8 elements or we're allowed to use 512-bit vectors.
21098 // If we have VLX, we want to use the narrowest vector that can get the
21099 // job done so we use vXi32.
21100 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21101 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21102 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21103 InVT = ExtVT;
21104 ShiftInx = InVT.getScalarSizeInBits() - 1;
21105 }
21106
21107 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21108 // We need to shift to get the lsb into sign position.
21109 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21110 DAG.getConstant(ShiftInx, DL, InVT));
21111 }
21112 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21113 if (Subtarget.hasDQI())
21114 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21115 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21116}
21117
21118SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21119 SDLoc DL(Op);
21120 MVT VT = Op.getSimpleValueType();
21121 SDValue In = Op.getOperand(0);
21122 MVT InVT = In.getSimpleValueType();
21124 "Invalid TRUNCATE operation");
21125
21126 // If we're called by the type legalizer, handle a few cases.
21127 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21128 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21129 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21130 VT.is128BitVector() && Subtarget.hasAVX512()) {
21131 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21132 "Unexpected subtarget!");
21133 // The default behavior is to truncate one step, concatenate, and then
21134 // truncate the remainder. We'd rather produce two 64-bit results and
21135 // concatenate those.
21136 SDValue Lo, Hi;
21137 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21138
21139 EVT LoVT, HiVT;
21140 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21141
21142 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21143 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21144 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21145 }
21146
21147 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21148 if (!Subtarget.hasAVX512() ||
21149 (InVT.is512BitVector() && VT.is256BitVector()))
21151 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21152 return SignPack;
21153
21154 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21155 if (!Subtarget.hasAVX512())
21156 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21157
21158 // Otherwise let default legalization handle it.
21159 return SDValue();
21160 }
21161
21162 if (VT.getVectorElementType() == MVT::i1)
21163 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21164
21165 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21166 // concat from subvectors to use VPTRUNC etc.
21167 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
21169 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21170 return SignPack;
21171
21172 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21173 if (Subtarget.hasAVX512()) {
21174 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21175 assert(VT == MVT::v32i8 && "Unexpected VT!");
21176 return splitVectorIntUnary(Op, DAG, DL);
21177 }
21178
21179 // word to byte only under BWI. Otherwise we have to promoted to v16i32
21180 // and then truncate that. But we should only do that if we haven't been
21181 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21182 // handled by isel patterns.
21183 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21184 Subtarget.canExtendTo512DQ())
21185 return Op;
21186 }
21187
21188 // Handle truncation of V256 to V128 using shuffles.
21189 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21190
21191 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21192 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21193 if (Subtarget.hasInt256()) {
21194 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21195 In = DAG.getBitcast(MVT::v8i32, In);
21196 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21197 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21198 DAG.getVectorIdxConstant(0, DL));
21199 }
21200
21201 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21202 DAG.getVectorIdxConstant(0, DL));
21203 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21204 DAG.getVectorIdxConstant(2, DL));
21205 static const int ShufMask[] = {0, 2, 4, 6};
21206 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21207 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21208 }
21209
21210 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21211 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21212 if (Subtarget.hasInt256()) {
21213 // The PSHUFB mask:
21214 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21215 -1, -1, -1, -1, -1, -1, -1, -1,
21216 16, 17, 20, 21, 24, 25, 28, 29,
21217 -1, -1, -1, -1, -1, -1, -1, -1 };
21218 In = DAG.getBitcast(MVT::v32i8, In);
21219 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21220 In = DAG.getBitcast(MVT::v4i64, In);
21221
21222 static const int ShufMask2[] = {0, 2, -1, -1};
21223 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21224 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21225 DAG.getVectorIdxConstant(0, DL));
21226 return DAG.getBitcast(MVT::v8i16, In);
21227 }
21228
21229 return Subtarget.hasSSE41()
21230 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21231 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21232 }
21233
21234 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21235 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21236
21237 llvm_unreachable("All 256->128 cases should have been handled above!");
21238}
21239
21240// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21241// behaves on out of range inputs to generate optimized conversions.
21243 SelectionDAG &DAG,
21244 const X86Subtarget &Subtarget) {
21245 MVT SrcVT = Src.getSimpleValueType();
21246 unsigned DstBits = VT.getScalarSizeInBits();
21247 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21248
21249 // Calculate the converted result for values in the range 0 to
21250 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21251 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21252 SDValue Big =
21253 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21254 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21255 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21256
21257 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21258 // and only if the value was out of range. So we can use that
21259 // as our indicator that we rather use "Big" instead of "Small".
21260 //
21261 // Use "Small" if "IsOverflown" has all bits cleared
21262 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21263
21264 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21265 // use the slightly slower blendv select instead.
21266 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21267 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21268 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21269 }
21270
21271 SDValue IsOverflown =
21272 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21273 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21274 return DAG.getNode(ISD::OR, dl, VT, Small,
21275 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21276}
21277
21278SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21279 bool IsStrict = Op->isStrictFPOpcode();
21280 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21281 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21282 MVT VT = Op->getSimpleValueType(0);
21283 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21284 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21285 MVT SrcVT = Src.getSimpleValueType();
21286 SDLoc dl(Op);
21287
21288 SDValue Res;
21289 if (isSoftF16(SrcVT, Subtarget)) {
21290 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21291 if (IsStrict)
21292 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21293 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21294 {NVT, MVT::Other}, {Chain, Src})});
21295 return DAG.getNode(Op.getOpcode(), dl, VT,
21296 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21297 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
21298 return Op;
21299 }
21300
21301 if (VT.isVector()) {
21302 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21303 MVT ResVT = MVT::v4i32;
21304 MVT TruncVT = MVT::v4i1;
21305 unsigned Opc;
21306 if (IsStrict)
21308 else
21309 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21310
21311 if (!IsSigned && !Subtarget.hasVLX()) {
21312 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21313 // Widen to 512-bits.
21314 ResVT = MVT::v8i32;
21315 TruncVT = MVT::v8i1;
21316 Opc = Op.getOpcode();
21317 // Need to concat with zero vector for strict fp to avoid spurious
21318 // exceptions.
21319 // TODO: Should we just do this for non-strict as well?
21320 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21321 : DAG.getUNDEF(MVT::v8f64);
21322 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21323 DAG.getVectorIdxConstant(0, dl));
21324 }
21325 if (IsStrict) {
21326 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21327 Chain = Res.getValue(1);
21328 } else {
21329 Res = DAG.getNode(Opc, dl, ResVT, Src);
21330 }
21331
21332 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21333 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21334 DAG.getVectorIdxConstant(0, dl));
21335 if (IsStrict)
21336 return DAG.getMergeValues({Res, Chain}, dl);
21337 return Res;
21338 }
21339
21340 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21341 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
21342 return Op;
21343
21344 MVT ResVT = VT;
21345 MVT EleVT = VT.getVectorElementType();
21346 if (EleVT != MVT::i64)
21347 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21348
21349 if (SrcVT != MVT::v8f16) {
21350 SDValue Tmp =
21351 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21352 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21353 Ops[0] = Src;
21354 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21355 }
21356
21357 if (IsStrict) {
21358 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21360 dl, {ResVT, MVT::Other}, {Chain, Src});
21361 Chain = Res.getValue(1);
21362 } else {
21363 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21364 ResVT, Src);
21365 }
21366
21367 // TODO: Need to add exception check code for strict FP.
21368 if (EleVT.getSizeInBits() < 16) {
21369 ResVT = MVT::getVectorVT(EleVT, 8);
21370 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21371 }
21372
21373 if (ResVT != VT)
21374 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21375 DAG.getVectorIdxConstant(0, dl));
21376
21377 if (IsStrict)
21378 return DAG.getMergeValues({Res, Chain}, dl);
21379 return Res;
21380 }
21381
21382 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21383 if (VT.getVectorElementType() == MVT::i16) {
21384 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21385 SrcVT.getVectorElementType() == MVT::f64) &&
21386 "Expected f32/f64 vector!");
21387 MVT NVT = VT.changeVectorElementType(MVT::i32);
21388 if (IsStrict) {
21389 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21391 dl, {NVT, MVT::Other}, {Chain, Src});
21392 Chain = Res.getValue(1);
21393 } else {
21394 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21395 NVT, Src);
21396 }
21397
21398 // TODO: Need to add exception check code for strict FP.
21399 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21400
21401 if (IsStrict)
21402 return DAG.getMergeValues({Res, Chain}, dl);
21403 return Res;
21404 }
21405
21406 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21407 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21408 assert(!IsSigned && "Expected unsigned conversion!");
21409 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21410 return Op;
21411 }
21412
21413 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21414 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21415 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21416 Subtarget.useAVX512Regs()) {
21417 assert(!IsSigned && "Expected unsigned conversion!");
21418 assert(!Subtarget.hasVLX() && "Unexpected features!");
21419 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21420 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21421 // Need to concat with zero vector for strict fp to avoid spurious
21422 // exceptions.
21423 // TODO: Should we just do this for non-strict as well?
21424 SDValue Tmp =
21425 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21426 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21427 DAG.getVectorIdxConstant(0, dl));
21428
21429 if (IsStrict) {
21430 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21431 {Chain, Src});
21432 Chain = Res.getValue(1);
21433 } else {
21434 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21435 }
21436
21437 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21438 DAG.getVectorIdxConstant(0, dl));
21439
21440 if (IsStrict)
21441 return DAG.getMergeValues({Res, Chain}, dl);
21442 return Res;
21443 }
21444
21445 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21446 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21447 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21448 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21449 assert(!Subtarget.hasVLX() && "Unexpected features!");
21450 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21451 // Need to concat with zero vector for strict fp to avoid spurious
21452 // exceptions.
21453 // TODO: Should we just do this for non-strict as well?
21454 SDValue Tmp =
21455 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21456 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21457 DAG.getVectorIdxConstant(0, dl));
21458
21459 if (IsStrict) {
21460 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21461 {Chain, Src});
21462 Chain = Res.getValue(1);
21463 } else {
21464 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21465 }
21466
21467 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21468 DAG.getVectorIdxConstant(0, dl));
21469
21470 if (IsStrict)
21471 return DAG.getMergeValues({Res, Chain}, dl);
21472 return Res;
21473 }
21474
21475 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21476 if (!Subtarget.hasVLX()) {
21477 // Non-strict nodes without VLX can we widened to v4f32->v4i64 by type
21478 // legalizer and then widened again by vector op legalization.
21479 if (!IsStrict)
21480 return SDValue();
21481
21482 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21483 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21484 {Src, Zero, Zero, Zero});
21485 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21486 {Chain, Tmp});
21487 SDValue Chain = Tmp.getValue(1);
21488 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21489 DAG.getVectorIdxConstant(0, dl));
21490 return DAG.getMergeValues({Tmp, Chain}, dl);
21491 }
21492
21493 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21494 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21495 DAG.getUNDEF(MVT::v2f32));
21496 if (IsStrict) {
21497 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21499 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21500 }
21501 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21502 return DAG.getNode(Opc, dl, VT, Tmp);
21503 }
21504
21505 // Generate optimized instructions for pre AVX512 unsigned conversions from
21506 // vXf32 to vXi32.
21507 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21508 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21509 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21510 assert(!IsSigned && "Expected unsigned conversion!");
21511 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21512 }
21513
21514 return SDValue();
21515 }
21516
21517 assert(!VT.isVector());
21518
21519 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21520
21521 if (!IsSigned && UseSSEReg) {
21522 // Conversions from f32/f64 with AVX512 should be legal.
21523 if (Subtarget.hasAVX512())
21524 return Op;
21525
21526 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21527 // behaves on out of range inputs to generate optimized conversions.
21528 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21529 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21530 unsigned DstBits = VT.getScalarSizeInBits();
21531 APInt UIntLimit = APInt::getSignMask(DstBits);
21532 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21533 DAG.getConstant(UIntLimit, dl, VT));
21534 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21535
21536 // Calculate the converted result for values in the range:
21537 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21538 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21539 SDValue Small =
21540 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21541 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21542 SDValue Big = DAG.getNode(
21543 X86ISD::CVTTS2SI, dl, VT,
21544 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21545 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21546
21547 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21548 // and only if the value was out of range. So we can use that
21549 // as our indicator that we rather use "Big" instead of "Small".
21550 //
21551 // Use "Small" if "IsOverflown" has all bits cleared
21552 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21553 SDValue IsOverflown = DAG.getNode(
21554 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21555 return DAG.getNode(ISD::OR, dl, VT, Small,
21556 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21557 }
21558
21559 // Use default expansion for i64.
21560 if (VT == MVT::i64)
21561 return SDValue();
21562
21563 assert(VT == MVT::i32 && "Unexpected VT!");
21564
21565 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21566 // FIXME: This does not generate an invalid exception if the input does not
21567 // fit in i32. PR44019
21568 if (Subtarget.is64Bit()) {
21569 if (IsStrict) {
21570 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21571 {Chain, Src});
21572 Chain = Res.getValue(1);
21573 } else
21574 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21575
21576 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21577 if (IsStrict)
21578 return DAG.getMergeValues({Res, Chain}, dl);
21579 return Res;
21580 }
21581
21582 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21583 // use fisttp which will be handled later.
21584 if (!Subtarget.hasSSE3())
21585 return SDValue();
21586 }
21587
21588 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21589 // FIXME: This does not generate an invalid exception if the input does not
21590 // fit in i16. PR44019
21591 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21592 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21593 if (IsStrict) {
21594 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21595 {Chain, Src});
21596 Chain = Res.getValue(1);
21597 } else
21598 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21599
21600 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21601 if (IsStrict)
21602 return DAG.getMergeValues({Res, Chain}, dl);
21603 return Res;
21604 }
21605
21606 // If this is a FP_TO_SINT using SSEReg we're done.
21607 if (UseSSEReg && IsSigned)
21608 return Op;
21609
21610 // fp128 needs to use a libcall.
21611 if (SrcVT == MVT::f128) {
21612 RTLIB::Libcall LC;
21613 if (IsSigned)
21614 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21615 else
21616 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21617
21618 MakeLibCallOptions CallOptions;
21619 std::pair<SDValue, SDValue> Tmp =
21620 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21621
21622 if (IsStrict)
21623 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21624
21625 return Tmp.first;
21626 }
21627
21628 // Fall back to X87.
21629 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21630 if (IsStrict)
21631 return DAG.getMergeValues({V, Chain}, dl);
21632 return V;
21633 }
21634
21635 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21636}
21637
21638SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21639 SelectionDAG &DAG) const {
21640 SDValue Src = Op.getOperand(0);
21641 EVT DstVT = Op.getSimpleValueType();
21642 MVT SrcVT = Src.getSimpleValueType();
21643
21644 if (SrcVT.isVector())
21645 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21646
21647 if (SrcVT == MVT::f16)
21648 return SDValue();
21649
21650 // If the source is in an SSE register, the node is Legal.
21651 if (isScalarFPTypeInSSEReg(SrcVT))
21652 return Op;
21653
21654 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21655}
21656
21657SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21658 SelectionDAG &DAG) const {
21659 EVT DstVT = N->getValueType(0);
21660 SDValue Src = N->getOperand(0);
21661 EVT SrcVT = Src.getValueType();
21662
21663 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21664 // f16 must be promoted before using the lowering in this routine.
21665 // fp128 does not use this lowering.
21666 return SDValue();
21667 }
21668
21669 SDLoc DL(N);
21670 SDValue Chain = DAG.getEntryNode();
21671
21672 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21673
21674 // If we're converting from SSE, the stack slot needs to hold both types.
21675 // Otherwise it only needs to hold the DstVT.
21676 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21677 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21678 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21679 MachinePointerInfo MPI =
21681
21682 if (UseSSE) {
21683 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21684 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21685 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21686 SDValue Ops[] = { Chain, StackPtr };
21687
21688 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21689 /*Align*/ std::nullopt,
21691 Chain = Src.getValue(1);
21692 }
21693
21694 SDValue StoreOps[] = { Chain, Src, StackPtr };
21695 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21696 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21698
21699 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21700}
21701
/// Custom lowering for FP_TO_SINT_SAT / FP_TO_UINT_SAT: clamp the FP source
/// to the representable integer range before converting (when the bounds are
/// exactly representable), or convert first and patch up out-of-range/NaN
/// inputs with compare+select otherwise.
21702SDValue
21703X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21704 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21705 // but making use of X86 specifics to produce better instruction sequences.
21706 SDNode *Node = Op.getNode();
21707 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21708 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21709 SDLoc dl(SDValue(Node, 0));
21710 SDValue Src = Node->getOperand(0);
21711
21712 // There are three types involved here: SrcVT is the source floating point
21713 // type, DstVT is the type of the result, and TmpVT is the result of the
21714 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21715 // DstVT).
21716 EVT SrcVT = Src.getValueType();
21717 EVT DstVT = Node->getValueType(0);
21718 EVT TmpVT = DstVT;
21719
21720 // This code is only for floats and doubles. Fall back to generic code for
21721 // anything else.
21722 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
21723 return SDValue();
21724
21725 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21726 unsigned SatWidth = SatVT.getScalarSizeInBits();
21727 unsigned DstWidth = DstVT.getScalarSizeInBits();
21728 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21729 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21730 "Expected saturation width smaller than result width");
21731
21732 // Promote result of FP_TO_*INT to at least 32 bits.
21733 if (TmpWidth < 32) {
21734 TmpVT = MVT::i32;
21735 TmpWidth = 32;
21736 }
21737
21738 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21739 // us to use a native signed conversion instead.
21740 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21741 TmpVT = MVT::i64;
21742 TmpWidth = 64;
21743 }
21744
21745 // If the saturation width is smaller than the size of the temporary result,
21746 // we can always use signed conversion, which is native.
21747 if (SatWidth < TmpWidth)
21748 FpToIntOpcode = ISD::FP_TO_SINT;
21749
21750 // Determine minimum and maximum integer values and their corresponding
21751 // floating-point values.
21752 APInt MinInt, MaxInt;
21753 if (IsSigned) {
21754 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21755 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21756 } else {
21757 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21758 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21759 }
21760
21761 const fltSemantics &Sem = SrcVT.getFltSemantics();
21762 APFloat MinFloat(Sem);
21763 APFloat MaxFloat(Sem);
21764
 // Convert the integer bounds to FP, rounding toward zero so the FP bounds
 // never lie outside the saturation range.
21765 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21766 MinInt, IsSigned, APFloat::rmTowardZero);
21767 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21768 MaxInt, IsSigned, APFloat::rmTowardZero);
21769 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21770 && !(MaxStatus & APFloat::opStatus::opInexact);
21771
21772 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21773 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21774
21775 // If the integer bounds are exactly representable as floats, emit a
21776 // min+max+fptoi sequence. Otherwise use comparisons and selects.
21777 if (AreExactFloatBounds) {
21778 if (DstVT != TmpVT) {
21779 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21780 SDValue MinClamped = DAG.getNode(
21781 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21782 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21783 SDValue BothClamped = DAG.getNode(
21784 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21785 // Convert clamped value to integer.
21786 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21787
21788 // NaN will become INDVAL, with the top bit set and the rest zero.
21789 // Truncation will discard the top bit, resulting in zero.
21790 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21791 }
21792
21793 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21794 SDValue MinClamped = DAG.getNode(
21795 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21796 // Clamp by MaxFloat from above. NaN cannot occur.
21797 SDValue BothClamped = DAG.getNode(
21798 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21799 // Convert clamped value to integer.
21800 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21801
21802 if (!IsSigned) {
21803 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21804 // which is zero.
21805 return FpToInt;
21806 }
21807
21808 // Otherwise, select zero if Src is NaN.
21809 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21810 return DAG.getSelectCC(
21811 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21812 }
21813
 // Inexact-bounds fallback: convert directly, then replace out-of-range and
 // NaN results via compare+select against the FP bounds.
21814 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21815 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21816
21817 // Result of direct conversion, which may be selected away.
21818 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21819
21820 if (DstVT != TmpVT) {
21821 // NaN will become INDVAL, with the top bit set and the rest zero.
21822 // Truncation will discard the top bit, resulting in zero.
21823 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21824 }
21825
21826 SDValue Select = FpToInt;
21827 // For signed conversions where we saturate to the same size as the
21828 // result type of the fptoi instructions, INDVAL coincides with integer
21829 // minimum, so we don't need to explicitly check it.
21830 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21831 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21832 // MinInt if Src is NaN.
21833 Select = DAG.getSelectCC(
21834 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21835 }
21836
21837 // If Src OGT MaxFloat, select MaxInt.
21838 Select = DAG.getSelectCC(
21839 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21840
21841 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21842 // is already zero. The promoted case was already handled above.
21843 if (!IsSigned || DstVT != TmpVT) {
21844 return Select;
21845 }
21846
21847 // Otherwise, select 0 if Src is NaN.
21848 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21849 return DAG.getSelectCC(
21850 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21851}
21852
/// Custom lowering for (STRICT_)FP_EXTEND. Handles scalar f16 sources via
/// FP16/F16C instructions or a Darwin soft-float libcall, and half/bf16/f32
/// vector sources via the VFPEXT family.
21853SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21854 bool IsStrict = Op->isStrictFPOpcode();
21855
21856 SDLoc DL(Op);
21857 MVT VT = Op.getSimpleValueType();
21858 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21859 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21860 MVT SVT = In.getSimpleValueType();
21861
21862 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21863 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
21864 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21865 !Subtarget.getTargetTriple().isOSDarwin()))
21866 return SDValue();
21867
21868 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21869 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21870 return Op;
21871
21872 if (SVT == MVT::f16) {
21873 if (Subtarget.hasFP16())
21874 return Op;
21875
 // Extend f16 to a wider type than f32 by first extending to f32 (the only
 // width with direct f16 support), then extending again.
21876 if (VT != MVT::f32) {
21877 if (IsStrict)
21878 return DAG.getNode(
21879 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21880 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21881 {MVT::f32, MVT::Other}, {Chain, In})});
21882
21883 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21884 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21885 }
21886
21887 if (!Subtarget.hasF16C()) {
21888 if (!Subtarget.getTargetTriple().isOSDarwin())
21889 return SDValue();
21890
21891 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21892
21893 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21895 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21896
 // The half is passed to the libcall as its raw i16 bits.
21897 In = DAG.getBitcast(MVT::i16, In);
21900 Entry.Node = In;
21901 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21902 Entry.IsSExt = false;
21903 Entry.IsZExt = true;
21904 Args.push_back(Entry);
21905
21907 getLibcallName(RTLIB::FPEXT_F16_F32),
21909 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21910 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21911 std::move(Args));
21912
21913 SDValue Res;
21914 std::tie(Res,Chain) = LowerCallTo(CLI);
21915 if (IsStrict)
21916 Res = DAG.getMergeValues({Res, Chain}, DL);
21917
21918 return Res;
21919 }
21920
 // F16C path: insert the i16 half bits into a zero v8i16, convert with
 // (V)CVTPH2PS, and extract the low f32 lane.
21921 In = DAG.getBitcast(MVT::i16, In);
21922 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21923 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21924 DAG.getVectorIdxConstant(0, DL));
21925 SDValue Res;
21926 if (IsStrict) {
21927 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21928 {Chain, In});
21929 Chain = Res.getValue(1);
21930 } else {
21931 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21932 DAG.getTargetConstant(4, DL, MVT::i32));
21933 }
21934 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21935 DAG.getVectorIdxConstant(0, DL));
21936 if (IsStrict)
21937 return DAG.getMergeValues({Res, Chain}, DL);
21938 return Res;
21939 }
21940
21941 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
21942 return Op;
21943
 // Sub-128-bit f16 vectors: widen to v8f16 with undef elements and use the
 // vector fpext node directly.
21944 if (SVT.getVectorElementType() == MVT::f16) {
21945 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21946 return Op;
21947 assert(Subtarget.hasF16C() && "Unexpected features!");
21948 if (SVT == MVT::v2f16)
21949 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21950 DAG.getUNDEF(MVT::v2f16));
21951 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21952 DAG.getUNDEF(MVT::v4f16));
21953 if (IsStrict)
21954 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21955 {Op->getOperand(0), Res});
21956 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21957 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21958 return Op;
21959 }
21960
21961 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21962
21963 SDValue Res =
21964 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21965 if (IsStrict)
21966 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21967 {Op->getOperand(0), Res});
21968 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21969}
21970
/// Custom lowering for (STRICT_)FP_ROUND. Scalar rounds to f16 go through
/// (V)CVTPS2PH when F16C is available, a Darwin soft-float libcall otherwise;
/// bf16 results are only legal with native bf16 converts.
21971SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21972 bool IsStrict = Op->isStrictFPOpcode();
21973
21974 SDLoc DL(Op);
21975 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21976 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21977 MVT VT = Op.getSimpleValueType();
21978 MVT SVT = In.getSimpleValueType();
21979
21980 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21981 return SDValue();
21982
 // No usable hardware f16 round: only a Darwin libcall is available (ABI
 // passes/returns the half as raw i16 bits).
21983 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21984 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21985 if (!Subtarget.getTargetTriple().isOSDarwin())
21986 return SDValue();
21987
21988 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
21990 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21991
21994 Entry.Node = In;
21995 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21996 Entry.IsSExt = false;
21997 Entry.IsZExt = true;
21998 Args.push_back(Entry);
21999
22001 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22002 : RTLIB::FPROUND_F32_F16),
22004 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22005 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22006 std::move(Args));
22007
22008 SDValue Res;
22009 std::tie(Res, Chain) = LowerCallTo(CLI);
22010
 // Reinterpret the returned i16 bits as an f16 value.
22011 Res = DAG.getBitcast(MVT::f16, Res);
22012
22013 if (IsStrict)
22014 Res = DAG.getMergeValues({Res, Chain}, DL);
22015
22016 return Res;
22017 }
22018
22019 if (VT.getScalarType() == MVT::bf16) {
22020 if (SVT.getScalarType() == MVT::f32 &&
22021 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22022 Subtarget.hasAVXNECONVERT()))
22023 return Op;
22024 return SDValue();
22025 }
22026
 // F16C path: widen the f32 to a v4f32, convert with (V)CVTPS2PH, and pull
 // the low i16 half bits back out.
22027 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22028 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22029 return SDValue();
22030
22031 if (VT.isVector())
22032 return Op;
22033
22034 SDValue Res;
22036 MVT::i32);
22037 if (IsStrict) {
22038 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22039 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22040 DAG.getVectorIdxConstant(0, DL));
22041 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22042 {Chain, Res, Rnd});
22043 Chain = Res.getValue(1);
22044 } else {
22045 // FIXME: Should we use zeros for upper elements for non-strict?
22046 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22047 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22048 }
22049
22050 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22051 DAG.getVectorIdxConstant(0, DL));
22052 Res = DAG.getBitcast(MVT::f16, Res);
22053
22054 if (IsStrict)
22055 return DAG.getMergeValues({Res, Chain}, DL);
22056
22057 return Res;
22058 }
22059
22060 return Op;
22061}
22062
 // Body of the FP16_TO_FP lowering (the signature line was lost in this
 // rendering). Converts raw i16 half bits to f32 via (V)CVTPH2PS: insert the
 // bits into a zero v8i16, convert, and extract the low f32 lane.
22064 bool IsStrict = Op->isStrictFPOpcode();
22065 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22066 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22067 "Unexpected VT!");
22068
 // Zero the upper lanes so no garbage participates in the conversion.
22069 SDLoc dl(Op);
22070 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22071 DAG.getConstant(0, dl, MVT::v8i16), Src,
22072 DAG.getVectorIdxConstant(0, dl));
22073
 // Strict FP must thread the chain through the conversion node.
22074 SDValue Chain;
22075 if (IsStrict) {
22076 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22077 {Op.getOperand(0), Res});
22078 Chain = Res.getValue(1);
22079 } else {
22080 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22081 }
22082
22083 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22084 DAG.getVectorIdxConstant(0, dl));
22085
22086 if (IsStrict)
22087 return DAG.getMergeValues({Res, Chain}, dl);
22088
22089 return Res;
22090}
22091
 // Body of the FP_TO_FP16 lowering (the signature line was lost in this
 // rendering). Converts f32 to raw i16 half bits via (V)CVTPS2PH: widen to
 // v4f32, convert, extract the low i16 element.
22093 bool IsStrict = Op->isStrictFPOpcode();
22094 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22095 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22096 "Unexpected VT!");
22097
22098 SDLoc dl(Op);
22099 SDValue Res, Chain;
22100 if (IsStrict) {
 // Zero the upper lanes for the strict form so no spurious exceptions
 // can be raised by garbage elements.
22101 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22102 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22103 DAG.getVectorIdxConstant(0, dl));
 // NOTE(review): the immediate 4 appears to select dynamic (MXCSR)
 // rounding for CVTPS2PH — confirm against the Intel SDM imm8 encoding.
22104 Res = DAG.getNode(
22105 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22106 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22107 Chain = Res.getValue(1);
22108 } else {
22109 // FIXME: Should we use zeros for upper elements for non-strict?
22110 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22111 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22112 DAG.getTargetConstant(4, dl, MVT::i32));
22113 }
22114
22115 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22116 DAG.getVectorIdxConstant(0, dl));
22117
22118 if (IsStrict)
22119 return DAG.getMergeValues({Res, Chain}, dl);
22120
22121 return Res;
22122}
22123
22124SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22125 SelectionDAG &DAG) const {
22126 SDLoc DL(Op);
22127
22128 MVT SVT = Op.getOperand(0).getSimpleValueType();
22129 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22130 Subtarget.hasAVXNECONVERT())) {
22131 SDValue Res;
22132 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22133 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22134 Res = DAG.getBitcast(MVT::v8i16, Res);
22135 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22136 DAG.getVectorIdxConstant(0, DL));
22137 }
22138
22139 MakeLibCallOptions CallOptions;
22140 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22141 SDValue Res =
22142 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22143 return DAG.getBitcast(MVT::i16, Res);
22144}
22145
22146/// Depending on uarch and/or optimizing for size, we might prefer to use a
22147/// vector operation in place of the typical scalar operation.
22149 SelectionDAG &DAG,
22150 const X86Subtarget &Subtarget) {
22151 // If both operands have other uses, this is probably not profitable.
22152 SDValue LHS = Op.getOperand(0);
22153 SDValue RHS = Op.getOperand(1);
22154 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22155 return Op;
22156
22157 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22158 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22159 if (IsFP && !Subtarget.hasSSE3())
22160 return Op;
22161 if (!IsFP && !Subtarget.hasSSSE3())
22162 return Op;
22163
 // Both operands must be constant-index extracts from the same vector, and
 // the target must consider a horizontal op profitable here.
22164 // Extract from a common vector.
22165 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22166 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22167 LHS.getOperand(0) != RHS.getOperand(0) ||
22168 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22169 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22170 !shouldUseHorizontalOp(true, DAG, Subtarget))
22171 return Op;
22172
22173 // Allow commuted 'hadd' ops.
22174 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22175 unsigned HOpcode;
22176 switch (Op.getOpcode()) {
22177 // clang-format off
22178 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22179 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22180 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22181 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22182 default:
22183 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22184 // clang-format on
22185 }
 // Add is commutative, so (odd, even) index pairs can be swapped into the
 // canonical (even, odd) order; subtract cannot be commuted.
22186 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22187 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22188 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22189 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22190 std::swap(LExtIndex, RExtIndex);
22191
 // Horizontal ops combine adjacent even/odd element pairs, so the indices
 // must be (2k, 2k+1).
22192 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22193 return Op;
22194
22195 SDValue X = LHS.getOperand(0);
22196 EVT VecVT = X.getValueType();
22197 unsigned BitWidth = VecVT.getSizeInBits();
22198 unsigned NumLanes = BitWidth / 128;
22199 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22200 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22201 "Not expecting illegal vector widths here");
22202
22203 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22204 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22205 if (BitWidth == 256 || BitWidth == 512) {
22206 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22207 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22208 LExtIndex %= NumEltsPerLane;
22209 }
22210
22211 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22212 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22213 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22214 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22215 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22216 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22217 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22218}
22219
22220/// Depending on uarch and/or optimizing for size, we might prefer to use a
22221/// vector operation in place of the typical scalar operation.
22222SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22223 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22224 "Only expecting float/double");
22225 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22226}
22227
22228/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22229/// This mode isn't supported in hardware on X86. But as long as we aren't
22230/// compiling with trapping math, we can emulate this with
22231/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22233 SDValue N0 = Op.getOperand(0);
22234 SDLoc dl(Op);
22235 MVT VT = Op.getSimpleValueType();
22236
22237 // N0 += copysign(nextafter(0.5, 0.0), N0)
 // Using the largest representable value strictly below 0.5 (rather than 0.5
 // itself) avoids incorrectly rounding away values just under a half-way
 // point, where adding exactly 0.5 could round up in the FADD.
22238 const fltSemantics &Sem = VT.getFltSemantics();
22239 bool Ignored;
22240 APFloat Point5Pred = APFloat(0.5f);
22241 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22242 Point5Pred.next(/*nextDown*/true);
22243
 // copysign makes the adjustment move away from zero for both signs.
22244 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22245 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22246 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22247
22248 // Truncate the result to remove fraction.
22249 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22250}
22251
22252/// The only differences between FABS and FNEG are the mask and the logic op.
22253/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22255 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22256 "Wrong opcode for lowering FABS or FNEG.");
22257
22258 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22259
22260 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22261 // into an FNABS. We'll lower the FABS after that if it is still in use.
22262 if (IsFABS)
22263 for (SDNode *User : Op->users())
22264 if (User->getOpcode() == ISD::FNEG)
22265 return Op;
22266
22267 SDLoc dl(Op);
22268 MVT VT = Op.getSimpleValueType();
22269
22270 bool IsF128 = (VT == MVT::f128);
22271 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22273 "Unexpected type in LowerFABSorFNEG");
22274
22275 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22276 // decide if we should generate a 16-byte constant mask when we only need 4 or
22277 // 8 bytes for the scalar case.
22278
22279 // There are no scalar bitwise logical SSE/AVX instructions, so we
22280 // generate a 16-byte vector constant and logic op even for the scalar case.
22281 // Using a 16-byte mask allows folding the load of the mask with
22282 // the logic op, so it can save (~4 bytes) on code size.
22283 bool IsFakeVector = !VT.isVector() && !IsF128;
22284 MVT LogicVT = VT;
22285 if (IsFakeVector)
22286 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22287 : (VT == MVT::f32) ? MVT::v4f32
22288 : MVT::v8f16;
22289
22290 unsigned EltBits = VT.getScalarSizeInBits();
22291 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22292 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22293 APInt::getSignMask(EltBits);
22294 const fltSemantics &Sem = VT.getFltSemantics();
22295 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22296
 // FNEG of an FABS folds to FNABS (set the sign bit with FOR); otherwise
 // FABS clears the sign with FAND and FNEG flips it with the xor-style op.
22297 SDValue Op0 = Op.getOperand(0);
22298 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22299 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22300 IsFNABS ? X86ISD::FOR :
22302 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22303
22304 if (VT.isVector() || IsF128)
22305 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22306
22307 // For the scalar case extend to a 128-bit vector, perform the logic op,
22308 // and extract the scalar result back out.
22309 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22310 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22311 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22312 DAG.getVectorIdxConstant(0, dl));
22314
 // Body of the FCOPYSIGN lowering (the signature line was lost in this
 // rendering): result = (Mag & ~signbit) | (Sign & signbit), computed with
 // vector FP logic ops since SSE has no scalar bitwise instructions.
22316 SDValue Mag = Op.getOperand(0);
22317 SDValue Sign = Op.getOperand(1);
22318 SDLoc dl(Op);
22319
22320 // If the sign operand is smaller, extend it first.
22321 MVT VT = Op.getSimpleValueType();
22322 if (Sign.getSimpleValueType().bitsLT(VT))
22323 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22324
22325 // And if it is bigger, shrink it first.
22326 if (Sign.getSimpleValueType().bitsGT(VT))
22327 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22328 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22329
22330 // At this point the operands and the result should have the same
22331 // type, and that won't be f80 since that is not custom lowered.
22332 bool IsF128 = (VT == MVT::f128);
22333 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22335 "Unexpected type in LowerFCOPYSIGN");
22336
22337 const fltSemantics &Sem = VT.getFltSemantics();
22338
22339 // Perform all scalar logic operations as 16-byte vectors because there are no
22340 // scalar FP logic instructions in SSE.
22341 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22342 // unnecessary splats, but we might miss load folding opportunities. Should
22343 // this decision be based on OptimizeForSize?
22344 bool IsFakeVector = !VT.isVector() && !IsF128;
22345 MVT LogicVT = VT;
22346 if (IsFakeVector)
22347 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22348 : (VT == MVT::f32) ? MVT::v4f32
22349 : MVT::v8f16;
22350
22351 // The mask constants are automatically splatted for vector types.
22352 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22353 SDValue SignMask = DAG.getConstantFP(
22354 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22355 SDValue MagMask = DAG.getConstantFP(
22356 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22357
22358 // First, clear all bits but the sign bit from the second operand (sign).
22359 if (IsFakeVector)
22360 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22361 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22362
22363 // Next, clear the sign bit from the first operand (magnitude).
22364 // TODO: If we had general constant folding for FP logic ops, this check
22365 // wouldn't be necessary.
22366 SDValue MagBits;
22367 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
 // Constant magnitude: fold the sign-clear at compile time.
22368 APFloat APF = Op0CN->getValueAPF();
22369 APF.clearSign();
22370 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22371 } else {
22372 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22373 if (IsFakeVector)
22374 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22375 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22376 }
22377
22378 // OR the magnitude value with the sign bit.
22379 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22380 return !IsFakeVector ? Or
22381 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22382 DAG.getVectorIdxConstant(0, dl));
22383}
22384
 // Body of the FGETSIGN lowering (the signature line was lost in this
 // rendering): extract the sign bit of a scalar f32/f64 with MOVMSK.
22386 SDValue N0 = Op.getOperand(0);
22387 SDLoc dl(Op);
22388 MVT VT = Op.getSimpleValueType();
22389
22390 MVT OpVT = N0.getSimpleValueType();
22391 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22392 "Unexpected type for FGETSIGN");
22393
22394 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
 // MOVMSK packs the per-element sign bits into the low bits of an i32; the
 // scalar value sits in element 0, so masking with 1 yields its sign bit.
22395 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22396 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22397 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22398 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22399 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22400 return Res;
22401}
22402
22403/// Helper for attempting to create a X86ISD::BT node.
22404static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22405 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22406 // instruction. Since the shift amount is in-range-or-undefined, we know
22407 // that doing a bittest on the i32 value is ok. We extend to i32 because
22408 // the encoding for the i16 version is larger than the i32 version.
22409 // Also promote i16 to i32 for performance / code size reason.
22410 if (Src.getValueType().getScalarSizeInBits() < 32)
22411 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22412
22413 // No legal type found, give up.
22414 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22415 return SDValue();
22416
22417 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22418 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22419 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22420 // known to be zero.
22421 if (Src.getValueType() == MVT::i64 &&
22422 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22423 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22424
22425 // If the operand types disagree, extend the shift amount to match. Since
22426 // BT ignores high bits (like shifts) we can use anyextend.
22427 if (Src.getValueType() != BitNo.getValueType()) {
22428 // Peek through a mask/modulo operation.
22429 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22430 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22431 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22432 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22433 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22434 BitNo.getOperand(0)),
22435 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22436 BitNo.getOperand(1)));
22437 else
22438 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22439 }
22440
22441 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22442}
22443
22444/// Helper for creating a X86ISD::SETCC node.
/// The condition code is encoded as an i8 target-constant operand; the node
/// produces the usual i8 0/1 result driven by the given EFLAGS value.
22446 SelectionDAG &DAG) {
22447 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22448 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22449}
22450
22451/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22452/// recognizable memcmp expansion.
22453static bool isOrXorXorTree(SDValue X, bool Root = true) {
22454 if (X.getOpcode() == ISD::OR)
22455 return isOrXorXorTree(X.getOperand(0), false) &&
22456 isOrXorXorTree(X.getOperand(1), false);
22457 if (Root)
22458 return false;
22459 return X.getOpcode() == ISD::XOR;
22460}
22461
22462/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22463/// expansion.
22464template <typename F>
22466 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22467 SDValue Op0 = X.getOperand(0);
22468 SDValue Op1 = X.getOperand(1);
 // Interior OR node: recursively emit both subtrees, then combine. With
 // PTEST (or a mask-type CmpVT) an OR of the per-leaf results suffices;
 // otherwise the per-leaf SETEQ results must all hold, hence AND.
22469 if (X.getOpcode() == ISD::OR) {
22470 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22471 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22472 if (VecVT != CmpVT)
22473 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22474 if (HasPT)
22475 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22476 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22477 }
 // Leaf XOR node: vectorize both scalar operands via the SToV callback and
 // compare them (or, with PTEST, just XOR so the caller can test for zero).
22478 if (X.getOpcode() == ISD::XOR) {
22479 SDValue A = SToV(Op0);
22480 SDValue B = SToV(Op1);
22481 if (VecVT != CmpVT)
22482 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22483 if (HasPT)
22484 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22485 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22486 }
 // Callers guarantee the tree shape via isOrXorXorTree().
22487 llvm_unreachable("Impossible");
22488}
22489
22490 /// Try to map a 128-bit or larger integer comparison to vector instructions
22491 /// before type legalization splits it up into chunks.
/// Returns the lowered setcc (in result type VT) or an empty SDValue if the
/// combine does not apply.
/// NOTE(review): the first line of this signature is not visible in this
/// excerpt -- it presumably declares the X, Y, CC and VT values used below;
/// confirm against the full file.
22494                                                const SDLoc &DL,
22495                                                SelectionDAG &DAG,
22496                                                const X86Subtarget &Subtarget) {
22497   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22498
22499   // We're looking for an oversized integer equality comparison.
22500   EVT OpVT = X.getValueType();
22501   unsigned OpSize = OpVT.getSizeInBits();
22502   if (!OpVT.isScalarInteger() || OpSize < 128)
22503     return SDValue();
22504
22505   // Ignore a comparison with zero because that gets special treatment in
22506   // EmitTest(). But make an exception for the special case of a pair of
22507   // logically-combined vector-sized operands compared to zero. This pattern may
22508   // be generated by the memcmp expansion pass with oversized integer compares
22509   // (see PR33325).
22510   bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22511   if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22512     return SDValue();
22513
22514   // Don't perform this combine if constructing the vector will be expensive.
  // Cheap sources: constants, values already in vector form, or loads (which
  // can be re-loaded as a vector).
  // NOTE(review): a line (22516) is not visible in this excerpt -- confirm the
  // lambda body against the full file.
22515   auto IsVectorBitCastCheap = [](SDValue X) {
22517     return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22518            X.getOpcode() == ISD::LOAD;
22519   };
22520   if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22521       !IsOrXorXorTreeCCZero)
22522     return SDValue();
22523
22524   // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22525   // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22526   // Otherwise use PCMPEQ (plus AND) and mask testing.
  // NOTE(review): the right-hand side of this initialization (line 22528,
  // presumably querying the function attribute list) is not visible here.
22527   bool NoImplicitFloatOps =
22529           Attribute::NoImplicitFloat);
22530   if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22531       ((OpSize == 128 && Subtarget.hasSSE2()) ||
22532        (OpSize == 256 && Subtarget.hasAVX()) ||
22533        (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22534     bool HasPT = Subtarget.hasSSE41();
22535
22536     // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22537     // vector registers are essentially free. (Technically, widening registers
22538     // prevents load folding, but the tradeoff is worth it.)
22539     bool PreferKOT = Subtarget.preferMaskRegisters();
22540     bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22541
    // Select the vector type to operate in (VecVT), the compare result type
    // (CmpVT -- a vXi1 mask type when KORTEST is preferred), and the type the
    // scalar operands are bitcast to (CastVT).
22542     EVT VecVT = MVT::v16i8;
22543     EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22544     if (OpSize == 256) {
22545       VecVT = MVT::v32i8;
22546       CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22547     }
22548     EVT CastVT = VecVT;
22549     bool NeedsAVX512FCast = false;
22550     if (OpSize == 512 || NeedZExt) {
22551       if (Subtarget.hasBWI()) {
22552         VecVT = MVT::v64i8;
22553         CmpVT = MVT::v64i1;
22554         if (OpSize == 512)
22555           CastVT = VecVT;
22556       } else {
        // Without BWI only 32-bit element mask compares are available.
22557         VecVT = MVT::v16i32;
22558         CmpVT = MVT::v16i1;
22559         CastVT = OpSize == 512 ? VecVT
22560                  : OpSize == 256 ? MVT::v8i32
22561                                  : MVT::v4i32;
22562         NeedsAVX512FCast = true;
22563       }
22564     }
22565
    // Bitcast a scalar operand into CastVT. Peeks through ZERO_EXTEND so a
    // narrower source can instead be widened for free by inserting it into a
    // zero vector of VecVT.
22566     auto ScalarToVector = [&](SDValue X) -> SDValue {
22567       bool TmpZext = false;
22568       EVT TmpCastVT = CastVT;
22569       if (X.getOpcode() == ISD::ZERO_EXTEND) {
22570         SDValue OrigX = X.getOperand(0);
22571         unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22572         if (OrigSize < OpSize) {
22573           if (OrigSize == 128) {
22574             TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22575             X = OrigX;
22576             TmpZext = true;
22577           } else if (OrigSize == 256) {
22578             TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22579             X = OrigX;
22580             TmpZext = true;
22581           }
22582         }
22583       }
22584       X = DAG.getBitcast(TmpCastVT, X);
22585       if (!NeedZExt && !TmpZext)
22586         return X;
      // Zero-extend by inserting the narrow vector into a VecVT zero vector.
22587       return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22588                          DAG.getConstant(0, DL, VecVT), X,
22589                          DAG.getVectorIdxConstant(0, DL));
22590     };
22591
22592     SDValue Cmp;
22593     if (IsOrXorXorTreeCCZero) {
22594       // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22595       // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22596       // Use 2 vector equality compares and 'and' the results before doing a
22597       // MOVMSK.
22598       Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22599     } else {
22600       SDValue VecX = ScalarToVector(X);
22601       SDValue VecY = ScalarToVector(Y);
22602       if (VecVT != CmpVT) {
        // Mask-register path: produce a vXi1 "lanes differ" mask.
22603         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22604       } else if (HasPT) {
        // PTEST path: XOR is all-zero iff the operands are equal.
22605         Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22606       } else {
        // MOVMSK path: lanes equal -> all-ones mask bits.
22607         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22608       }
22609     }
22610     // AVX512 should emit a setcc that will lower to kortest.
22611     if (VecVT != CmpVT) {
22612       EVT KRegVT = CmpVT == MVT::v64i1   ? MVT::i64
22613                    : CmpVT == MVT::v32i1 ? MVT::i32
22614                                          : MVT::i16;
22615       return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22616                           DAG.getConstant(0, DL, KRegVT), CC);
22617     }
22618     if (HasPT) {
22619       SDValue BCCmp =
22620           DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
      // PTEST sets ZF when its (AND of) operands is all-zero, i.e. equality.
      // NOTE(review): the line (22622) computing X86CC from CC is not visible
      // in this excerpt -- confirm against the full file.
22621       SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22623       SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22624       return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22625     }
22626     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22627     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22628     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22629     assert(Cmp.getValueType() == MVT::v16i8 &&
22630            "Non 128-bit vector on pre-SSE41 target");
22631     SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22632     SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22633     return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22634   }
22635
22636   return SDValue();
22637}
22638
22639 /// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22640 /// style scalarized (associative) reduction patterns. Partial reductions
22641 /// are supported when the pointer SrcMask is non-null.
22642 /// TODO - move this to SelectionDAG?
/// Returns true if Op is such a tree; the distinct source vectors are
/// appended to SrcOps and, when SrcMask is non-null, the per-source lane
/// masks are appended to it.
/// NOTE(review): the opening signature line (declaring Op, BinOp and SrcOps)
/// and the worklist declaration are not visible in this excerpt -- confirm
/// against the full file.
22645                                  SmallVectorImpl<APInt> *SrcMask = nullptr) {
  // Maps each distinct source vector to the set of lanes extracted so far.
22647   DenseMap<SDValue, APInt> SrcOpMap;
22648   EVT VT = MVT::Other;
22649
22650   // Recognize a special case where a vector is casted into wide integer to
22651   // test all 0s.
22652   assert(Op.getOpcode() == unsigned(BinOp) &&
22653          "Unexpected bit reduction opcode");
22654   Opnds.push_back(Op.getOperand(0));
22655   Opnds.push_back(Op.getOperand(1));
22656
  // Worklist scan; 'e' grows as new BinOp operands are appended below.
22657   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22659     // BFS traverse all BinOp operands.
22660     if (I->getOpcode() == unsigned(BinOp)) {
22661       Opnds.push_back(I->getOperand(0));
22662       Opnds.push_back(I->getOperand(1));
22663       // Re-evaluate the number of nodes to be traversed.
22664       e += 2; // 2 more nodes (LHS and RHS) are pushed.
22665       continue;
22666     }
22667
22668     // Quit if a non-EXTRACT_VECTOR_ELT
22669     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22670       return false;
22671
22672     // Quit if without a constant index.
22673     auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22674     if (!Idx)
22675       return false;
22676
    // First time we see this source vector: record it and start its lane set.
22677     SDValue Src = I->getOperand(0);
22678     DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22679     if (M == SrcOpMap.end()) {
22680       VT = Src.getValueType();
22681       // Quit if not the same type.
22682       if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22683         return false;
22684       unsigned NumElts = VT.getVectorNumElements();
22685       APInt EltCount = APInt::getZero(NumElts);
22686       M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22687       SrcOps.push_back(Src);
22688     }
22689
22690     // Quit if element already used.
22691     unsigned CIdx = Idx->getZExtValue();
22692     if (M->second[CIdx])
22693       return false;
22694     M->second.setBit(CIdx);
22695   }
22696
22697   if (SrcMask) {
22698     // Collect the source partial masks.
22699     for (SDValue &SrcOp : SrcOps)
22700       SrcMask->push_back(SrcOpMap[SrcOp]);
22701   } else {
22702     // Quit if not all elements are used.
22703     for (const auto &I : SrcOpMap)
22704       if (!I.second.isAllOnes())
22705         return false;
22706   }
22707
22708   return true;
22709}
22710
22711 // Helper function for comparing all bits of two vectors.
// Produces an X86 flag-setting node (CMP/PTEST/KORTEST) that is zero iff all
// OriginalMask-selected bits of LHS and RHS are equal; X86CC receives the
// condition code (E/NE) to test against it. Returns an empty SDValue when the
// pattern cannot be lowered profitably.
// NOTE(review): the opening signature line (declaring DL, LHS and RHS) is not
// visible in this excerpt -- confirm against the full file.
22713                                   ISD::CondCode CC, const APInt &OriginalMask,
22714                                   const X86Subtarget &Subtarget,
22715                                   SelectionDAG &DAG, X86::CondCode &X86CC) {
22716   EVT VT = LHS.getValueType();
22717   unsigned ScalarSize = VT.getScalarSizeInBits();
22718   if (OriginalMask.getBitWidth() != ScalarSize) {
22719     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22720     return SDValue();
22721   }
22722
22723   // Quit if not convertable to legal scalar or 128/256-bit vector.
22724   if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22725     return SDValue();
22726
22727   // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22728   if (VT.isFloatingPoint())
22729     return SDValue();
22730
22731   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22732   X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22733
22734   APInt Mask = OriginalMask;
22735
  // Apply the per-element mask to a value; no-op when the mask is all-ones.
22736   auto MaskBits = [&](SDValue Src) {
22737     if (Mask.isAllOnes())
22738       return Src;
22739     EVT SrcVT = Src.getValueType();
22740     SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22741     return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22742   };
22743
22744   // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22745   if (VT.getSizeInBits() < 128) {
22746     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22747     if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
      // Illegal i64 (32-bit target): split into i32 halves and OR the XORs,
      // so a single 32-bit CMP-with-zero tests all 64 bits.
22748       if (IntVT != MVT::i64)
22749         return SDValue();
22750       auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22751                                       MVT::i32, MVT::i32);
22752       auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22753                                       MVT::i32, MVT::i32);
22754       SDValue Lo =
22755           DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22756       SDValue Hi =
22757           DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22758       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22759                          DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22760                          DAG.getConstant(0, DL, MVT::i32));
22761     }
22762     return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22763                        DAG.getBitcast(IntVT, MaskBits(LHS)),
22764                        DAG.getBitcast(IntVT, MaskBits(RHS)));
22765   }
22766
22767   // Without PTEST, a masked v2i64 or-reduction is not faster than
22768   // scalarization.
22769   bool UseKORTEST = Subtarget.useAVX512Regs();
22770   bool UsePTEST = Subtarget.hasSSE41();
22771   if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22772     return SDValue();
22773
22774   // Split down to 128/256/512-bit vector.
22775   unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22776
22777   // If the input vector has vector elements wider than the target test size,
22778   // then cast to <X x i64> so it will safely split.
22779   if (ScalarSize > TestSize) {
22780     if (!Mask.isAllOnes())
22781       return SDValue();
22782     VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22783     LHS = DAG.getBitcast(VT, LHS);
22784     RHS = DAG.getBitcast(VT, RHS);
22785     Mask = APInt::getAllOnes(64);
22786   }
22787
  // Reduce oversized vectors down to TestSize before emitting the final test.
22788   if (VT.getSizeInBits() > TestSize) {
22789     KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22790     if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22791       // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22792       while (VT.getSizeInBits() > TestSize) {
22793         auto Split = DAG.SplitVector(LHS, DL);
22794         VT = Split.first.getValueType();
22795         LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22796       }
22797       RHS = DAG.getAllOnesConstant(DL, VT);
22798     } else if (!UsePTEST && !KnownRHS.isZero()) {
22799       // MOVMSK Special Case:
22800       // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22801       MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22802       VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22803       LHS = DAG.getBitcast(VT, MaskBits(LHS));
22804       RHS = DAG.getBitcast(VT, MaskBits(RHS));
22805       EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22806       SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22807       V = DAG.getSExtOrTrunc(V, DL, VT);
22808       while (VT.getSizeInBits() > TestSize) {
22809         auto Split = DAG.SplitVector(V, DL);
22810         VT = Split.first.getValueType();
22811         V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22812       }
      // Invert so "all lanes equal" becomes an all-zero MOVMSK result.
22813       V = DAG.getNOT(DL, V, VT);
22814       V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22815       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22816                          DAG.getConstant(0, DL, MVT::i32));
22817     } else {
22818       // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22819       SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22820       while (VT.getSizeInBits() > TestSize) {
22821         auto Split = DAG.SplitVector(V, DL);
22822         VT = Split.first.getValueType();
22823         V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22824       }
22825       LHS = V;
22826       RHS = DAG.getConstant(0, DL, VT);
22827     }
22828   }
22829
  // KORTEST sets ZF when the whole mask register is zero (no lane differs).
22830   if (UseKORTEST && VT.is512BitVector()) {
22831     MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22832     MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22833     LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22834     RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22835     SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22836     return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22837   }
22838
  // PTEST sets ZF when the XOR'd value is all-zero (operands equal).
22839   if (UsePTEST) {
22840     MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22841     LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22842     RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22843     SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22844     return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22845   }
22846
  // Pre-SSE41 fallback: PCMPEQ + NOT + MOVMSK, then compare the mask to zero.
22847   assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22848   MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22849   LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22850   RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22851   SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22852   V = DAG.getNOT(DL, V, MaskVT);
22853   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22854   return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22855                      DAG.getConstant(0, DL, MVT::i32));
22857
22858 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback
22859 // to CMP(MOVMSK(PCMPEQB(X,Y))).
// LHS is the reduction tree and RHS must be constant 0 (anyof) or all-ones
// (allof). On success returns an X86 flag node via LowerVectorAllEqual and
// sets X86CC; otherwise returns an empty SDValue.
// NOTE(review): the opening signature line (declaring LHS and RHS) is not
// visible in this excerpt -- confirm against the full file.
22861                                       ISD::CondCode CC, const SDLoc &DL,
22862                                       const X86Subtarget &Subtarget,
22863                                       SelectionDAG &DAG,
22864                                       X86::CondCode &X86CC) {
22865   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22866
22867   bool CmpNull = isNullConstant(RHS);
22868   bool CmpAllOnes = isAllOnesConstant(RHS);
22869   if (!CmpNull && !CmpAllOnes)
22870     return SDValue();
22871
22872   SDValue Op = LHS;
22873   if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22874     return SDValue();
22875
22876   // Check whether we're masking/truncating an OR-reduction result, in which
22877   // case track the masked bits.
22878   // TODO: Add CmpAllOnes support.
22879   APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22880   if (CmpNull) {
22881     switch (Op.getOpcode()) {
22882     case ISD::TRUNCATE: {
      // Truncation only keeps the low bits - record that as the mask.
22883       SDValue Src = Op.getOperand(0);
22884       Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22885                                   Op.getScalarValueSizeInBits());
22886       Op = Src;
22887       break;
22888     }
22889     case ISD::AND: {
22890       if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22891         Mask = Cst->getAPIntValue();
22892         Op = Op.getOperand(0);
22893       }
22894       break;
22895     }
22896     }
22897   }
22898
  // anyof compares the OR-reduction to 0; allof compares the AND-reduction
  // to all-ones.
22899   ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22900
22901   // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22902   // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
  // NOTE(review): the declaration of VecIns (line 22903) is not visible in
  // this excerpt -- presumably a SmallVector<SDValue> filled by
  // matchScalarReduction; confirm against the full file.
22904   if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22905     EVT VT = VecIns[0].getValueType();
22906     assert(llvm::all_of(VecIns,
22907                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
22908            "Reduction source vector mismatch");
22909
22910     // Quit if not splittable to scalar/128/256/512-bit vector.
22911     if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22912       return SDValue();
22913
22914     // If more than one full vector is evaluated, AND/OR them first before
22915     // PTEST.
22916     for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22917          Slot += 2, e += 1) {
22918       // Each iteration will AND/OR 2 nodes and append the result until there is
22919       // only 1 node left, i.e. the final value of all vectors.
22920       SDValue LHS = VecIns[Slot];
22921       SDValue RHS = VecIns[Slot + 1];
22922       VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22923     }
22924
22925     return LowerVectorAllEqual(DL, VecIns.back(),
22926                                CmpNull ? DAG.getConstant(0, DL, VT)
22927                                        : DAG.getAllOnesConstant(DL, VT),
22928                                CC, Mask, Subtarget, DAG, X86CC);
22929   }
22930
22931   // Match icmp(reduce_or(X),0) anyof reduction patterns.
22932   // Match icmp(reduce_and(X),-1) allof reduction patterns.
22933   if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22934     ISD::NodeType BinOp;
22935     if (SDValue Match =
22936             DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
      // NOTE(review): the call line (22938) forwarding Match to
      // LowerVectorAllEqual is not visible in this excerpt -- confirm there.
22937       EVT MatchVT = Match.getValueType();
22939                                  CmpNull ? DAG.getConstant(0, DL, MatchVT)
22940                                          : DAG.getAllOnesConstant(DL, MatchVT),
22941                                  CC, Mask, Subtarget, DAG, X86CC);
22942     }
22943   }
22944
22945   if (Mask.isAllOnes()) {
22946     assert(!Op.getValueType().isVector() &&
22947            "Illegal vector type for reduction pattern");
    // NOTE(review): 'Src' is defined on a line (22948) not visible in this
    // excerpt -- presumably Op with bitcasts peeked through; confirm there.
22949     if (Src.getValueType().isFixedLengthVector() &&
22950         Src.getValueType().getScalarType() == MVT::i1) {
22951       // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22952       // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22953       if (Src.getOpcode() == ISD::SETCC) {
22954         SDValue LHS = Src.getOperand(0);
22955         SDValue RHS = Src.getOperand(1);
22956         EVT LHSVT = LHS.getValueType();
22957         ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22958         if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22959             llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22960           APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22961           return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22962                                      X86CC);
22963         }
22964       }
22965       // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22966       // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22967       // Peek through truncation, mask the LSB and compare against zero/LSB.
22968       if (Src.getOpcode() == ISD::TRUNCATE) {
22969         SDValue Inner = Src.getOperand(0);
22970         EVT InnerVT = Inner.getValueType();
22971         if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22972           unsigned BW = InnerVT.getScalarSizeInBits();
22973           APInt SrcMask = APInt(BW, 1);
22974           APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22975           return LowerVectorAllEqual(DL, Inner,
22976                                      DAG.getConstant(Cmp, DL, InnerVT), CC,
22977                                      SrcMask, Subtarget, DAG, X86CC);
22978         }
22979       }
22980     }
22981   }
22982
22983   return SDValue();
22984}
22985
22986 /// return true if \c Op has a use that doesn't just read flags.
/// A "flags-only" user is BRCOND, SETCC, or SELECT using Op as its condition
/// (operand 0); a TRUNCATE with a single use is looked through first.
/// NOTE(review): the signature line (declaring Op) is not visible in this
/// excerpt -- confirm against the full file.
22988   for (SDUse &Use : Op->uses()) {
22989     SDNode *User = Use.getUser();
22990     unsigned UOpNo = Use.getOperandNo();
22991     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22992       // Look past truncate.
22993       UOpNo = User->use_begin()->getOperandNo();
22994       User = User->use_begin()->getUser();
22995     }
22996
    // Any user other than a branch/setcc/select-condition consumes the value
    // itself, not just the flags.
22997     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22998         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22999       return true;
23000   }
23001   return false;
23002}
23003
23004 // Transform to an x86-specific ALU node with flags if there is a chance of
23005 // using an RMW op or only the flags are used. Otherwise, leave
23006 // the node alone and emit a 'cmp' or 'test' instruction.
// Returns true iff every user of Op is a CopyToReg, SETCC or STORE node.
// NOTE(review): the signature line (declaring Op) is not visible in this
// excerpt -- confirm against the full file.
23008   for (SDNode *U : Op->users())
23009     if (U->getOpcode() != ISD::CopyToReg &&
23010         U->getOpcode() != ISD::SETCC &&
23011         U->getOpcode() != ISD::STORE)
23012       return false;
23013
23014   return true;
23015}
23016
23017 /// Emit nodes that will be selected as "test Op0,Op0", or something
23018 /// equivalent.
/// Returns the i32 EFLAGS value to pair with X86CC, reusing the flags result
/// of an existing/new X86 ALU node when profitable, otherwise a CMP-with-0.
/// NOTE(review): the opening signature line (declaring Op, X86CC and dl) is
/// not visible in this excerpt -- confirm against the full file.
23020                      SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23021   // CF and OF aren't always set the way we want. Determine which
23022   // of these we need.
23023   bool NeedCF = false;
23024   bool NeedOF = false;
23025   switch (X86CC) {
23026   default: break;
23027   case X86::COND_A: case X86::COND_AE:
23028   case X86::COND_B: case X86::COND_BE:
23029     NeedCF = true;
23030     break;
23031   case X86::COND_G: case X86::COND_GE:
23032   case X86::COND_L: case X86::COND_LE:
23033   case X86::COND_O: case X86::COND_NO: {
23034     // Check if we really need to set the
23035     // Overflow flag. If NoSignedWrap is present
23036     // that is not actually needed.
23037     switch (Op->getOpcode()) {
23038     case ISD::ADD:
23039     case ISD::SUB:
23040     case ISD::MUL:
23041     case ISD::SHL:
23042       if (Op.getNode()->getFlags().hasNoSignedWrap())
23043         break;
23044       [[fallthrough]];
23045     default:
23046       NeedOF = true;
23047       break;
23048     }
23049     break;
23050   }
23051   }
23052   // See if we can use the EFLAGS value from the operand instead of
23053   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23054   // we prove that the arithmetic won't overflow, we can't use OF or CF.
23055   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23056     // Emit a CMP with 0, which is the TEST pattern.
23057     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23058                        DAG.getConstant(0, dl, Op.getValueType()));
23059   }
23060   unsigned Opcode = 0;
23061   unsigned NumOperands = 0;
23062
23063   SDValue ArithOp = Op;
23064
23065   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23066   // which may be the result of a CAST. We use the variable 'Op', which is the
23067   // non-casted variable when we check for possible users.
23068   switch (ArithOp.getOpcode()) {
23069   case ISD::AND:
23070     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23071     // because a TEST instruction will be better.
23072     if (!hasNonFlagsUse(Op))
23073       break;
23074
23075     [[fallthrough]];
23076   case ISD::ADD:
23077   case ISD::SUB:
23078   case ISD::OR:
23079   case ISD::XOR:
    // NOTE(review): a guard line (23080) is not visible in this excerpt --
    // presumably it checks isProfitableToUseFlagOp(Op) before this early
    // break; confirm against the full file.
23081       break;
23082
23083     // Otherwise use a regular EFLAGS-setting instruction.
23084     switch (ArithOp.getOpcode()) {
23085     // clang-format off
23086     default: llvm_unreachable("unexpected operator!");
23087     case ISD::ADD: Opcode = X86ISD::ADD; break;
23088     case ISD::SUB: Opcode = X86ISD::SUB; break;
23089     case ISD::XOR: Opcode = X86ISD::XOR; break;
23090     case ISD::AND: Opcode = X86ISD::AND; break;
23091     case ISD::OR:  Opcode = X86ISD::OR;  break;
23092     // clang-format on
23093     }
23094
23095     NumOperands = 2;
23096     break;
23097   case X86ISD::ADD:
23098   case X86ISD::SUB:
23099   case X86ISD::OR:
23100   case X86ISD::XOR:
23101   case X86ISD::AND:
    // Already an X86 ALU node - reuse its flags result (value #1) directly.
23102     return SDValue(Op.getNode(), 1);
23103   case ISD::SSUBO:
23104   case ISD::USUBO: {
23105     // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
23106     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23107     return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23108                        Op->getOperand(1)).getValue(1);
23109   }
23110   default:
23111     break;
23112   }
23113
23114   if (Opcode == 0) {
23115     // Emit a CMP with 0, which is the TEST pattern.
23116     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23117                        DAG.getConstant(0, dl, Op.getValueType()));
23118   }
  // Rebuild the arithmetic as a flag-producing X86 node and replace all uses
  // of the original value with it, returning the flags result.
23119   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23120   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23121
23122   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23123   DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23124   return SDValue(New.getNode(), 1);
23125}
23126
23127 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
23128 /// equivalent.
/// Returns the i32 EFLAGS value for comparing Op0 against Op1 under X86CC.
/// NOTE(review): the opening signature line (declaring Op0, Op1 and X86CC)
/// is not visible in this excerpt -- confirm against the full file.
23130                    const SDLoc &dl, SelectionDAG &DAG,
23131                    const X86Subtarget &Subtarget) {
  // Compare against zero lowers to TEST.
23132   if (isNullConstant(Op1))
23133     return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23134
23135   EVT CmpVT = Op0.getValueType();
23136
23137   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23138           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23139
23140   // Only promote the compare up to I32 if it is a 16 bit operation
23141   // with an immediate. 16 bit immediates are to be avoided unless the target
23142   // isn't slowed down by length changing prefixes, we're optimizing for
23143   // codesize or the comparison is with a folded load.
  // NOTE(review): the final clause of this condition (line 23146, presumably
  // an optimize-for-size check) is not visible in this excerpt -- confirm.
23144   if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23145       !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23147     auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23148     auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23149     // Don't do this if the immediate can fit in 8-bits.
23150     if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23151         (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
      // NOTE(review): the initializer (line 23153) is not visible here --
      // presumably it selects SIGN_EXTEND for signed X86CC and ZERO_EXTEND
      // otherwise; confirm against the full file.
23152       unsigned ExtendOp =
23154       if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23155         // For equality comparisons try to use SIGN_EXTEND if the input was
23156         // truncate from something with enough sign bits.
23157         if (Op0.getOpcode() == ISD::TRUNCATE) {
23158           if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23159             ExtendOp = ISD::SIGN_EXTEND;
23160         } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23161           if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23162             ExtendOp = ISD::SIGN_EXTEND;
23163         }
23164       }
23165
23166       CmpVT = MVT::i32;
23167       Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23168       Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23169     }
23170   }
23171
23172   // Try to shrink i64 compares if the input has enough zero bits.
23173   // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
23174   if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23175       Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23176       DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23177       DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23178     CmpVT = MVT::i32;
23179     Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23180     Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23181   }
23182
23183   // 0-x == y --> x+y == 0
23184   // 0-x != y --> x+y != 0
23185   if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23186       Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23187     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23188     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23189     return Add.getValue(1);
23190   }
23191
23192   // x == 0-y --> x+y == 0
23193   // x != 0-y --> x+y != 0
23194   if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23195       Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23196     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23197     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23198     return Add.getValue(1);
23199   }
23200
23201   // If we already have an XOR of the ops, use that to check for equality.
23202   // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23203   unsigned X86Opc = X86ISD::SUB;
23204   if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23205       (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23206        DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23207     X86Opc = X86ISD::XOR;
23208
  // Value result (#0) is discarded by callers; only the flags (#1) are used.
23209   SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23210   SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23211   return CmpOp.getValue(1);
23212}
23213
// Prefer the (X & Y) == 0 form for scalar types and for vector inequality;
// only vector SETEQ keeps the (X & Y) == Y form.
// NOTE(review): the first signature line (the function name and the Cond
// parameter) is not visible in this excerpt -- confirm against the full file.
23215                                                        EVT VT) const {
23216   return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
23217 }
23218
23219bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23220 SDNode *N, SDValue, SDValue IntPow2) const {
23221 if (N->getOpcode() == ISD::FDIV)
23222 return true;
23223
23224 EVT FPVT = N->getValueType(0);
23225 EVT IntVT = IntPow2.getValueType();
23226
23227 // This indicates a non-free bitcast.
23228 // TODO: This is probably overly conservative as we will need to scale the
23229 // integer vector anyways for the int->fp cast.
23230 if (FPVT.isVector() &&
23231 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23232 return false;
23233
23234 return true;
23235}
23236
23237/// Check if replacement of SQRT with RSQRT should be disabled.
23238bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23239 EVT VT = Op.getValueType();
23240
23241 // We don't need to replace SQRT with RSQRT for half type.
23242 if (VT.getScalarType() == MVT::f16)
23243 return true;
23244
23245 // We never want to use both SQRT and RSQRT instructions for the same input.
23246 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23247 return false;
23248
23249 if (VT.isVector())
23250 return Subtarget.hasFastVectorFSQRT();
23251 return Subtarget.hasFastScalarFSQRT();
23252}
23253
23254 /// The minimum architected relative accuracy is 2^-12. We need one
23255 /// Newton-Raphson step to have a good float result (24 bits of precision).
/// Returns an RSQRT-based estimate for sqrt(Op) (or 1/sqrt(Op) when
/// Reciprocal), or an empty SDValue when no profitable estimate exists.
23256 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23257                                            SelectionDAG &DAG, int Enabled,
23258                                            int &RefinementSteps,
23259                                            bool &UseOneConstNR,
23260                                            bool Reciprocal) const {
23261   SDLoc DL(Op);
23262   EVT VT = Op.getValueType();
23263
23264   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23265   // It is likely not profitable to do this for f64 because a double-precision
23266   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23267   // instructions: convert to single, rsqrtss, convert back to double, refine
23268   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23269   // along with FMA, this could be a throughput win.
23270   // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23271   // after legalize types.
23272   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23273       (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23274       (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23275       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23276       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23277     if (RefinementSteps == ReciprocalEstimate::Unspecified)
23278       RefinementSteps = 1;
23279
23280     UseOneConstNR = false;
23281     // There is no FSQRT for 512-bits, but there is RSQRT14.
23282     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23283     SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
    // sqrt(x) without refinement: x * rsqrt(x).
23284     if (RefinementSteps == 0 && !Reciprocal)
23285       Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23286     return Estimate;
23287   }
23288
23289   if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23290       Subtarget.hasFP16()) {
23291     assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23292     if (RefinementSteps == ReciprocalEstimate::Unspecified)
23293       RefinementSteps = 0;
23294
    // Scalar f16: widen to v8f16 for the masked RSQRT14S and extract lane 0.
    // NOTE(review): the definition of 'Zero' (line 23296) is not visible in
    // this excerpt -- presumably an index-0 constant; confirm there.
23295     if (VT == MVT::f16) {
23297       SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23298       Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23299       Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23300       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23301     }
23302
23303     return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23304   }
23305   return SDValue();
23306}
23307
23308 /// The minimum architected relative accuracy is 2^-12. We need one
23309 /// Newton-Raphson step to have a good float result (24 bits of precision).
/// Returns an RCP-based estimate for 1/Op, or an empty SDValue when no
/// profitable estimate exists.
23310 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23311                                             int Enabled,
23312                                             int &RefinementSteps) const {
23313   SDLoc DL(Op);
23314   EVT VT = Op.getValueType();
23315
23316   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23317   // It is likely not profitable to do this for f64 because a double-precision
23318   // reciprocal estimate with refinement on x86 prior to FMA requires
23319   // 15 instructions: convert to single, rcpss, convert back to double, refine
23320   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23321   // along with FMA, this could be a throughput win.
23322
23323   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23324       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23325       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23326       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23327     // Enable estimate codegen with 1 refinement step for vector division.
23328     // Scalar division estimates are disabled because they break too much
23329     // real-world code. These defaults are intended to match GCC behavior.
23330     if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23331       return SDValue();
23332
23333     if (RefinementSteps == ReciprocalEstimate::Unspecified)
23334       RefinementSteps = 1;
23335
23336     // There is no FSQRT for 512-bits, but there is RCP14.
23337     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23338     return DAG.getNode(Opcode, DL, VT, Op);
23339   }
23340
23341   if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23342       Subtarget.hasFP16()) {
23343     if (RefinementSteps == ReciprocalEstimate::Unspecified)
23344       RefinementSteps = 0;
23345
    // Scalar f16: widen to v8f16 for the masked RCP14S and extract lane 0.
    // NOTE(review): the definition of 'Zero' (line 23347) is not visible in
    // this excerpt -- presumably an index-0 constant; confirm there.
23346     if (VT == MVT::f16) {
23348       SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23349       Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23350       Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23351       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23352     }
23353
23354     return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23355   }
23356   return SDValue();
23357}
23358
23359/// If we have at least two divisions that use the same divisor, convert to
23360/// multiplication by a reciprocal. This may need to be adjusted for a given
23361/// CPU if a division's cost is not at least twice the cost of a multiplication.
23362/// This is because we still need one division to calculate the reciprocal and
23363/// then we need two multiplies by that reciprocal as replacements for the
23364/// original divisions.
23365unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23366 return 2;
23367}
23368
23369SDValue
23370X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23371 SelectionDAG &DAG,
23372 SmallVectorImpl<SDNode *> &Created) const {
23374 if (isIntDivCheap(N->getValueType(0), Attr))
23375 return SDValue(N,0); // Lower SDIV as SDIV
23376
23377 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23378 "Unexpected divisor!");
23379
23380 // Only perform this transform if CMOV is supported otherwise the select
23381 // below will become a branch.
23382 if (!Subtarget.canUseCMOV())
23383 return SDValue();
23384
23385 // fold (sdiv X, pow2)
23386 EVT VT = N->getValueType(0);
23387 // FIXME: Support i8.
23388 if (VT != MVT::i16 && VT != MVT::i32 &&
23389 !(Subtarget.is64Bit() && VT == MVT::i64))
23390 return SDValue();
23391
23392 // If the divisor is 2 or -2, the default expansion is better.
23393 if (Divisor == 2 ||
23394 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23395 return SDValue();
23396
23397 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23398}
23399
23400/// Result of 'and' is compared against zero. Change to a BT node if possible.
23401/// Returns the BT node and the condition code needed to use it.
23403 SelectionDAG &DAG, X86::CondCode &X86CC) {
23404 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23405 SDValue Op0 = And.getOperand(0);
23406 SDValue Op1 = And.getOperand(1);
23407 if (Op0.getOpcode() == ISD::TRUNCATE)
23408 Op0 = Op0.getOperand(0);
23409 if (Op1.getOpcode() == ISD::TRUNCATE)
23410 Op1 = Op1.getOperand(0);
23411
23412 SDValue Src, BitNo;
23413 if (Op1.getOpcode() == ISD::SHL)
23414 std::swap(Op0, Op1);
23415 if (Op0.getOpcode() == ISD::SHL) {
23416 if (isOneConstant(Op0.getOperand(0))) {
23417 // If we looked past a truncate, check that it's only truncating away
23418 // known zeros.
23419 unsigned BitWidth = Op0.getValueSizeInBits();
23420 unsigned AndBitWidth = And.getValueSizeInBits();
23421 if (BitWidth > AndBitWidth) {
23422 KnownBits Known = DAG.computeKnownBits(Op0);
23423 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23424 return SDValue();
23425 }
23426 Src = Op1;
23427 BitNo = Op0.getOperand(1);
23428 }
23429 } else if (Op1.getOpcode() == ISD::Constant) {
23430 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23431 uint64_t AndRHSVal = AndRHS->getZExtValue();
23432 SDValue AndLHS = Op0;
23433
23434 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23435 Src = AndLHS.getOperand(0);
23436 BitNo = AndLHS.getOperand(1);
23437 } else {
23438 // Use BT if the immediate can't be encoded in a TEST instruction or we
23439 // are optimizing for size and the immedaite won't fit in a byte.
23440 bool OptForSize = DAG.shouldOptForSize();
23441 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23442 isPowerOf2_64(AndRHSVal)) {
23443 Src = AndLHS;
23444 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23445 Src.getValueType());
23446 }
23447 }
23448 }
23449
23450 // No patterns found, give up.
23451 if (!Src.getNode())
23452 return SDValue();
23453
23454 // Remove any bit flip.
23455 if (isBitwiseNot(Src)) {
23456 Src = Src.getOperand(0);
23458 }
23459
23460 // Attempt to create the X86ISD::BT node.
23461 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23462 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23463 return BT;
23464 }
23465
23466 return SDValue();
23467}
23468
23469// Check if pre-AVX condcode can be performed by a single FCMP op.
23470static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23471 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23472}
23473
23474/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23475/// CMPs.
23476static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23477 SDValue &Op1, bool &IsAlwaysSignaling) {
23478 unsigned SSECC;
23479 bool Swap = false;
23480
23481 // SSE Condition code mapping:
23482 // 0 - EQ
23483 // 1 - LT
23484 // 2 - LE
23485 // 3 - UNORD
23486 // 4 - NEQ
23487 // 5 - NLT
23488 // 6 - NLE
23489 // 7 - ORD
23490 switch (SetCCOpcode) {
23491 // clang-format off
23492 default: llvm_unreachable("Unexpected SETCC condition");
23493 case ISD::SETOEQ:
23494 case ISD::SETEQ: SSECC = 0; break;
23495 case ISD::SETOGT:
23496 case ISD::SETGT: Swap = true; [[fallthrough]];
23497 case ISD::SETLT:
23498 case ISD::SETOLT: SSECC = 1; break;
23499 case ISD::SETOGE:
23500 case ISD::SETGE: Swap = true; [[fallthrough]];
23501 case ISD::SETLE:
23502 case ISD::SETOLE: SSECC = 2; break;
23503 case ISD::SETUO: SSECC = 3; break;
23504 case ISD::SETUNE:
23505 case ISD::SETNE: SSECC = 4; break;
23506 case ISD::SETULE: Swap = true; [[fallthrough]];
23507 case ISD::SETUGE: SSECC = 5; break;
23508 case ISD::SETULT: Swap = true; [[fallthrough]];
23509 case ISD::SETUGT: SSECC = 6; break;
23510 case ISD::SETO: SSECC = 7; break;
23511 case ISD::SETUEQ: SSECC = 8; break;
23512 case ISD::SETONE: SSECC = 12; break;
23513 // clang-format on
23514 }
23515 if (Swap)
23516 std::swap(Op0, Op1);
23517
23518 switch (SetCCOpcode) {
23519 default:
23520 IsAlwaysSignaling = true;
23521 break;
23522 case ISD::SETEQ:
23523 case ISD::SETOEQ:
23524 case ISD::SETUEQ:
23525 case ISD::SETNE:
23526 case ISD::SETONE:
23527 case ISD::SETUNE:
23528 case ISD::SETO:
23529 case ISD::SETUO:
23530 IsAlwaysSignaling = false;
23531 break;
23532 }
23533
23534 return SSECC;
23535}
23536
23537/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23538/// concatenate the result back.
23540 SelectionDAG &DAG, const SDLoc &dl) {
23541 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23542 "Unsupported VTs!");
23543 SDValue CC = DAG.getCondCode(Cond);
23544
23545 // Extract the LHS Lo/Hi vectors
23546 SDValue LHS1, LHS2;
23547 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23548
23549 // Extract the RHS Lo/Hi vectors
23550 SDValue RHS1, RHS2;
23551 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23552
23553 // Issue the operation on the smaller types and concatenate the result back
23554 EVT LoVT, HiVT;
23555 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23556 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23557 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23558 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23559}
23560
23562 SelectionDAG &DAG) {
23563 SDValue Op0 = Op.getOperand(0);
23564 SDValue Op1 = Op.getOperand(1);
23565 SDValue CC = Op.getOperand(2);
23566 MVT VT = Op.getSimpleValueType();
23567 assert(VT.getVectorElementType() == MVT::i1 &&
23568 "Cannot set masked compare for this operation");
23569
23570 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23571
23572 // Prefer SETGT over SETLT.
23573 if (SetCCOpcode == ISD::SETLT) {
23574 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23575 std::swap(Op0, Op1);
23576 }
23577
23578 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23579}
23580
23581/// Given a buildvector constant, return a new vector constant with each element
23582/// incremented or decremented. If incrementing or decrementing would result in
23583/// unsigned overflow or underflow or this is not a simple vector constant,
23584/// return an empty value.
23586 bool NSW) {
23587 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23588 if (!BV || !V.getValueType().isSimple())
23589 return SDValue();
23590
23591 MVT VT = V.getSimpleValueType();
23592 MVT EltVT = VT.getVectorElementType();
23593 unsigned NumElts = VT.getVectorNumElements();
23595 SDLoc DL(V);
23596 for (unsigned i = 0; i < NumElts; ++i) {
23597 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23598 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23599 return SDValue();
23600
23601 // Avoid overflow/underflow.
23602 const APInt &EltC = Elt->getAPIntValue();
23603 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23604 return SDValue();
23605 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23606 (!IsInc && EltC.isMinSignedValue())))
23607 return SDValue();
23608
23609 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23610 }
23611
23612 return DAG.getBuildVector(VT, DL, NewVecC);
23613}
23614
23615/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23616/// Op0 u<= Op1:
23617/// t = psubus Op0, Op1
23618/// pcmpeq t, <0..0>
23620 ISD::CondCode Cond, const SDLoc &dl,
23621 const X86Subtarget &Subtarget,
23622 SelectionDAG &DAG) {
23623 if (!Subtarget.hasSSE2())
23624 return SDValue();
23625
23626 MVT VET = VT.getVectorElementType();
23627 if (VET != MVT::i8 && VET != MVT::i16)
23628 return SDValue();
23629
23630 switch (Cond) {
23631 default:
23632 return SDValue();
23633 case ISD::SETULT: {
23634 // If the comparison is against a constant we can turn this into a
23635 // setule. With psubus, setule does not require a swap. This is
23636 // beneficial because the constant in the register is no longer
23637 // destructed as the destination so it can be hoisted out of a loop.
23638 // Only do this pre-AVX since vpcmp* is no longer destructive.
23639 if (Subtarget.hasAVX())
23640 return SDValue();
23641 SDValue ULEOp1 =
23642 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23643 if (!ULEOp1)
23644 return SDValue();
23645 Op1 = ULEOp1;
23646 break;
23647 }
23648 case ISD::SETUGT: {
23649 // If the comparison is against a constant, we can turn this into a setuge.
23650 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23651 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23652 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23653 SDValue UGEOp1 =
23654 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23655 if (!UGEOp1)
23656 return SDValue();
23657 Op1 = Op0;
23658 Op0 = UGEOp1;
23659 break;
23660 }
23661 // Psubus is better than flip-sign because it requires no inversion.
23662 case ISD::SETUGE:
23663 std::swap(Op0, Op1);
23664 break;
23665 case ISD::SETULE:
23666 break;
23667 }
23668
23669 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23670 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23671 DAG.getConstant(0, dl, VT));
23672}
23673
23674static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23675 SelectionDAG &DAG) {
23676 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23677 Op.getOpcode() == ISD::STRICT_FSETCCS;
23678 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23679 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23680 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23681 MVT VT = Op->getSimpleValueType(0);
23682 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23683 MVT OpVT = Op0.getSimpleValueType();
23684 SDLoc dl(Op);
23685
23686 if (OpVT.isFloatingPoint()) {
23687 MVT EltVT = OpVT.getVectorElementType();
23688 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
23689 EltVT == MVT::f64);
23690
23691 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23692 if (isSoftF16(EltVT, Subtarget)) {
23693 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
23694 return SDValue();
23695
23696 // Break 256-bit FP vector compare into smaller ones.
23697 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
23698 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23699
23700 // Break 512-bit FP vector compare into smaller ones.
23701 if (OpVT.is512BitVector())
23702 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23703
23704 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
23705 if (IsStrict) {
23706 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
23707 {Chain, Op0});
23708 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
23709 {Chain, Op1});
23710 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
23711 {Chain, Op0, Op1, CC});
23712 }
23713 MVT DVT = VT.getVectorElementType() == MVT::i16
23714 ? VT.changeVectorElementType(MVT::i32)
23715 : VT;
23716 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
23717 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
23718 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
23719 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
23720 }
23721
23722 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23723
23724 // If we have a strict compare with a vXi1 result and the input is 128/256
23725 // bits we can't use a masked compare unless we have VLX. If we use a wider
23726 // compare like we do for non-strict, we might trigger spurious exceptions
23727 // from the upper elements. Instead emit a AVX compare and convert to mask.
23728 unsigned Opc;
23729 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23730 (!IsStrict || Subtarget.hasVLX() ||
23732#ifndef NDEBUG
23733 unsigned Num = VT.getVectorNumElements();
23734 assert(Num <= 16 ||
23735 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
23736#endif
23737 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23738 } else {
23739 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23740 // The SSE/AVX packed FP comparison nodes are defined with a
23741 // floating-point vector result that matches the operand type. This allows
23742 // them to work with an SSE1 target (integer vector types are not legal).
23743 VT = Op0.getSimpleValueType();
23744 }
23745
23746 SDValue Cmp;
23747 bool IsAlwaysSignaling;
23748 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23749 if (!Subtarget.hasAVX()) {
23750 // TODO: We could use following steps to handle a quiet compare with
23751 // signaling encodings.
23752 // 1. Get ordered masks from a quiet ISD::SETO
23753 // 2. Use the masks to mask potential unordered elements in operand A, B
23754 // 3. Get the compare results of masked A, B
23755 // 4. Calculating final result using the mask and result from 3
23756 // But currently, we just fall back to scalar operations.
23757 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23758 return SDValue();
23759
23760 // Insert an extra signaling instruction to raise exception.
23761 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23762 SDValue SignalCmp = DAG.getNode(
23763 Opc, dl, {VT, MVT::Other},
23764 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23765 // FIXME: It seems we need to update the flags of all new strict nodes.
23766 // Otherwise, mayRaiseFPException in MI will return false due to
23767 // NoFPExcept = false by default. However, I didn't find it in other
23768 // patches.
23769 SignalCmp->setFlags(Op->getFlags());
23770 Chain = SignalCmp.getValue(1);
23771 }
23772
23773 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23774 // emit two comparisons and a logic op to tie them together.
23775 if (!cheapX86FSETCC_SSE(Cond)) {
23776 // LLVM predicate is SETUEQ or SETONE.
23777 unsigned CC0, CC1;
23778 unsigned CombineOpc;
23779 if (Cond == ISD::SETUEQ) {
23780 CC0 = 3; // UNORD
23781 CC1 = 0; // EQ
23782 CombineOpc = X86ISD::FOR;
23783 } else {
23785 CC0 = 7; // ORD
23786 CC1 = 4; // NEQ
23787 CombineOpc = X86ISD::FAND;
23788 }
23789
23790 SDValue Cmp0, Cmp1;
23791 if (IsStrict) {
23792 Cmp0 = DAG.getNode(
23793 Opc, dl, {VT, MVT::Other},
23794 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23795 Cmp1 = DAG.getNode(
23796 Opc, dl, {VT, MVT::Other},
23797 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23798 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23799 Cmp1.getValue(1));
23800 } else {
23801 Cmp0 = DAG.getNode(
23802 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23803 Cmp1 = DAG.getNode(
23804 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23805 }
23806 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23807 } else {
23808 if (IsStrict) {
23809 Cmp = DAG.getNode(
23810 Opc, dl, {VT, MVT::Other},
23811 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23812 Chain = Cmp.getValue(1);
23813 } else
23814 Cmp = DAG.getNode(
23815 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23816 }
23817 } else {
23818 // Handle all other FP comparisons here.
23819 if (IsStrict) {
23820 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23821 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23822 Cmp = DAG.getNode(
23823 Opc, dl, {VT, MVT::Other},
23824 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23825 Chain = Cmp.getValue(1);
23826 } else
23827 Cmp = DAG.getNode(
23828 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23829 }
23830
23831 if (VT.getFixedSizeInBits() >
23832 Op.getSimpleValueType().getFixedSizeInBits()) {
23833 // We emitted a compare with an XMM/YMM result. Finish converting to a
23834 // mask register using a vptestm.
23836 Cmp = DAG.getBitcast(CastVT, Cmp);
23837 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23838 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23839 } else {
23840 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23841 // the result type of SETCC. The bitcast is expected to be optimized
23842 // away during combining/isel.
23843 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23844 }
23845
23846 if (IsStrict)
23847 return DAG.getMergeValues({Cmp, Chain}, dl);
23848
23849 return Cmp;
23850 }
23851
23852 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23853
23854 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
23855 assert(VTOp0 == Op1.getSimpleValueType() &&
23856 "Expected operands with same type!");
23858 "Invalid number of packed elements for source and destination!");
23859
23860 // The non-AVX512 code below works under the assumption that source and
23861 // destination types are the same.
23862 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23863 "Value types for source and destination must be the same!");
23864
23865 // The result is boolean, but operands are int/float
23866 if (VT.getVectorElementType() == MVT::i1) {
23867 // In AVX-512 architecture setcc returns mask with i1 elements,
23868 // But there is no compare instruction for i8 and i16 elements in KNL.
23869 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23870 "Unexpected operand type");
23871 return LowerIntVSETCC_AVX512(Op, dl, DAG);
23872 }
23873
23874 // Lower using XOP integer comparisons.
23875 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23876 // Translate compare code to XOP PCOM compare mode.
23877 unsigned CmpMode = 0;
23878 switch (Cond) {
23879 // clang-format off
23880 default: llvm_unreachable("Unexpected SETCC condition");
23881 case ISD::SETULT:
23882 case ISD::SETLT: CmpMode = 0x00; break;
23883 case ISD::SETULE:
23884 case ISD::SETLE: CmpMode = 0x01; break;
23885 case ISD::SETUGT:
23886 case ISD::SETGT: CmpMode = 0x02; break;
23887 case ISD::SETUGE:
23888 case ISD::SETGE: CmpMode = 0x03; break;
23889 case ISD::SETEQ: CmpMode = 0x04; break;
23890 case ISD::SETNE: CmpMode = 0x05; break;
23891 // clang-format on
23892 }
23893
23894 // Are we comparing unsigned or signed integers?
23895 unsigned Opc =
23897
23898 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23899 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23900 }
23901
23902 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23903 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23905 SDValue BC0 = peekThroughBitcasts(Op0);
23906 if (BC0.getOpcode() == ISD::AND &&
23908 /*AllowUndefs=*/false)) {
23909 Cond = ISD::SETEQ;
23910 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23911 }
23912 }
23913
23914 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23915 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23916 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23918 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23919 unsigned BitWidth = VT.getScalarSizeInBits();
23920 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23921
23922 SDValue Result = Op0.getOperand(0);
23923 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23924 DAG.getConstant(ShiftAmt, dl, VT));
23925 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23926 DAG.getConstant(BitWidth - 1, dl, VT));
23927 return Result;
23928 }
23929 }
23930
23931 // Break 256-bit integer vector compare into smaller ones.
23932 if (VT.is256BitVector() && !Subtarget.hasInt256())
23933 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23934
23935 // Break 512-bit integer vector compare into smaller ones.
23936 // TODO: Try harder to use VPCMPx + VPMOV2x?
23937 if (VT.is512BitVector())
23938 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23939
23940 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23941 // not-of-PCMPEQ:
23942 // X != INT_MIN --> X >s INT_MIN
23943 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23944 // +X != 0 --> +X >s 0
23945 APInt ConstValue;
23946 if (Cond == ISD::SETNE &&
23947 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23948 if (ConstValue.isMinSignedValue())
23949 Cond = ISD::SETGT;
23950 else if (ConstValue.isMaxSignedValue())
23951 Cond = ISD::SETLT;
23952 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23953 Cond = ISD::SETGT;
23954 }
23955
23956 // If both operands are known non-negative, then an unsigned compare is the
23957 // same as a signed compare and there's no need to flip signbits.
23958 // TODO: We could check for more general simplifications here since we're
23959 // computing known bits.
23960 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23961 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23962
23963 // Special case: Use min/max operations for unsigned compares.
23964 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23966 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23967 TLI.isOperationLegal(ISD::UMIN, VT)) {
23968 // If we have a constant operand, increment/decrement it and change the
23969 // condition to avoid an invert.
23970 if (Cond == ISD::SETUGT) {
23971 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23972 if (SDValue UGTOp1 =
23973 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23974 Op1 = UGTOp1;
23975 Cond = ISD::SETUGE;
23976 }
23977 }
23978 if (Cond == ISD::SETULT) {
23979 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23980 if (SDValue ULTOp1 =
23981 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23982 Op1 = ULTOp1;
23983 Cond = ISD::SETULE;
23984 }
23985 }
23986 bool Invert = false;
23987 unsigned Opc;
23988 switch (Cond) {
23989 // clang-format off
23990 default: llvm_unreachable("Unexpected condition code");
23991 case ISD::SETUGT: Invert = true; [[fallthrough]];
23992 case ISD::SETULE: Opc = ISD::UMIN; break;
23993 case ISD::SETULT: Invert = true; [[fallthrough]];
23994 case ISD::SETUGE: Opc = ISD::UMAX; break;
23995 // clang-format on
23996 }
23997
23998 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23999 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24000
24001 // If the logical-not of the result is required, perform that now.
24002 if (Invert)
24003 Result = DAG.getNOT(dl, Result, VT);
24004
24005 return Result;
24006 }
24007
24008 // Try to use SUBUS and PCMPEQ.
24009 if (FlipSigns)
24010 if (SDValue V =
24011 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24012 return V;
24013
24014 // We are handling one of the integer comparisons here. Since SSE only has
24015 // GT and EQ comparisons for integer, swapping operands and multiple
24016 // operations may be required for some comparisons.
24017 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24019 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24021 bool Invert = Cond == ISD::SETNE ||
24023
24024 if (Swap)
24025 std::swap(Op0, Op1);
24026
24027 // Check that the operation in question is available (most are plain SSE2,
24028 // but PCMPGTQ and PCMPEQQ have different requirements).
24029 if (VT == MVT::v2i64) {
24030 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24031 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24032
24033 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24034 // the odd elements over the even elements.
24035 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24036 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24037 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24038
24039 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24040 static const int MaskHi[] = { 1, 1, 3, 3 };
24041 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24042
24043 return DAG.getBitcast(VT, Result);
24044 }
24045
24046 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24047 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24048 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24049
24050 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24051 static const int MaskHi[] = { 1, 1, 3, 3 };
24052 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24053
24054 return DAG.getBitcast(VT, Result);
24055 }
24056
24057 // If the i64 elements are sign-extended enough to be representable as i32
24058 // then we can compare the lower i32 bits and splat.
24059 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24060 DAG.ComputeNumSignBits(Op1) > 32) {
24061 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24062 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24063
24064 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24065 static const int MaskLo[] = {0, 0, 2, 2};
24066 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24067
24068 return DAG.getBitcast(VT, Result);
24069 }
24070
24071 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24072 // bits of the inputs before performing those operations. The lower
24073 // compare is always unsigned.
24074 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24075 : 0x0000000080000000ULL,
24076 dl, MVT::v2i64);
24077
24078 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24079 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24080
24081 // Cast everything to the right type.
24082 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24083 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24084
24085 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24086 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24087 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24088
24089 // Create masks for only the low parts/high parts of the 64 bit integers.
24090 static const int MaskHi[] = { 1, 1, 3, 3 };
24091 static const int MaskLo[] = { 0, 0, 2, 2 };
24092 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24093 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24094 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24095
24096 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24097 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24098
24099 if (Invert)
24100 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24101
24102 return DAG.getBitcast(VT, Result);
24103 }
24104
24105 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24106 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24107 // pcmpeqd + pshufd + pand.
24108 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24109
24110 // First cast everything to the right type.
24111 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24112 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24113
24114 // Do the compare.
24115 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24116
24117 // Make sure the lower and upper halves are both all-ones.
24118 static const int Mask[] = { 1, 0, 3, 2 };
24119 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24120 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24121
24122 if (Invert)
24123 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24124
24125 return DAG.getBitcast(VT, Result);
24126 }
24127 }
24128
24129 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24130 // bits of the inputs before performing those operations.
24131 if (FlipSigns) {
24132 MVT EltVT = VT.getVectorElementType();
24134 VT);
24135 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24136 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24137 }
24138
24139 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24140
24141 // If the logical-not of the result is required, perform that now.
24142 if (Invert)
24143 Result = DAG.getNOT(dl, Result, VT);
24144
24145 return Result;
24146}
24147
24148// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24150 const SDLoc &dl, SelectionDAG &DAG,
24151 const X86Subtarget &Subtarget,
24152 SDValue &X86CC) {
24153 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24154
24155 // Must be a bitcast from vXi1.
24156 if (Op0.getOpcode() != ISD::BITCAST)
24157 return SDValue();
24158
24159 Op0 = Op0.getOperand(0);
24160 MVT VT = Op0.getSimpleValueType();
24161 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24162 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24163 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24164 return SDValue();
24165
24166 X86::CondCode X86Cond;
24167 if (isNullConstant(Op1)) {
24168 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24169 } else if (isAllOnesConstant(Op1)) {
24170 // C flag is set for all ones.
24171 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24172 } else
24173 return SDValue();
24174
24175 // If the input is an AND, we can combine it's operands into the KTEST.
24176 bool KTestable = false;
24177 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24178 KTestable = true;
24179 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24180 KTestable = true;
24181 if (!isNullConstant(Op1))
24182 KTestable = false;
24183 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24184 SDValue LHS = Op0.getOperand(0);
24185 SDValue RHS = Op0.getOperand(1);
24186 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24187 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24188 }
24189
24190 // If the input is an OR, we can combine it's operands into the KORTEST.
24191 SDValue LHS = Op0;
24192 SDValue RHS = Op0;
24193 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24194 LHS = Op0.getOperand(0);
24195 RHS = Op0.getOperand(1);
24196 }
24197
24198 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24199 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24200}
24201
24202/// Emit flags for the given setcc condition and operands. Also returns the
24203/// corresponding X86 condition code constant in X86CC.
///
/// On success the returned SDValue is a node that produces EFLAGS, and X86CC
/// is set to an i8 target constant holding the X86::CondCode to test against
/// those flags. For SETEQ/SETNE a series of cheaper flag-producing patterns
/// (BT, PTEST/PMOVMSKB, KORTEST/KTEST, SETCC reuse, NEG-overflow, reused
/// ADD carry) is tried before falling back to a plain CMP via EmitCmp.
24204SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24205 ISD::CondCode CC, const SDLoc &dl,
24206 SelectionDAG &DAG,
24207 SDValue &X86CC) const {
24208 // Equality Combines.
24209 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24210 X86::CondCode X86CondCode;
24211
24212 // Optimize to BT if possible.
24213 // Lower (X & (1 << N)) == 0 to BT(X, N).
24214 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24215 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24216 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24217 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24218 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24219 return BT;
24220 }
24221 }
24222
24223 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24224 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24225 X86CondCode)) {
24226 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24227 return CmpZ;
24228 }
24229
24230 // Try to lower using KORTEST or KTEST.
24231 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24232 return Test;
24233
24234 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24235 // of these.
24236 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24237 // If the input is a setcc, then reuse the input setcc or use a new one
24238 // with the inverted condition.
24239 if (Op0.getOpcode() == X86ISD::SETCC) {
       // Invert iff exactly one of (CC == SETNE) / (Op1 == 0) holds: e.g.
       // "setcc != 0" and "setcc == 1" both keep the original condition.
24240 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24241
24242 X86CC = Op0.getOperand(0);
24243 if (Invert) {
24244 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24245 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24246 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24247 }
24248
       // Reuse the flag-producing operand of the inner X86ISD::SETCC.
24249 return Op0.getOperand(1);
24250 }
24251 }
24252
24253 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24254 // overflow.
24255 if (isMinSignedConstant(Op1)) {
24256 EVT VT = Op0.getValueType();
24257 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24258 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
       // NOTE(review): extraction dropped source line 24259 here (likely the
       // "X86::CondCode CondCode = ..." initialization consumed on the next
       // line) — confirm against upstream X86ISelLowering.cpp.
24260 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
       // 0 - INT_MIN overflows, so the second (i32 flags) result of the SUB
       // carries the answer.
24261 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24262 DAG.getConstant(0, dl, VT), Op0);
24263 return SDValue(Neg.getNode(), 1);
24264 }
24265 }
24266
24267 // Try to use the carry flag from the add in place of an separate CMP for:
24268 // (seteq (add X, -1), -1). Similar for setne.
24269 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24270 Op0.getOperand(1) == Op1) {
24271 if (isProfitableToUseFlagOp(Op0)) {
24272 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24273
       // Re-emit the ADD as a flag-producing X86ISD::ADD and RAUW the
       // original so all users share the one instruction.
24274 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24275 Op0.getOperand(1));
24276 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24277 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24278 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24279 return SDValue(New.getNode(), 1);
24280 }
24281 }
24282 }
24283
  // Fallback: translate the generic condition code and emit a plain CMP.
  // NOTE(review): extraction dropped source line 24284 here (likely the
  // "X86::CondCode CondCode =" declaration that receives the call below) —
  // confirm against upstream.
24285 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24286 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24287
24288 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24289 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24290 return EFLAGS;
24291}
24292
/// Lower scalar ISD::SETCC / STRICT_FSETCC / STRICT_FSETCCS to an
/// X86ISD::SETCC reading EFLAGS. Vector compares are forwarded to
/// LowerVSETCC; f128 is softened to a library call first; integer compares
/// go through emitFlagsForSetcc, FP compares through (U)COMX/FCMP.
24293SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24294
24295 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24296 Op.getOpcode() == ISD::STRICT_FSETCCS;
24297 MVT VT = Op->getSimpleValueType(0);
24298
24299 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24300
24301 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
  // Strict nodes carry the chain in operand 0, shifting the others by one.
24302 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24303 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24304 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24305 SDLoc dl(Op);
  // NOTE(review): extraction dropped source line 24306 here (likely
  // "ISD::CondCode CC =" for the cast on the next line) — confirm against
  // upstream X86ISelLowering.cpp.
24307 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24308
24309 if (isSoftF16(Op0.getValueType(), Subtarget))
24310 return SDValue();
24311
24312 // Handle f128 first, since one possible outcome is a normal integer
24313 // comparison which gets handled by emitFlagsForSetcc.
24314 if (Op0.getValueType() == MVT::f128) {
24315 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24316 Op.getOpcode() == ISD::STRICT_FSETCCS);
24317
24318 // If softenSetCCOperands returned a scalar, use it.
24319 if (!Op1.getNode()) {
24320 assert(Op0.getValueType() == Op.getValueType() &&
24321 "Unexpected setcc expansion!");
24322 if (IsStrict)
24323 return DAG.getMergeValues({Op0, Chain}, dl);
24324 return Op0;
24325 }
24326 }
24327
24328 if (Op0.getSimpleValueType().isInteger()) {
24329 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24330 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24331 // this may translate to less uops depending on uarch implementation. The
24332 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24333 // canonicalize to that CondCode.
24334 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24335 // encoding size - so it must either already be a i8 or i32 immediate, or it
24336 // shrinks down to that. We don't do this for any i64's to avoid additional
24337 // constant materializations.
24338 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24339 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24340 const APInt &Op1Val = Op1C->getAPIntValue();
24341 if (!Op1Val.isZero()) {
24342 // Ensure the constant+1 doesn't overflow.
24343 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24344 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24345 APInt Op1ValPlusOne = Op1Val + 1;
24346 if (Op1ValPlusOne.isSignedIntN(32) &&
24347 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24348 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
       // NOTE(review): extraction dropped source lines 24349-24350 here
       // (likely the "CC = ... SETGE/SETUGE" update paired with the
       // constant bump above) — confirm against upstream.
24351 }
24352 }
24353 }
24354 }
24355
24356 SDValue X86CC;
24357 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24358 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24359 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24360 }
24361
24362 if (Subtarget.hasAVX10_2()) {
24363 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24364 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24365 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24366 if (Op0.getSimpleValueType() != MVT::f80)
24367 return getSETCC(
24368 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24369 }
24370 }
24371 // Handle floating point.
24372 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24373 if (CondCode == X86::COND_INVALID)
24374 return SDValue();
24375
24376 SDValue EFLAGS;
24377 if (IsStrict) {
24378 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24379 EFLAGS =
   // NOTE(review): extraction dropped source line 24380 here (likely
   // "DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,")
   // — confirm against upstream.
24381 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24382 Chain = EFLAGS.getValue(1);
24383 } else {
24384 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24385 }
24386
24387 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24388 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24389 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24390}
24391
/// Lower ISD::SETCCCARRY: compare LHS and RHS together with an incoming
/// carry bit by materializing the carry into EFLAGS.CF and emitting an SBB,
/// then converting the flags result via getSETCC.
24392SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24393 SDValue LHS = Op.getOperand(0);
24394 SDValue RHS = Op.getOperand(1);
24395 SDValue Carry = Op.getOperand(2);
24396 SDValue Cond = Op.getOperand(3);
24397 SDLoc DL(Op);
24398
24399 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24400 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24401
24402 // Recreate the carry if needed.
  // Adding all-ones (-1) to the carry value produces a hardware carry-out
  // exactly when Carry is non-zero, moving the boolean back into CF so the
  // SBB below can consume it.
24403 EVT CarryVT = Carry.getValueType();
24404 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24405 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24406
24407 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  // SBB computes LHS - RHS - CF; only its flags result (value 1) is used.
24408 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24409 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24410}
24411
24412// This function returns three things: the arithmetic computation itself
24413// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24414// flag and the condition code define the case in which the arithmetic
24415// computation overflows.
// Maps the ISD overflow opcodes (SADDO/UADDO/SSUBO/USUBO/SMULO/UMULO) onto
// their flag-producing X86ISD counterparts; Cond is an out-parameter.
24416static std::pair<SDValue, SDValue>
// NOTE(review): extraction dropped source line 24417 here (the parameter
// list, likely "getX86XALUOOp(X86::CondCode &Cond, SDValue Op,
// SelectionDAG &DAG) {") — confirm against upstream X86ISelLowering.cpp.
24418 assert(Op.getResNo() == 0 && "Unexpected result number!");
24419 SDValue Value, Overflow;
24420 SDValue LHS = Op.getOperand(0);
24421 SDValue RHS = Op.getOperand(1);
24422 unsigned BaseOp = 0;
24423 SDLoc DL(Op);
24424 switch (Op.getOpcode()) {
24425 default: llvm_unreachable("Unknown ovf instruction!");
24426 case ISD::SADDO:
24427 BaseOp = X86ISD::ADD;
24428 Cond = X86::COND_O;
24429 break;
24430 case ISD::UADDO:
24431 BaseOp = X86ISD::ADD;
   // NOTE(review): extraction dropped source line 24432 here (likely
   // "Cond = X86::COND_B;" — unsigned add overflow is the carry flag) —
   // confirm against upstream.
24433 break;
24434 case ISD::SSUBO:
24435 BaseOp = X86ISD::SUB;
24436 Cond = X86::COND_O;
24437 break;
24438 case ISD::USUBO:
24439 BaseOp = X86ISD::SUB;
24440 Cond = X86::COND_B;
24441 break;
24442 case ISD::SMULO:
24443 BaseOp = X86ISD::SMUL;
24444 Cond = X86::COND_O;
24445 break;
24446 case ISD::UMULO:
24447 BaseOp = X86ISD::UMUL;
24448 Cond = X86::COND_O;
24449 break;
24450 }
24451
24452 if (BaseOp) {
24453 // Also sets EFLAGS.
24454 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24455 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24456 Overflow = Value.getValue(1);
24457 }
24458
24459 return std::make_pair(Value, Overflow);
24460}
24461
// NOTE(review): extraction dropped source line 24462 here (the function
// signature, likely "static SDValue LowerXALUO(SDValue Op, SelectionDAG
// &DAG) {") — confirm against upstream X86ISelLowering.cpp.
24463 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24464 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24465 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24466 // has only one use.
24467 SDLoc DL(Op);
  // NOTE(review): extraction also dropped source line 24468 (likely the
  // "X86::CondCode Cond;" declaration filled in by getX86XALUOOp below).
24469 SDValue Value, Overflow;
24470 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24471
24472 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
  // Result 1 of the XALUO node is the i8 overflow boolean.
24473 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24474 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24475}
24476
24477/// Return true if opcode is a X86 logical comparison.
/// True either for the dedicated compare nodes (CMP/COMI/UCOMI/FCMP), or
/// when Op refers to the flags result (result #1) of an arithmetic/logic
/// X86ISD node that also defines EFLAGS.
// NOTE(review): extraction dropped source line 24478 here (the signature,
// likely "static bool isX86LogicalCmp(SDValue Op) {") — confirm against
// upstream X86ISelLowering.cpp.
24479 unsigned Opc = Op.getOpcode();
24480 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24481 Opc == X86ISD::FCMP)
24482 return true;
24483 if (Op.getResNo() == 1 &&
24484 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24485 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24486 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24487 return true;
24488
24489 return false;
24490}
24491
// Returns true when V is a TRUNCATE whose source provably has all-zero bits
// above the truncated width, i.e. the truncate is a no-op value-wise.
// NOTE(review): extraction dropped source line 24492 here (the signature,
// likely "static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG
// &DAG) {") — confirm against upstream X86ISelLowering.cpp.
24493 if (V.getOpcode() != ISD::TRUNCATE)
24494 return false;
24495
24496 SDValue VOp0 = V.getOperand(0);
24497 unsigned InBits = VOp0.getValueSizeInBits();
24498 unsigned Bits = V.getValueSizeInBits();
  // Ask known-bits analysis whether the discarded high (InBits-Bits) bits
  // are zero.
24499 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24500}
24501
24502// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
// Handles scalar-integer selects on a compare-with-zero: LSB-splat masks for
// (X & 1) tests (mostly for targets without CMOV), and SBB-based -1/Y
// selects. Returns SDValue() when no pattern matches.
// NOTE(review): extraction dropped source line 24503 here (the signature,
// likely "static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS,
// SDValue RHS,") — confirm against upstream X86ISelLowering.cpp.
24504 unsigned X86CC, const SDLoc &DL,
24505 SelectionDAG &DAG,
24506 const X86Subtarget &Subtarget) {
24507 EVT CmpVT = CmpVal.getValueType();
24508 EVT VT = LHS.getValueType();
24509 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24510 return SDValue();
24511
24512 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24513 isOneConstant(CmpVal.getOperand(1))) {
   // SplatLSB(SplatVT) materializes 0 or all-ones in SplatVT depending on
   // the low bit of CmpVal, by negating (x & 1) at the requested width.
24514 auto SplatLSB = [&](EVT SplatVT) {
24515 // we need mask of all zeros or ones with same size of the other
24516 // operands.
24517 SDValue Neg = CmpVal;
24518 if (CmpVT.bitsGT(SplatVT))
24519 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24520 else if (CmpVT.bitsLT(SplatVT))
24521 Neg = DAG.getNode(
24522 ISD::AND, DL, SplatVT,
24523 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24524 DAG.getConstant(1, DL, SplatVT));
24525 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24526 };
24527
24528 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
   // NOTE(review): extraction dropped source line 24529 here (the guard,
   // likely "if (isNullConstant(LHS) && isAllOnesConstant(RHS))") —
   // confirm against upstream.
24530 return SplatLSB(VT);
24531
24532 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24533 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24534 isa<ConstantSDNode>(RHS)) {
24535 SDValue Mask = SplatLSB(VT);
24536 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24537 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24538 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24539 }
24540
   // Src1/Src2 are filled in by the identity-pattern matchers below.
24541 SDValue Src1, Src2;
   // Matches RHS = (op LHS, Z) where "op LHS, 0" is the identity, so the
   // select collapses to applying op with a masked Z.
24542 auto isIdentityPatternZero = [&]() {
24543 switch (RHS.getOpcode()) {
24544 default:
24545 break;
24546 case ISD::OR:
24547 case ISD::XOR:
24548 case ISD::ADD:
       // Commutative: LHS may be either operand.
24549 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24550 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24551 Src2 = LHS;
24552 return true;
24553 }
24554 break;
24555 case ISD::SHL:
24556 case ISD::SRA:
24557 case ISD::SRL:
24558 case ISD::SUB:
       // Non-commutative: LHS must be the first operand.
24559 if (RHS.getOperand(0) == LHS) {
24560 Src1 = RHS.getOperand(1);
24561 Src2 = LHS;
24562 return true;
24563 }
24564 break;
24565 }
24566 return false;
24567 };
24568
   // Matches LHS = (and RHS, Z): "and RHS, -1" is the identity.
24569 auto isIdentityPatternOnes = [&]() {
24570 switch (LHS.getOpcode()) {
24571 default:
24572 break;
24573 case ISD::AND:
24574 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24575 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24576 Src2 = RHS;
24577 return true;
24578 }
24579 break;
24580 }
24581 return false;
24582 };
24583
24584 // Convert 'identity' patterns (iff X is 0 or 1):
24585 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24586 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24587 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24588 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24589 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24590 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24591 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24592 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24593 SDValue Mask = SplatLSB(Src1.getValueType());
24594 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24595 Src1); // Mask & z
24596 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24597 }
24598 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24599 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24600 SDValue Mask = SplatLSB(VT);
24601 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24602 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24603 }
24604 }
24605
24606 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
  // NOTE(review): extraction dropped source lines 24607-24608 here (likely
  // the rest of the guard requiring exactly one of LHS/RHS to be all-ones,
  // plus "SDValue Y = ..." selecting the non-constant arm used below) —
  // confirm against upstream.
24609 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24610
24611 // 'X - 1' sets the carry flag if X == 0.
24612 // '0 - X' sets the carry flag if X != 0.
24613 // Convert the carry flag to a -1/0 mask with sbb:
24614 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24615 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24616 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24617 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24618 SDValue Sub;
24619 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24620 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24621 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24622 } else {
24623 SDValue One = DAG.getConstant(1, DL, CmpVT);
24624 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24625 }
  // SETCC_CARRY broadcasts CF across VT, yielding the -1/0 mask.
24626 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24627 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24628 Sub.getValue(1));
24629 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24630 }
24631
24632 return SDValue();
24633}
24634
/// Lower ISD::SELECT. Tries, in order: soft-f16 bitcast round-trip; scalar
/// FP selects via FSETCC(M)+SELECTS/VBLENDV/logic ops; compare-with-zero
/// select patterns (LowerSELECTWithCmpZero and friends); reusing flags from
/// SETCC/overflow ops; and finally an X86ISD::CMOV, widening i8/i16 where
/// profitable.
24635SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  // AddTest tracks whether we still need to synthesize EFLAGS with an
  // explicit TEST before the final CMOV.
24636 bool AddTest = true;
24637 SDValue Cond = Op.getOperand(0);
24638 SDValue Op1 = Op.getOperand(1);
24639 SDValue Op2 = Op.getOperand(2);
24640 SDLoc DL(Op);
24641 MVT VT = Op1.getSimpleValueType();
24642 SDValue CC;
24643
24644 if (isSoftF16(VT, Subtarget)) {
   // Perform the select in the equivalent integer type and bitcast back.
24645 MVT NVT = VT.changeTypeToInteger();
24646 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24647 DAG.getBitcast(NVT, Op1),
24648 DAG.getBitcast(NVT, Op2)));
24649 }
24650
24651 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24652 // are available or VBLENDV if AVX is available.
24653 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24654 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24655 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24656 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24657 bool IsAlwaysSignaling;
24658 unsigned SSECC =
24659 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24660 CondOp0, CondOp1, IsAlwaysSignaling);
24661
24662 if (Subtarget.hasAVX512()) {
24663 SDValue Cmp =
24664 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24665 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24666 assert(!VT.isVector() && "Not a scalar type?");
24667 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24668 }
24669
   // SSECC >= 8 encodes AVX-only compare predicates.
24670 if (SSECC < 8 || Subtarget.hasAVX()) {
24671 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24672 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24673
24674 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
24675 // instead of 3 logic instructions for size savings and potentially speed.
24676 // Unfortunately, there is no scalar form of VBLENDV.
24677 //
24678 // If either operand is a +0.0 constant, don't try this. We can expect to
24679 // optimize away at least one of the logic instructions later in that
24680 // case, so that sequence would be faster than a variable blend.
24681 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
24682 !isNullFPConstant(Op2)) {
24683 // Convert to vectors, do a VSELECT, and convert back to scalar.
24684 // All of the conversions should be optimized away.
24685 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24686 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24687 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24688 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24689
24690 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24691 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24692
24693 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24694
24695 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
24696 DAG.getVectorIdxConstant(0, DL));
24697 }
    // Pre-SSE41 fallback: (Cmp & Op1) | (~Cmp & Op2).
24698 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24699 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24700 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24701 }
24702 }
24703
24704 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24705 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24706 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24707 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24708 }
24709
24710 if (Cond.getOpcode() == ISD::SETCC &&
24711 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
24712 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24713 Cond = NewCond;
24714 // If the condition was updated, it's possible that the operands of the
24715 // select were also updated (for example, EmitTest has a RAUW). Refresh
24716 // the local references to the select operands in case they got stale.
24717 Op1 = Op.getOperand(1);
24718 Op2 = Op.getOperand(2);
24719 }
24720 }
24721
24722 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24723 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24724 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24725 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24726 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24727 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24728 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24729 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24730 if (Cond.getOpcode() == X86ISD::SETCC &&
24731 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24732 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24733 SDValue Cmp = Cond.getOperand(1);
24734 SDValue CmpOp0 = Cmp.getOperand(0);
24735 unsigned CondCode = Cond.getConstantOperandVal(0);
24736
24737 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24738 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24739 // handle to keep the CMP with 0. This should be removed by
24740 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24741 // cttz_zero_undef.
24742 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24743 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24744 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24745 };
24746 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24747 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24748 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24749 // Keep Cmp.
24750 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
24751 DL, DAG, Subtarget)) {
24752 return R;
24753 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24754 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24755 ((CondCode == X86::COND_S) || // smin(x, 0)
24756 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24757 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24758 //
24759 // If the comparison is testing for a positive value, we have to invert
24760 // the sign bit mask, so only do that transform if the target has a
24761 // bitwise 'and not' instruction (the invert is free).
24762 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24763 unsigned ShCt = VT.getSizeInBits() - 1;
24764 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24765 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24766 if (CondCode == X86::COND_G)
24767 Shift = DAG.getNOT(DL, Shift, VT);
24768 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24769 }
24770 }
24771
24772 // Look past (and (setcc_carry (cmp ...)), 1).
24773 if (Cond.getOpcode() == ISD::AND &&
24774 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24775 isOneConstant(Cond.getOperand(1)))
24776 Cond = Cond.getOperand(0);
24777
24778 // Attempt to fold "raw cond" cases by treating them as:
24779 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
24780 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
24781 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
24782 Subtarget))
24783 return R;
24784
24785 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24786 // setting operand in place of the X86ISD::SETCC.
24787 unsigned CondOpcode = Cond.getOpcode();
24788 if (CondOpcode == X86ISD::SETCC ||
24789 CondOpcode == X86ISD::SETCC_CARRY) {
24790 CC = Cond.getOperand(0);
24791
24792 SDValue Cmp = Cond.getOperand(1);
24793 bool IllegalFPCMov = false;
24794 if (VT.isFloatingPoint() && !VT.isVector() &&
24795 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24796 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24797
24798 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24799 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24800 Cond = Cmp;
24801 AddTest = false;
24802 }
24803 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24804 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24805 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
   // Overflow ops already define EFLAGS; reuse them directly.
24806 SDValue Value;
24807 X86::CondCode X86Cond;
24808 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24809
24810 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24811 AddTest = false;
24812 }
24813
24814 if (AddTest) {
24815 // Look past the truncate if the high bits are known zero.
   // NOTE(review): extraction dropped source line 24816 here (likely
   // "if (isTruncWithZeroHighBitsInput(Cond, DAG))" guarding the
   // statement below) — confirm against upstream.
24817 Cond = Cond.getOperand(0);
24818
24819 // We know the result of AND is compared against zero. Try to match
24820 // it to BT.
24821 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24822 X86::CondCode X86CondCode;
24823 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24824 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24825 Cond = BT;
24826 AddTest = false;
24827 }
24828 }
24829 }
24830
24831 if (AddTest) {
24832 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24833 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24834 }
24835
24836 // a < b ? -1 : 0 -> RES = ~setcc_carry
24837 // a < b ? 0 : -1 -> RES = setcc_carry
24838 // a >= b ? -1 : 0 -> RES = setcc_carry
24839 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24840 if (Cond.getOpcode() == X86ISD::SUB) {
24841 unsigned CondCode = CC->getAsZExtVal();
24842
24843 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24844 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24845 (isNullConstant(Op1) || isNullConstant(Op2))) {
24846 SDValue Res =
24847 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24848 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24849 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24850 return DAG.getNOT(DL, Res, Res.getValueType());
24851 return Res;
24852 }
24853 }
24854
24855 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24856 // widen the cmov and push the truncate through. This avoids introducing a new
24857 // branch during isel and doesn't add any extensions.
24858 if (Op.getValueType() == MVT::i8 &&
24859 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24860 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24861 if (T1.getValueType() == T2.getValueType() &&
24862 // Exclude CopyFromReg to avoid partial register stalls.
24863 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24864 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24865 CC, Cond);
24866 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24867 }
24868 }
24869
24870 // Or finally, promote i8 cmovs if we have CMOV,
24871 // or i16 cmovs if it won't prevent folding a load.
24872 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24873 // legal, but EmitLoweredSelect() can not deal with these extensions
24874 // being inserted between two CMOV's. (in i16 case too TBN)
24875 // https://bugs.llvm.org/show_bug.cgi?id=40974
24876 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24877 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24878 !X86::mayFoldLoad(Op2, Subtarget))) {
24879 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24880 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24881 SDValue Ops[] = { Op2, Op1, CC, Cond };
24882 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24883 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24884 }
24885
24886 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24887 // condition is true.
24888 SDValue Ops[] = { Op2, Op1, CC, Cond };
24889 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24890}
24891
// Lower a sign/any-extend whose source is a vXi1 mask: widen/extend the mask
// to a legal AVX-512 vector (splitting or selecting -1/0 when the direct
// extend node isn't available), then truncate/extract back to the result VT.
// NOTE(review): extraction dropped source line 24892 here (the signature,
// likely "static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,")
// — confirm against upstream X86ISelLowering.cpp.
24893 const X86Subtarget &Subtarget,
24894 SelectionDAG &DAG) {
24895 MVT VT = Op->getSimpleValueType(0);
24896 SDValue In = Op->getOperand(0);
24897 MVT InVT = In.getSimpleValueType();
24898 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24899 MVT VTElt = VT.getVectorElementType();
24900 unsigned NumElts = VT.getVectorNumElements();
24901
24902 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24903 MVT ExtVT = VT;
24904 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24905 // If v16i32 is to be avoided, we'll need to split and concatenate.
24906 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24907 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24908
24909 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24910 }
24911
24912 // Widen to 512-bits if VLX is not supported.
24913 MVT WideVT = ExtVT;
24914 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24915 NumElts *= 512 / ExtVT.getSizeInBits();
24916 InVT = MVT::getVectorVT(MVT::i1, NumElts);
   // Pad the i1 mask into the low elements of a wider undef mask vector.
24917 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
24918 DAG.getVectorIdxConstant(0, dl));
24919 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24920 }
24921
24922 SDValue V;
24923 MVT WideEltVT = WideVT.getVectorElementType();
24924 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24925 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24926 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24927 } else {
   // No direct mask-extend instruction: select -1/0 per lane instead.
24928 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
24929 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24930 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24931 }
24932
24933 // Truncate if we had to extend i16/i8 above.
24934 if (VT != ExtVT) {
24935 WideVT = MVT::getVectorVT(VTElt, NumElts);
24936 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24937 }
24938
24939 // Extract back to 128/256-bit if we widened.
24940 if (WideVT != VT)
24941 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24942 DAG.getVectorIdxConstant(0, dl));
24943
24944 return V;
24945}
24946
// Dispatch a vector extend: i1-mask sources go through
// LowerSIGN_EXTEND_Mask, everything else through LowerAVXExtend.
// NOTE(review): extraction dropped source line 24947 here (the signature;
// presumably "static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget
// &Subtarget," given the body below — confirm against upstream).
24948 SelectionDAG &DAG) {
24949 SDValue In = Op->getOperand(0);
24950 MVT InVT = In.getSimpleValueType();
24951 SDLoc DL(Op);
24952
24953 if (InVT.getVectorElementType() == MVT::i1)
24954 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
24955
24956 assert(Subtarget.hasAVX() && "Expected AVX support");
24957 return LowerAVXExtend(Op, DL, DAG, Subtarget);
24958}
24959
24960// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24961// For sign extend this needs to handle all vector sizes and SSE4.1 and
24962// non-SSE4.1 targets. For zero extend this should only handle inputs of
24963// MVT::v64i8 when BWI is not supported, but AVX512 is.
24965 const X86Subtarget &Subtarget,
24966 SelectionDAG &DAG) {
24967 SDValue In = Op->getOperand(0);
24968 MVT VT = Op->getSimpleValueType(0);
24969 MVT InVT = In.getSimpleValueType();
24970
24971 MVT SVT = VT.getVectorElementType();
24972 MVT InSVT = InVT.getVectorElementType();
24974
24975 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24976 return SDValue();
24977 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24978 return SDValue();
24979 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24980 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24981 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24982 return SDValue();
24983
24984 SDLoc dl(Op);
24985 unsigned Opc = Op.getOpcode();
24986 unsigned NumElts = VT.getVectorNumElements();
24987
24988 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24989 // For 512-bit vectors, we need 128-bits or 256-bits.
24990 if (InVT.getSizeInBits() > 128) {
24991 // Input needs to be at least the same number of elements as output, and
24992 // at least 128-bits.
24993 int InSize = InSVT.getSizeInBits() * NumElts;
24994 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24995 InVT = In.getSimpleValueType();
24996 }
24997
24998 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24999 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25000 // need to be handled here for 256/512-bit results.
25001 if (Subtarget.hasInt256()) {
25002 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25003
25004 if (InVT.getVectorNumElements() != NumElts)
25005 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25006
25007 // FIXME: Apparently we create inreg operations that could be regular
25008 // extends.
25009 unsigned ExtOpc =
25012 return DAG.getNode(ExtOpc, dl, VT, In);
25013 }
25014
25015 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25016 if (Subtarget.hasAVX()) {
25017 assert(VT.is256BitVector() && "256-bit vector expected");
25018 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25019 int HalfNumElts = HalfVT.getVectorNumElements();
25020
25021 unsigned NumSrcElts = InVT.getVectorNumElements();
25022 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25023 for (int i = 0; i != HalfNumElts; ++i)
25024 HiMask[i] = HalfNumElts + i;
25025
25026 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25027 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25028 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25029 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25030 }
25031
25032 // We should only get here for sign extend.
25033 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25034 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25035 unsigned InNumElts = InVT.getVectorNumElements();
25036
25037 // If the source elements are already all-signbits, we don't need to extend,
25038 // just splat the elements.
25039 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25040 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25041 unsigned Scale = InNumElts / NumElts;
25042 SmallVector<int, 16> ShuffleMask;
25043 for (unsigned I = 0; I != NumElts; ++I)
25044 ShuffleMask.append(Scale, I);
25045 return DAG.getBitcast(VT,
25046 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25047 }
25048
25049 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25050 SDValue Curr = In;
25051 SDValue SignExt = Curr;
25052
25053 // As SRAI is only available on i16/i32 types, we expand only up to i32
25054 // and handle i64 separately.
25055 if (InVT != MVT::v4i32) {
25056 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25057
25058 unsigned DestWidth = DestVT.getScalarSizeInBits();
25059 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25060 unsigned DestElts = DestVT.getVectorNumElements();
25061
25062 // Build a shuffle mask that takes each input element and places it in the
25063 // MSBs of the new element size.
25064 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25065 for (unsigned i = 0; i != DestElts; ++i)
25066 Mask[i * Scale + (Scale - 1)] = i;
25067
25068 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25069 Curr = DAG.getBitcast(DestVT, Curr);
25070
25071 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25072 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25073 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25074 }
25075
25076 if (VT == MVT::v2i64) {
25077 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25078 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25079 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25080 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25081 SignExt = DAG.getBitcast(VT, SignExt);
25082 }
25083
25084 return SignExt;
25085}
25086
25088 SelectionDAG &DAG) {
25089 MVT VT = Op->getSimpleValueType(0);
25090 SDValue In = Op->getOperand(0);
25091 MVT InVT = In.getSimpleValueType();
25092 SDLoc dl(Op);
25093
25094 if (InVT.getVectorElementType() == MVT::i1)
25095 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25096
25097 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25099 "Expected same number of elements");
25100 assert((VT.getVectorElementType() == MVT::i16 ||
25101 VT.getVectorElementType() == MVT::i32 ||
25102 VT.getVectorElementType() == MVT::i64) &&
25103 "Unexpected element type");
25104 assert((InVT.getVectorElementType() == MVT::i8 ||
25105 InVT.getVectorElementType() == MVT::i16 ||
25106 InVT.getVectorElementType() == MVT::i32) &&
25107 "Unexpected element type");
25108
25109 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25110 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25111 return splitVectorIntUnary(Op, DAG, dl);
25112 }
25113
25114 if (Subtarget.hasInt256())
25115 return Op;
25116
25117 // Optimize vectors in AVX mode
25118 // Sign extend v8i16 to v8i32 and
25119 // v4i32 to v4i64
25120 //
25121 // Divide input vector into two parts
25122 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25123 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25124 // concat the vectors to original VT
25125 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25126 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25127
25128 unsigned NumElems = InVT.getVectorNumElements();
25129 SmallVector<int,8> ShufMask(NumElems, -1);
25130 for (unsigned i = 0; i != NumElems/2; ++i)
25131 ShufMask[i] = i + NumElems/2;
25132
25133 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25134 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25135
25136 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25137}
25138
25139/// Change a vector store into a pair of half-size vector stores.
25141 SDValue StoredVal = Store->getValue();
25142 assert((StoredVal.getValueType().is256BitVector() ||
25143 StoredVal.getValueType().is512BitVector()) &&
25144 "Expecting 256/512-bit op");
25145
25146 // Splitting volatile memory ops is not allowed unless the operation was not
25147 // legal to begin with. Assume the input store is legal (this transform is
25148 // only used for targets with AVX). Note: It is possible that we have an
25149 // illegal type like v2i128, and so we could allow splitting a volatile store
25150 // in that case if that is important.
25151 if (!Store->isSimple())
25152 return SDValue();
25153
25154 SDLoc DL(Store);
25155 SDValue Value0, Value1;
25156 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25157 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25158 SDValue Ptr0 = Store->getBasePtr();
25159 SDValue Ptr1 =
25160 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25161 SDValue Ch0 =
25162 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25163 Store->getOriginalAlign(),
25164 Store->getMemOperand()->getFlags());
25165 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25166 Store->getPointerInfo().getWithOffset(HalfOffset),
25167 Store->getOriginalAlign(),
25168 Store->getMemOperand()->getFlags());
25169 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25170}
25171
25172/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25173/// type.
25175 SelectionDAG &DAG) {
25176 SDValue StoredVal = Store->getValue();
25177 assert(StoreVT.is128BitVector() &&
25178 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25179 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25180
25181 // Splitting volatile memory ops is not allowed unless the operation was not
25182 // legal to begin with. We are assuming the input op is legal (this transform
25183 // is only used for targets with AVX).
25184 if (!Store->isSimple())
25185 return SDValue();
25186
25187 MVT StoreSVT = StoreVT.getScalarType();
25188 unsigned NumElems = StoreVT.getVectorNumElements();
25189 unsigned ScalarSize = StoreSVT.getStoreSize();
25190
25191 SDLoc DL(Store);
25193 for (unsigned i = 0; i != NumElems; ++i) {
25194 unsigned Offset = i * ScalarSize;
25195 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25197 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25198 DAG.getVectorIdxConstant(i, DL));
25199 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25200 Store->getPointerInfo().getWithOffset(Offset),
25201 Store->getOriginalAlign(),
25202 Store->getMemOperand()->getFlags());
25203 Stores.push_back(Ch);
25204 }
25205 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25206}
25207
25208static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25209 SelectionDAG &DAG) {
25210 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25211 SDLoc dl(St);
25212 SDValue StoredVal = St->getValue();
25213
25214 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25215 if (StoredVal.getValueType().isVector() &&
25216 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25217 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25218 assert(NumElts <= 8 && "Unexpected VT");
25219 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25220 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25221 "Expected AVX512F without AVX512DQI");
25222
25223 // We must pad with zeros to ensure we store zeroes to any unused bits.
25224 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25225 DAG.getUNDEF(MVT::v16i1), StoredVal,
25226 DAG.getVectorIdxConstant(0, dl));
25227 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25228 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25229 // Make sure we store zeros in the extra bits.
25230 if (NumElts < 8)
25231 StoredVal = DAG.getZeroExtendInReg(
25232 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25233
25234 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25235 St->getPointerInfo(), St->getOriginalAlign(),
25236 St->getMemOperand()->getFlags());
25237 }
25238
25239 if (St->isTruncatingStore())
25240 return SDValue();
25241
25242 // If this is a 256-bit store of concatenated ops, we are better off splitting
25243 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
25244 // and each half can execute independently. Some cores would split the op into
25245 // halves anyway, so the concat (vinsertf128) is purely an extra op.
25246 MVT StoreVT = StoredVal.getSimpleValueType();
25247 if (StoreVT.is256BitVector() ||
25248 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
25249 !Subtarget.hasBWI())) {
25250 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
25251 return splitVectorStore(St, DAG);
25252 return SDValue();
25253 }
25254
25255 if (StoreVT.is32BitVector())
25256 return SDValue();
25257
25258 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25259 assert(StoreVT.is64BitVector() && "Unexpected VT");
25260 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25262 "Unexpected type action!");
25263
25264 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25265 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25266 DAG.getUNDEF(StoreVT));
25267
25268 if (Subtarget.hasSSE2()) {
25269 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25270 // and store it.
25271 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25272 MVT CastVT = MVT::getVectorVT(StVT, 2);
25273 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25274 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25275 DAG.getVectorIdxConstant(0, dl));
25276
25277 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25278 St->getPointerInfo(), St->getOriginalAlign(),
25279 St->getMemOperand()->getFlags());
25280 }
25281 assert(Subtarget.hasSSE1() && "Expected SSE");
25282 SDVTList Tys = DAG.getVTList(MVT::Other);
25283 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25284 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25285 St->getMemOperand());
25286}
25287
25288// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25289// may emit an illegal shuffle but the expansion is still better than scalar
25290// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25291// we'll emit a shuffle and a arithmetic shift.
25292// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25293// TODO: It is possible to support ZExt by zeroing the undef values during
25294// the shuffle phase or after the shuffle.
25295static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25296 SelectionDAG &DAG) {
25297 MVT RegVT = Op.getSimpleValueType();
25298 assert(RegVT.isVector() && "We only custom lower vector loads.");
25299 assert(RegVT.isInteger() &&
25300 "We only custom lower integer vector loads.");
25301
25302 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25303 SDLoc dl(Ld);
25304
25305 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25306 if (RegVT.getVectorElementType() == MVT::i1) {
25307 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25308 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25309 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25310 "Expected AVX512F without AVX512DQI");
25311
25312 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25313 Ld->getPointerInfo(), Ld->getOriginalAlign(),
25314 Ld->getMemOperand()->getFlags());
25315
25316 // Replace chain users with the new chain.
25317 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25318
25319 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25320 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25321 DAG.getBitcast(MVT::v16i1, Val),
25322 DAG.getVectorIdxConstant(0, dl));
25323 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25324 }
25325
25326 return SDValue();
25327}
25328
25329/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25330/// each of which has no other use apart from the AND / OR.
25331static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25332 Opc = Op.getOpcode();
25333 if (Opc != ISD::OR && Opc != ISD::AND)
25334 return false;
25335 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25336 Op.getOperand(0).hasOneUse() &&
25337 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25338 Op.getOperand(1).hasOneUse());
25339}
25340
25341SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25342 SDValue Chain = Op.getOperand(0);
25343 SDValue Cond = Op.getOperand(1);
25344 SDValue Dest = Op.getOperand(2);
25345 SDLoc dl(Op);
25346
25347 // Bail out when we don't have native compare instructions.
25348 if (Cond.getOpcode() == ISD::SETCC &&
25349 Cond.getOperand(0).getValueType() != MVT::f128 &&
25350 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25351 SDValue LHS = Cond.getOperand(0);
25352 SDValue RHS = Cond.getOperand(1);
25353 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25354
25355 // Special case for
25356 // setcc([su]{add,sub,mul}o == 0)
25357 // setcc([su]{add,sub,mul}o != 1)
25358 if (ISD::isOverflowIntrOpRes(LHS) &&
25359 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25360 (isNullConstant(RHS) || isOneConstant(RHS))) {
25361 SDValue Value, Overflow;
25362 X86::CondCode X86Cond;
25363 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25364
25365 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25366 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25367
25368 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25369 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25370 Overflow, Op->getFlags());
25371 }
25372
25373 if (LHS.getSimpleValueType().isInteger()) {
25374 SDValue CCVal;
25375 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25376 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25377 EFLAGS, Op->getFlags());
25378 }
25379
25380 if (CC == ISD::SETOEQ) {
25381 // For FCMP_OEQ, we can emit
25382 // two branches instead of an explicit AND instruction with a
25383 // separate test. However, we only do this if this block doesn't
25384 // have a fall-through edge, because this requires an explicit
25385 // jmp when the condition is false.
25386 if (Op.getNode()->hasOneUse()) {
25387 SDNode *User = *Op.getNode()->user_begin();
25388 // Look for an unconditional branch following this conditional branch.
25389 // We need this because we need to reverse the successors in order
25390 // to implement FCMP_OEQ.
25391 if (User->getOpcode() == ISD::BR) {
25392 SDValue FalseBB = User->getOperand(1);
25393 SDNode *NewBR =
25394 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25395 assert(NewBR == User);
25396 (void)NewBR;
25397 Dest = FalseBB;
25398
25399 SDValue Cmp =
25400 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25401 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25402 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25403 CCVal, Cmp, Op->getFlags());
25404 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25405 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25406 Cmp, Op->getFlags());
25407 }
25408 }
25409 } else if (CC == ISD::SETUNE) {
25410 // For FCMP_UNE, we can emit
25411 // two branches instead of an explicit OR instruction with a
25412 // separate test.
25413 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25414 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25415 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25416 Cmp, Op->getFlags());
25417 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25418 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25419 Cmp, Op->getFlags());
25420 } else {
25421 X86::CondCode X86Cond =
25422 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25423 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25424 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25425 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25426 Cmp, Op->getFlags());
25427 }
25428 }
25429
25431 SDValue Value, Overflow;
25432 X86::CondCode X86Cond;
25433 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25434
25435 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25436 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25437 Overflow, Op->getFlags());
25438 }
25439
25440 // Look past the truncate if the high bits are known zero.
25442 Cond = Cond.getOperand(0);
25443
25444 EVT CondVT = Cond.getValueType();
25445
25446 // Add an AND with 1 if we don't already have one.
25447 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25448 Cond =
25449 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25450
25451 SDValue LHS = Cond;
25452 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25453
25454 SDValue CCVal;
25455 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25456 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25457 Op->getFlags());
25458}
25459
25460// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25461// Calls to _alloca are needed to probe the stack when allocating more than 4k
25462// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25463// that the guard pages used by the OS virtual memory manager are allocated in
25464// correct sequence.
25465SDValue
25466X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25467 SelectionDAG &DAG) const {
25469 bool SplitStack = MF.shouldSplitStack();
25470 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25471 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25472 SplitStack || EmitStackProbeCall;
25473 SDLoc dl(Op);
25474
25475 // Get the inputs.
25476 SDNode *Node = Op.getNode();
25477 SDValue Chain = Op.getOperand(0);
25478 SDValue Size = Op.getOperand(1);
25479 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25480 EVT VT = Node->getValueType(0);
25481
25482 // Chain the dynamic stack allocation so that it doesn't modify the stack
25483 // pointer when other instructions are using the stack.
25484 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25485
25486 bool Is64Bit = Subtarget.is64Bit();
25487 MVT SPTy = getPointerTy(DAG.getDataLayout());
25488
25490 if (!Lower) {
25491 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25493 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25494 " not tell us which reg is the stack pointer!");
25495
25496 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25497 const Align StackAlign = TFI.getStackAlign();
25498 if (hasInlineStackProbe(MF)) {
25499 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25500 {Chain, Size});
25501 Chain = Result.getValue(1);
25502 } else {
25503 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25504 Chain = SP.getValue(1);
25505 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25506 }
25507 if (Alignment && *Alignment > StackAlign)
25508 Result = DAG.getNode(
25509 ISD::AND, dl, VT, Result,
25510 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25511 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25512 } else if (SplitStack) {
25513 if (Is64Bit) {
25514 // The 64 bit implementation of segmented stacks needs to clobber both r10
25515 // r11. This makes it impossible to use it along with nested parameters.
25516 const Function &F = MF.getFunction();
25517 for (const auto &A : F.args()) {
25518 if (A.hasNestAttr())
25519 report_fatal_error("Cannot use segmented stacks with functions that "
25520 "have nested arguments.");
25521 }
25522 }
25523
25524 Result =
25525 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25526 Chain = Result.getValue(1);
25527 } else {
25528 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25529 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25530 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25531
25532 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25533 Register SPReg = RegInfo->getStackRegister();
25534 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25535 Chain = SP.getValue(1);
25536
25537 if (Alignment) {
25538 SP = DAG.getNode(
25539 ISD::AND, dl, VT, SP.getValue(0),
25540 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25541 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25542 }
25543
25544 Result = SP;
25545 }
25546
25547 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25548
25549 SDValue Ops[2] = {Result, Chain};
25550 return DAG.getMergeValues(Ops, dl);
25551}
25552
25553SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25555 auto PtrVT = getPointerTy(MF.getDataLayout());
25557
25558 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25559 SDLoc DL(Op);
25560
25561 if (!Subtarget.is64Bit() ||
25562 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25563 // vastart just stores the address of the VarArgsFrameIndex slot into the
25564 // memory location argument.
25565 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25566 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25567 MachinePointerInfo(SV));
25568 }
25569
25570 // __va_list_tag:
25571 // gp_offset (0 - 6 * 8)
25572 // fp_offset (48 - 48 + 8 * 16)
25573 // overflow_arg_area (point to parameters coming in memory).
25574 // reg_save_area
25576 SDValue FIN = Op.getOperand(1);
25577 // Store gp_offset
25578 SDValue Store = DAG.getStore(
25579 Op.getOperand(0), DL,
25580 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25581 MachinePointerInfo(SV));
25582 MemOps.push_back(Store);
25583
25584 // Store fp_offset
25585 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25586 Store = DAG.getStore(
25587 Op.getOperand(0), DL,
25588 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25589 MachinePointerInfo(SV, 4));
25590 MemOps.push_back(Store);
25591
25592 // Store ptr to overflow_arg_area
25593 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25594 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25595 Store =
25596 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25597 MemOps.push_back(Store);
25598
25599 // Store ptr to reg_save_area.
25600 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25601 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25602 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25603 Store = DAG.getStore(
25604 Op.getOperand(0), DL, RSFIN, FIN,
25605 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25606 MemOps.push_back(Store);
25607 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25608}
25609
25610SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25611 assert(Subtarget.is64Bit() &&
25612 "LowerVAARG only handles 64-bit va_arg!");
25613 assert(Op.getNumOperands() == 4);
25614
25616 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25617 // The Win64 ABI uses char* instead of a structure.
25618 return DAG.expandVAArg(Op.getNode());
25619
25620 SDValue Chain = Op.getOperand(0);
25621 SDValue SrcPtr = Op.getOperand(1);
25622 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25623 unsigned Align = Op.getConstantOperandVal(3);
25624 SDLoc dl(Op);
25625
25626 EVT ArgVT = Op.getNode()->getValueType(0);
25627 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25628 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25629 uint8_t ArgMode;
25630
25631 // Decide which area this value should be read from.
25632 // TODO: Implement the AMD64 ABI in its entirety. This simple
25633 // selection mechanism works only for the basic types.
25634 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25635 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25636 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25637 } else {
25638 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25639 "Unhandled argument type in LowerVAARG");
25640 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25641 }
25642
25643 if (ArgMode == 2) {
25644 // Make sure using fp_offset makes sense.
25645 assert(!Subtarget.useSoftFloat() &&
25646 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25647 Subtarget.hasSSE1());
25648 }
25649
25650 // Insert VAARG node into the DAG
25651 // VAARG returns two values: Variable Argument Address, Chain
25652 SDValue InstOps[] = {Chain, SrcPtr,
25653 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25654 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25655 DAG.getTargetConstant(Align, dl, MVT::i32)};
25656 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25659 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25660 /*Alignment=*/std::nullopt,
25662 Chain = VAARG.getValue(1);
25663
25664 // Load the next argument and return it
25665 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25666}
25667
25668static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25669 SelectionDAG &DAG) {
25670 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25671 // where a va_list is still an i8*.
25672 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25673 if (Subtarget.isCallingConvWin64(
25675 // Probably a Win64 va_copy.
25676 return DAG.expandVACopy(Op.getNode());
25677
25678 SDValue Chain = Op.getOperand(0);
25679 SDValue DstPtr = Op.getOperand(1);
25680 SDValue SrcPtr = Op.getOperand(2);
25681 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25682 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25683 SDLoc DL(Op);
25684
25685 return DAG.getMemcpy(
25686 Chain, DL, DstPtr, SrcPtr,
25687 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25688 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25689 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
25690 MachinePointerInfo(SrcSV));
25691}
25692
25693// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25694static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25695 switch (Opc) {
25696 case ISD::SHL:
25697 case X86ISD::VSHL:
25698 case X86ISD::VSHLI:
25699 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25700 case ISD::SRL:
25701 case X86ISD::VSRL:
25702 case X86ISD::VSRLI:
25703 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25704 case ISD::SRA:
25705 case X86ISD::VSRA:
25706 case X86ISD::VSRAI:
25707 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25708 }
25709 llvm_unreachable("Unknown target vector shift node");
25710}
25711
25712/// Handle vector element shifts where the shift amount is a constant.
25713/// Takes immediate version of shift as input.
25714static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25715 SDValue SrcOp, uint64_t ShiftAmt,
25716 SelectionDAG &DAG) {
25717 MVT ElementType = VT.getVectorElementType();
25718
25719 // Bitcast the source vector to the output type, this is mainly necessary for
25720 // vXi8/vXi64 shifts.
25721 if (VT != SrcOp.getSimpleValueType())
25722 SrcOp = DAG.getBitcast(VT, SrcOp);
25723
25724 // Fold this packed shift into its first operand if ShiftAmt is 0.
25725 if (ShiftAmt == 0)
25726 return SrcOp;
25727
25728 // Check for ShiftAmt >= element width
25729 if (ShiftAmt >= ElementType.getSizeInBits()) {
25730 if (Opc == X86ISD::VSRAI)
25731 ShiftAmt = ElementType.getSizeInBits() - 1;
25732 else
25733 return DAG.getConstant(0, dl, VT);
25734 }
25735
25736 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25737 && "Unknown target vector shift-by-constant node");
25738
25739 // Fold this packed vector shift into a build vector if SrcOp is a
25740 // vector of Constants or UNDEFs.
25742 unsigned ShiftOpc;
25743 switch (Opc) {
25744 default: llvm_unreachable("Unknown opcode!");
25745 case X86ISD::VSHLI:
25746 ShiftOpc = ISD::SHL;
25747 break;
25748 case X86ISD::VSRLI:
25749 ShiftOpc = ISD::SRL;
25750 break;
25751 case X86ISD::VSRAI:
25752 ShiftOpc = ISD::SRA;
25753 break;
25754 }
25755
25756 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
25757 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
25758 return C;
25759 }
25760
25761 return DAG.getNode(Opc, dl, VT, SrcOp,
25762 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25763}
25764
25765/// Handle vector element shifts by a splat shift amount
// \p Opc is the immediate-form X86ISD shift opcode, \p SrcOp the vector being
// shifted, and \p ShAmt a vector whose lane \p ShAmtIdx holds the (uniform)
// shift amount. The splat lane is funnelled into the low 64 bits of a 128-bit
// amount vector and the shift is emitted with the non-immediate opcode form.
25766 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25767                                    SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25768                                    const X86Subtarget &Subtarget,
25769                                    SelectionDAG &DAG) {
25770   MVT AmtVT = ShAmt.getSimpleValueType();
25771   assert(AmtVT.isVector() && "Vector shift type mismatch");
25772   assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25773          "Illegal vector splat index");
25774
25775   // Move the splat element to the bottom element.
25776   if (ShAmtIdx != 0) {
25777     SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25778     Mask[0] = ShAmtIdx;
25779     ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25780   }
25781
25782   // Peek through any zext node if we can get back to a 128-bit source.
25783   if (AmtVT.getScalarSizeInBits() == 64 &&
25784       (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25786       ShAmt.getOperand(0).getValueType().isSimple() &&
25787       ShAmt.getOperand(0).getValueType().is128BitVector()) {
25788     ShAmt = ShAmt.getOperand(0);
25789     AmtVT = ShAmt.getSimpleValueType();
25790   }
25791
25792   // See if we can mask off the upper elements using the existing source node.
25793   // The shift uses the entire lower 64-bits of the amount vector, so no need to
25794   // do this for vXi64 types.
25795   bool IsMasked = false;
25796   if (AmtVT.getScalarSizeInBits() < 64) {
25797     if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25798         ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25799       // If the shift amount has come from a scalar, then zero-extend the scalar
25800       // before moving to the vector.
25801       ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25802       ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
        // VZEXT_MOVL zeroes all lanes above lane 0, completing the
        // zero-extension of the amount within the vector.
25803       ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25804       AmtVT = MVT::v4i32;
25805       IsMasked = true;
25806     } else if (ShAmt.getOpcode() == ISD::AND) {
25807       // See if the shift amount is already masked (e.g. for rotation modulo),
25808       // then we can zero-extend it by setting all the other mask elements to
25809       // zero.
25810       SmallVector<SDValue> MaskElts(
25811           AmtVT.getVectorNumElements(),
25812           DAG.getConstant(0, dl, AmtVT.getScalarType()));
25813       MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25814       SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
        // Only fold when the existing AND operand is itself constant, so
        // the combined mask stays a constant vector.
25815       if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25816                                              {ShAmt.getOperand(1), Mask}))) {
25817         ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25818         IsMasked = true;
25819       }
25820     }
25821   }
25822
25823   // Extract if the shift amount vector is larger than 128-bits.
25824   if (AmtVT.getSizeInBits() > 128) {
25825     ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25826     AmtVT = ShAmt.getSimpleValueType();
25827   }
25828
25829   // Zero-extend bottom element to v2i64 vector type, either by extension or
25830   // shuffle masking.
25831   if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25832     if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25833                                 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25834       ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25835     } else if (Subtarget.hasSSE41()) {
25836       ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25837                           MVT::v2i64, ShAmt);
25838     } else {
        // Pre-SSE4.1 fallback: byte-shift the amount left then right so the
        // bytes above the bottom element of the low 64 bits become zero.
25839       SDValue ByteShift = DAG.getTargetConstant(
25840           (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25841       ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25842       ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25843                           ByteShift);
25844       ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25845                           ByteShift);
25846     }
25847   }
25848
25849   // Change opcode to non-immediate version.
25850   Opc = getTargetVShiftUniformOpcode(Opc, true);
25851
25852   // The return type has to be a 128-bit type with the same element
25853   // type as the input type.
25854   MVT EltVT = VT.getVectorElementType();
25855   MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25856
25857   ShAmt = DAG.getBitcast(ShVT, ShAmt);
25858   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25859}
25860
25861/// Return Mask with the necessary casting or extending
25862/// for \p Mask according to \p MaskVT when lowering masking intrinsics
// Converts a scalar-integer mask operand into an i1 vector of type \p MaskVT.
// Constant all-ones/all-zeros masks are emitted directly as constant splats.
25863 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25864                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
25865                            const SDLoc &dl) {
25866
  // Fast paths: a constant all-ones / zero scalar mask becomes an all-true /
  // all-false i1 vector without any bitcasting.
25867   if (isAllOnesConstant(Mask))
25868     return DAG.getConstant(1, dl, MaskVT);
25869   if (X86::isZeroNode(Mask))
25870     return DAG.getConstant(0, dl, MaskVT);
25871
25872   assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25873
25874   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25875     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25876     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25877     // In case 32bit mode, bitcast i64 is illegal, extend/split it.
25878     SDValue Lo, Hi;
25879     std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25880     Lo = DAG.getBitcast(MVT::v32i1, Lo);
25881     Hi = DAG.getBitcast(MVT::v32i1, Hi);
25882     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25883   } else {
    // Bitcast the whole scalar to a same-width i1 vector, then take the low
    // MaskVT-sized subvector.
25884     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25885                                      Mask.getSimpleValueType().getSizeInBits());
25886     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
25887     // are extracted by EXTRACT_SUBVECTOR.
25888     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25889                        DAG.getBitcast(BitcastVT, Mask),
25890                        DAG.getVectorIdxConstant(0, dl));
25891   }
25892}
25893
25894/// Return (and \p Op, \p Mask) for compare instructions or
25895/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25896/// necessary casting or extending for \p Mask when lowering masking intrinsics
// \p PreservedSrc supplies the values for lanes where the mask bit is clear;
// an undef pass-through is replaced with zeroes below.
25898                                   SDValue PreservedSrc,
25899                                   const X86Subtarget &Subtarget,
25900                                   SelectionDAG &DAG) {
25901   MVT VT = Op.getSimpleValueType();
25902   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25903   unsigned OpcodeSelect = ISD::VSELECT;
25904   SDLoc dl(Op);
25905
  // An all-ones mask selects every lane of Op; no select node is needed.
25906   if (isAllOnesConstant(Mask))
25907     return Op;
25908
25909   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25910
  // Materialize an undef pass-through as zeroes (same idiom used elsewhere in
  // this file, e.g. for COMPRESS_EXPAND_IN_REG, to avoid a false dependency).
25911   if (PreservedSrc.isUndef())
25912     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25913   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25914}
25915
25916/// Creates an SDNode for a predicated scalar operation.
25917/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25918/// The mask is coming as MVT::i8 and it should be transformed
25919/// to MVT::v1i1 while lowering masking intrinsics.
25920/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25921/// "X86select" instead of "vselect". We just can't create the "vselect" node
25922/// for a scalar instruction.
25924                                   SDValue PreservedSrc,
25925                                   const X86Subtarget &Subtarget,
25926                                   SelectionDAG &DAG) {
25927
  // Only bit 0 of the mask matters for a scalar op: if a constant mask has it
  // set, the operation is effectively unmasked.
25928   if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25929     if (MaskConst->getZExtValue() & 0x1)
25930       return Op;
25931
25932   MVT VT = Op.getSimpleValueType();
25933   SDLoc dl(Op);
25934
25935   assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
  // Reinterpret the i8 mask as v8i1 and keep only element 0 as a v1i1
  // predicate.
25936   SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25937                               DAG.getBitcast(MVT::v8i1, Mask),
25938                               DAG.getVectorIdxConstant(0, dl));
  // Compare/classify nodes already produce a mask result, so combine with the
  // predicate via AND instead of a select.
25939   if (Op.getOpcode() == X86ISD::FSETCCM ||
25940       Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25941       Op.getOpcode() == X86ISD::VFPCLASSS)
25942     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25943
  // Materialize an undef pass-through as zeroes.
25944   if (PreservedSrc.isUndef())
25945     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25946   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25947}
25948
// Returns the size in bytes of the MSVC EH registration node allocated for
// \p Fn; fatal-errors for functions without a 32-bit MSVC EH personality.
25950   if (!Fn->hasPersonalityFn())
25952                        "querying registration node size for function without personality");
25953   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25954   // WinEHStatePass for the full struct definition.
25955   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25956   case EHPersonality::MSVC_X86SEH: return 24;
25957   case EHPersonality::MSVC_CXX: return 16;
25958   default: break;
25959   }
25961                      "can only recover FP for 32-bit MSVC EH personality functions");
25962}
25963
25964/// When the MSVC runtime transfers control to us, either to an outlined
25965/// function or when returning to a parent frame after catching an exception, we
25966/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25967/// Here's the math:
25968///   RegNodeBase = EntryEBP - RegNodeSize
25969///   ParentFP = RegNodeBase - ParentFrameOffset
25970/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25971/// subtracting the offset (negative on x86) takes us back to the parent FP.
25973                                  SDValue EntryEBP) {
25975   SDLoc dl;
25976
25977   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25978   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25979
25980   // It's possible that the parent function no longer has a personality function
25981   // if the exceptional code was optimized away, in which case we just return
25982   // the incoming EBP.
25983   if (!Fn->hasPersonalityFn())
25984     return EntryEBP;
25985
25986   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25987   // registration, or the .set_setframe offset.
25990   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
  // LOCAL_RECOVER resolves, at MC time, to the frame offset recorded for the
  // parent function's registration node.
25991   SDValue ParentFrameOffset =
25992       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25993
25994   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25995   // prologue to RBP in the parent function.
25996   const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25997   if (Subtarget.is64Bit())
25998     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25999
26000   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26001   // RegNodeBase = EntryEBP - RegNodeSize
26002   // ParentFP = RegNodeBase - ParentFrameOffset
26003   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26004                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
26005   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26006}
26007
26008SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26009 SelectionDAG &DAG) const {
26010 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26011 auto isRoundModeCurDirection = [](SDValue Rnd) {
26012 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26013 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26014
26015 return false;
26016 };
26017 auto isRoundModeSAE = [](SDValue Rnd) {
26018 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26019 unsigned RC = C->getZExtValue();
26021 // Clear the NO_EXC bit and check remaining bits.
26023 // As a convenience we allow no other bits or explicitly
26024 // current direction.
26025 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26026 }
26027 }
26028
26029 return false;
26030 };
26031 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26032 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26033 RC = C->getZExtValue();
26035 // Clear the NO_EXC bit and check remaining bits.
26041 }
26042 }
26043
26044 return false;
26045 };
26046
26047 SDLoc dl(Op);
26048 unsigned IntNo = Op.getConstantOperandVal(0);
26049 MVT VT = Op.getSimpleValueType();
26050 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26051
26052 // Propagate flags from original node to transformed node(s).
26053 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26054
26055 if (IntrData) {
26056 switch(IntrData->Type) {
26057 case INTR_TYPE_1OP: {
26058 // We specify 2 possible opcodes for intrinsics with rounding modes.
26059 // First, we check if the intrinsic may have non-default rounding mode,
26060 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26061 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26062 if (IntrWithRoundingModeOpcode != 0) {
26063 SDValue Rnd = Op.getOperand(2);
26064 unsigned RC = 0;
26065 if (isRoundModeSAEToX(Rnd, RC))
26066 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26067 Op.getOperand(1),
26068 DAG.getTargetConstant(RC, dl, MVT::i32));
26069 if (!isRoundModeCurDirection(Rnd))
26070 return SDValue();
26071 }
26072 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26073 Op.getOperand(1));
26074 }
26075 case INTR_TYPE_1OP_SAE: {
26076 SDValue Sae = Op.getOperand(2);
26077
26078 unsigned Opc;
26079 if (isRoundModeCurDirection(Sae))
26080 Opc = IntrData->Opc0;
26081 else if (isRoundModeSAE(Sae))
26082 Opc = IntrData->Opc1;
26083 else
26084 return SDValue();
26085
26086 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26087 }
26088 case INTR_TYPE_2OP: {
26089 SDValue Src2 = Op.getOperand(2);
26090
26091 // We specify 2 possible opcodes for intrinsics with rounding modes.
26092 // First, we check if the intrinsic may have non-default rounding mode,
26093 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26094 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26095 if (IntrWithRoundingModeOpcode != 0) {
26096 SDValue Rnd = Op.getOperand(3);
26097 unsigned RC = 0;
26098 if (isRoundModeSAEToX(Rnd, RC))
26099 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26100 Op.getOperand(1), Src2,
26101 DAG.getTargetConstant(RC, dl, MVT::i32));
26102 if (!isRoundModeCurDirection(Rnd))
26103 return SDValue();
26104 }
26105
26106 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26107 Op.getOperand(1), Src2);
26108 }
26109 case INTR_TYPE_2OP_SAE: {
26110 SDValue Sae = Op.getOperand(3);
26111
26112 unsigned Opc;
26113 if (isRoundModeCurDirection(Sae))
26114 Opc = IntrData->Opc0;
26115 else if (isRoundModeSAE(Sae))
26116 Opc = IntrData->Opc1;
26117 else
26118 return SDValue();
26119
26120 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26121 Op.getOperand(2));
26122 }
26123 case INTR_TYPE_3OP:
26124 case INTR_TYPE_3OP_IMM8: {
26125 SDValue Src1 = Op.getOperand(1);
26126 SDValue Src2 = Op.getOperand(2);
26127 SDValue Src3 = Op.getOperand(3);
26128
26129 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26130 Src3.getValueType() != MVT::i8) {
26131 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26132 }
26133
26134 // We specify 2 possible opcodes for intrinsics with rounding modes.
26135 // First, we check if the intrinsic may have non-default rounding mode,
26136 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26137 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26138 if (IntrWithRoundingModeOpcode != 0) {
26139 SDValue Rnd = Op.getOperand(4);
26140 unsigned RC = 0;
26141 if (isRoundModeSAEToX(Rnd, RC))
26142 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26143 Src1, Src2, Src3,
26144 DAG.getTargetConstant(RC, dl, MVT::i32));
26145 if (!isRoundModeCurDirection(Rnd))
26146 return SDValue();
26147 }
26148
26149 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26150 {Src1, Src2, Src3});
26151 }
26152 case INTR_TYPE_4OP_IMM8: {
26153 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26154 SDValue Src4 = Op.getOperand(4);
26155 if (Src4.getValueType() != MVT::i8) {
26156 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26157 }
26158
26159 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26160 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26161 Src4);
26162 }
26163 case INTR_TYPE_1OP_MASK: {
26164 SDValue Src = Op.getOperand(1);
26165 SDValue PassThru = Op.getOperand(2);
26166 SDValue Mask = Op.getOperand(3);
26167 // We add rounding mode to the Node when
26168 // - RC Opcode is specified and
26169 // - RC is not "current direction".
26170 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26171 if (IntrWithRoundingModeOpcode != 0) {
26172 SDValue Rnd = Op.getOperand(4);
26173 unsigned RC = 0;
26174 if (isRoundModeSAEToX(Rnd, RC))
26175 return getVectorMaskingNode(
26176 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26177 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26178 Mask, PassThru, Subtarget, DAG);
26179 if (!isRoundModeCurDirection(Rnd))
26180 return SDValue();
26181 }
26182 return getVectorMaskingNode(
26183 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26184 Subtarget, DAG);
26185 }
26187 SDValue Src = Op.getOperand(1);
26188 SDValue PassThru = Op.getOperand(2);
26189 SDValue Mask = Op.getOperand(3);
26190 SDValue Rnd = Op.getOperand(4);
26191
26192 unsigned Opc;
26193 if (isRoundModeCurDirection(Rnd))
26194 Opc = IntrData->Opc0;
26195 else if (isRoundModeSAE(Rnd))
26196 Opc = IntrData->Opc1;
26197 else
26198 return SDValue();
26199
26200 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26201 Subtarget, DAG);
26202 }
26203 case INTR_TYPE_SCALAR_MASK: {
26204 SDValue Src1 = Op.getOperand(1);
26205 SDValue Src2 = Op.getOperand(2);
26206 SDValue passThru = Op.getOperand(3);
26207 SDValue Mask = Op.getOperand(4);
26208 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26209 // There are 2 kinds of intrinsics in this group:
26210 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
26211 // (2) With rounding mode and sae - 7 operands.
26212 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26213 if (Op.getNumOperands() == (5U + HasRounding)) {
26214 if (HasRounding) {
26215 SDValue Rnd = Op.getOperand(5);
26216 unsigned RC = 0;
26217 if (isRoundModeSAEToX(Rnd, RC))
26218 return getScalarMaskingNode(
26219 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26220 DAG.getTargetConstant(RC, dl, MVT::i32)),
26221 Mask, passThru, Subtarget, DAG);
26222 if (!isRoundModeCurDirection(Rnd))
26223 return SDValue();
26224 }
26225 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26226 Src2),
26227 Mask, passThru, Subtarget, DAG);
26228 }
26229
26230 assert(Op.getNumOperands() == (6U + HasRounding) &&
26231 "Unexpected intrinsic form");
26232 SDValue RoundingMode = Op.getOperand(5);
26233 unsigned Opc = IntrData->Opc0;
26234 if (HasRounding) {
26235 SDValue Sae = Op.getOperand(6);
26236 if (isRoundModeSAE(Sae))
26237 Opc = IntrWithRoundingModeOpcode;
26238 else if (!isRoundModeCurDirection(Sae))
26239 return SDValue();
26240 }
26241 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26242 Src2, RoundingMode),
26243 Mask, passThru, Subtarget, DAG);
26244 }
26246 SDValue Src1 = Op.getOperand(1);
26247 SDValue Src2 = Op.getOperand(2);
26248 SDValue passThru = Op.getOperand(3);
26249 SDValue Mask = Op.getOperand(4);
26250 SDValue Rnd = Op.getOperand(5);
26251
26252 SDValue NewOp;
26253 unsigned RC = 0;
26254 if (isRoundModeCurDirection(Rnd))
26255 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26256 else if (isRoundModeSAEToX(Rnd, RC))
26257 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26258 DAG.getTargetConstant(RC, dl, MVT::i32));
26259 else
26260 return SDValue();
26261
26262 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26263 }
26265 SDValue Src1 = Op.getOperand(1);
26266 SDValue Src2 = Op.getOperand(2);
26267 SDValue passThru = Op.getOperand(3);
26268 SDValue Mask = Op.getOperand(4);
26269 SDValue Sae = Op.getOperand(5);
26270 unsigned Opc;
26271 if (isRoundModeCurDirection(Sae))
26272 Opc = IntrData->Opc0;
26273 else if (isRoundModeSAE(Sae))
26274 Opc = IntrData->Opc1;
26275 else
26276 return SDValue();
26277
26278 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26279 Mask, passThru, Subtarget, DAG);
26280 }
26281 case INTR_TYPE_2OP_MASK: {
26282 SDValue Src1 = Op.getOperand(1);
26283 SDValue Src2 = Op.getOperand(2);
26284 SDValue PassThru = Op.getOperand(3);
26285 SDValue Mask = Op.getOperand(4);
26286 SDValue NewOp;
26287 if (IntrData->Opc1 != 0) {
26288 SDValue Rnd = Op.getOperand(5);
26289 unsigned RC = 0;
26290 if (isRoundModeSAEToX(Rnd, RC))
26291 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26292 DAG.getTargetConstant(RC, dl, MVT::i32));
26293 else if (!isRoundModeCurDirection(Rnd))
26294 return SDValue();
26295 }
26296 if (!NewOp)
26297 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26298 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26299 }
26301 SDValue Src1 = Op.getOperand(1);
26302 SDValue Src2 = Op.getOperand(2);
26303 SDValue PassThru = Op.getOperand(3);
26304 SDValue Mask = Op.getOperand(4);
26305
26306 unsigned Opc = IntrData->Opc0;
26307 if (IntrData->Opc1 != 0) {
26308 SDValue Sae = Op.getOperand(5);
26309 if (isRoundModeSAE(Sae))
26310 Opc = IntrData->Opc1;
26311 else if (!isRoundModeCurDirection(Sae))
26312 return SDValue();
26313 }
26314
26315 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26316 Mask, PassThru, Subtarget, DAG);
26317 }
26319 SDValue Src1 = Op.getOperand(1);
26320 SDValue Src2 = Op.getOperand(2);
26321 SDValue Src3 = Op.getOperand(3);
26322 SDValue PassThru = Op.getOperand(4);
26323 SDValue Mask = Op.getOperand(5);
26324 SDValue Sae = Op.getOperand(6);
26325 unsigned Opc;
26326 if (isRoundModeCurDirection(Sae))
26327 Opc = IntrData->Opc0;
26328 else if (isRoundModeSAE(Sae))
26329 Opc = IntrData->Opc1;
26330 else
26331 return SDValue();
26332
26333 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26334 Mask, PassThru, Subtarget, DAG);
26335 }
26337 SDValue Src1 = Op.getOperand(1);
26338 SDValue Src2 = Op.getOperand(2);
26339 SDValue Src3 = Op.getOperand(3);
26340 SDValue PassThru = Op.getOperand(4);
26341 SDValue Mask = Op.getOperand(5);
26342
26343 unsigned Opc = IntrData->Opc0;
26344 if (IntrData->Opc1 != 0) {
26345 SDValue Sae = Op.getOperand(6);
26346 if (isRoundModeSAE(Sae))
26347 Opc = IntrData->Opc1;
26348 else if (!isRoundModeCurDirection(Sae))
26349 return SDValue();
26350 }
26351 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26352 Mask, PassThru, Subtarget, DAG);
26353 }
26354 case BLENDV: {
26355 SDValue Src1 = Op.getOperand(1);
26356 SDValue Src2 = Op.getOperand(2);
26357 SDValue Src3 = Op.getOperand(3);
26358
26360 Src3 = DAG.getBitcast(MaskVT, Src3);
26361
26362 // Reverse the operands to match VSELECT order.
26363 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26364 }
26365 case VPERM_2OP : {
26366 SDValue Src1 = Op.getOperand(1);
26367 SDValue Src2 = Op.getOperand(2);
26368
26369 // Swap Src1 and Src2 in the node creation
26370 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26371 }
26372 case CFMA_OP_MASKZ:
26373 case CFMA_OP_MASK: {
26374 SDValue Src1 = Op.getOperand(1);
26375 SDValue Src2 = Op.getOperand(2);
26376 SDValue Src3 = Op.getOperand(3);
26377 SDValue Mask = Op.getOperand(4);
26378 MVT VT = Op.getSimpleValueType();
26379
26380 SDValue PassThru = Src3;
26381 if (IntrData->Type == CFMA_OP_MASKZ)
26382 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26383
26384 // We add rounding mode to the Node when
26385 // - RC Opcode is specified and
26386 // - RC is not "current direction".
26387 SDValue NewOp;
26388 if (IntrData->Opc1 != 0) {
26389 SDValue Rnd = Op.getOperand(5);
26390 unsigned RC = 0;
26391 if (isRoundModeSAEToX(Rnd, RC))
26392 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26393 DAG.getTargetConstant(RC, dl, MVT::i32));
26394 else if (!isRoundModeCurDirection(Rnd))
26395 return SDValue();
26396 }
26397 if (!NewOp)
26398 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26399 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26400 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26401 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26402 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26403 }
26404 case IFMA_OP:
26405 // NOTE: We need to swizzle the operands to pass the multiply operands
26406 // first.
26407 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26408 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26409 case FPCLASSS: {
26410 SDValue Src1 = Op.getOperand(1);
26411 SDValue Imm = Op.getOperand(2);
26412 SDValue Mask = Op.getOperand(3);
26413 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26414 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26415 Subtarget, DAG);
26416 // Need to fill with zeros to ensure the bitcast will produce zeroes
26417 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26418 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26419 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26420 DAG.getVectorIdxConstant(0, dl));
26421 return DAG.getBitcast(MVT::i8, Ins);
26422 }
26423
26424 case CMP_MASK_CC: {
26425 MVT MaskVT = Op.getSimpleValueType();
26426 SDValue CC = Op.getOperand(3);
26427 SDValue Mask = Op.getOperand(4);
26428 // We specify 2 possible opcodes for intrinsics with rounding modes.
26429 // First, we check if the intrinsic may have non-default rounding mode,
26430 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26431 if (IntrData->Opc1 != 0) {
26432 SDValue Sae = Op.getOperand(5);
26433 if (isRoundModeSAE(Sae))
26434 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26435 Op.getOperand(2), CC, Mask, Sae);
26436 if (!isRoundModeCurDirection(Sae))
26437 return SDValue();
26438 }
26439 //default rounding mode
26440 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26441 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26442 }
26443 case CMP_MASK_SCALAR_CC: {
26444 SDValue Src1 = Op.getOperand(1);
26445 SDValue Src2 = Op.getOperand(2);
26446 SDValue CC = Op.getOperand(3);
26447 SDValue Mask = Op.getOperand(4);
26448
26449 SDValue Cmp;
26450 if (IntrData->Opc1 != 0) {
26451 SDValue Sae = Op.getOperand(5);
26452 if (isRoundModeSAE(Sae))
26453 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26454 else if (!isRoundModeCurDirection(Sae))
26455 return SDValue();
26456 }
26457 //default rounding mode
26458 if (!Cmp.getNode())
26459 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26460
26461 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26462 Subtarget, DAG);
26463 // Need to fill with zeros to ensure the bitcast will produce zeroes
26464 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26465 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26466 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26467 DAG.getVectorIdxConstant(0, dl));
26468 return DAG.getBitcast(MVT::i8, Ins);
26469 }
26470 case COMI: { // Comparison intrinsics
26471 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26472 SDValue LHS = Op.getOperand(1);
26473 SDValue RHS = Op.getOperand(2);
26474 // Some conditions require the operands to be swapped.
26475 if (CC == ISD::SETLT || CC == ISD::SETLE)
26476 std::swap(LHS, RHS);
26477
26478 // For AVX10.2, Support EQ and NE.
26479 bool HasAVX10_2_COMX =
26480 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26481
26482 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26483 // For BF type we need to fall back.
26484 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26485
26486 auto ComiOpCode = IntrData->Opc0;
26487 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26488
26489 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26490 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26491
26492 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26493
26494 SDValue SetCC;
26495 switch (CC) {
26496 case ISD::SETEQ: {
26497 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26498 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26499 break;
26500 // (ZF = 1 and PF = 0)
26501 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26502 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26503 break;
26504 }
26505 case ISD::SETNE: {
26506 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26507 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26508 break;
26509 // (ZF = 0 or PF = 1)
26510 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26511 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26512 break;
26513 }
26514 case ISD::SETGT: // (CF = 0 and ZF = 0)
26515 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26516 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26517 break;
26518 }
26519 case ISD::SETGE: // CF = 0
26520 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26521 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26522 break;
26523 default:
26524 llvm_unreachable("Unexpected illegal condition!");
26525 }
26526 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26527 }
26528 case COMI_RM: { // Comparison intrinsics with Sae
26529 SDValue LHS = Op.getOperand(1);
26530 SDValue RHS = Op.getOperand(2);
26531 unsigned CondVal = Op.getConstantOperandVal(3);
26532 SDValue Sae = Op.getOperand(4);
26533
26534 SDValue FCmp;
26535 if (isRoundModeCurDirection(Sae))
26536 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26537 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26538 else if (isRoundModeSAE(Sae))
26539 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26540 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26541 else
26542 return SDValue();
26543 // Need to fill with zeros to ensure the bitcast will produce zeroes
26544 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26545 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26546 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26547 DAG.getVectorIdxConstant(0, dl));
26548 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26549 DAG.getBitcast(MVT::i16, Ins));
26550 }
26551 case VSHIFT: {
26552 SDValue SrcOp = Op.getOperand(1);
26553 SDValue ShAmt = Op.getOperand(2);
26554 assert(ShAmt.getValueType() == MVT::i32 &&
26555 "Unexpected VSHIFT amount type");
26556
26557 // Catch shift-by-constant.
26558 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26559 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26560 Op.getSimpleValueType(), SrcOp,
26561 CShAmt->getZExtValue(), DAG);
26562
26563 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26564 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26565 SrcOp, ShAmt, 0, Subtarget, DAG);
26566 }
26568 SDValue Mask = Op.getOperand(3);
26569 SDValue DataToCompress = Op.getOperand(1);
26570 SDValue PassThru = Op.getOperand(2);
26571 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26572 return Op.getOperand(1);
26573
26574 // Avoid false dependency.
26575 if (PassThru.isUndef())
26576 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26577
26578 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26579 Mask);
26580 }
26581 case FIXUPIMM:
26582 case FIXUPIMM_MASKZ: {
26583 SDValue Src1 = Op.getOperand(1);
26584 SDValue Src2 = Op.getOperand(2);
26585 SDValue Src3 = Op.getOperand(3);
26586 SDValue Imm = Op.getOperand(4);
26587 SDValue Mask = Op.getOperand(5);
26588 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26589 ? Src1
26590 : getZeroVector(VT, Subtarget, DAG, dl);
26591
26592 unsigned Opc = IntrData->Opc0;
26593 if (IntrData->Opc1 != 0) {
26594 SDValue Sae = Op.getOperand(6);
26595 if (isRoundModeSAE(Sae))
26596 Opc = IntrData->Opc1;
26597 else if (!isRoundModeCurDirection(Sae))
26598 return SDValue();
26599 }
26600
26601 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26602
26603 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26604 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26605
26606 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26607 }
26608 case ROUNDP: {
26609 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26610 // Clear the upper bits of the rounding immediate so that the legacy
26611 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26612 uint64_t Round = Op.getConstantOperandVal(2);
26613 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26614 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26615 Op.getOperand(1), RoundingMode);
26616 }
26617 case ROUNDS: {
26618 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26619 // Clear the upper bits of the rounding immediate so that the legacy
26620 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26621 uint64_t Round = Op.getConstantOperandVal(3);
26622 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26623 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26624 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26625 }
26626 case BEXTRI: {
26627 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26628
26629 uint64_t Imm = Op.getConstantOperandVal(2);
26630 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26631 Op.getValueType());
26632 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26633 Op.getOperand(1), Control);
26634 }
26635 // ADC/SBB
26636 case ADX: {
26637 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26638 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26639
26640 SDValue Res;
26641 // If the carry in is zero, then we should just use ADD/SUB instead of
26642 // ADC/SBB.
26643 if (isNullConstant(Op.getOperand(1))) {
26644 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26645 Op.getOperand(3));
26646 } else {
26647 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26648 DAG.getAllOnesConstant(dl, MVT::i8));
26649 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26650 Op.getOperand(3), GenCF.getValue(1));
26651 }
26652 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26653 SDValue Results[] = { SetCC, Res };
26654 return DAG.getMergeValues(Results, dl);
26655 }
26656 case CVTPD2PS_MASK:
26657 case CVTPD2DQ_MASK:
26658 case CVTQQ2PS_MASK:
26659 case TRUNCATE_TO_REG: {
26660 SDValue Src = Op.getOperand(1);
26661 SDValue PassThru = Op.getOperand(2);
26662 SDValue Mask = Op.getOperand(3);
26663
26664 if (isAllOnesConstant(Mask))
26665 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26666
26667 MVT SrcVT = Src.getSimpleValueType();
26668 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26669 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26670 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26671 {Src, PassThru, Mask});
26672 }
26673 case TRUNCATE2_TO_REG: {
26674 SDValue Src = Op.getOperand(1);
26675 SDValue Src2 = Op.getOperand(2);
26676 SDValue PassThru = Op.getOperand(3);
26677 SDValue Mask = Op.getOperand(4);
26678
26679 if (isAllOnesConstant(Mask))
26680 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
26681
26682 MVT Src2VT = Src2.getSimpleValueType();
26683 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
26684 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26685 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26686 {Src, Src2, PassThru, Mask});
26687 }
26688 case CVTPS2PH_MASK: {
26689 SDValue Src = Op.getOperand(1);
26690 SDValue Rnd = Op.getOperand(2);
26691 SDValue PassThru = Op.getOperand(3);
26692 SDValue Mask = Op.getOperand(4);
26693
26694 unsigned RC = 0;
26695 unsigned Opc = IntrData->Opc0;
26696 bool SAE = Src.getValueType().is512BitVector() &&
26697 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
26698 if (SAE) {
26700 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
26701 }
26702
26703 if (isAllOnesConstant(Mask))
26704 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
26705
26706 if (SAE)
26708 else
26709 Opc = IntrData->Opc1;
26710 MVT SrcVT = Src.getSimpleValueType();
26711 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26712 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26713 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
26714 }
26715 case CVTNEPS2BF16_MASK: {
26716 SDValue Src = Op.getOperand(1);
26717 SDValue PassThru = Op.getOperand(2);
26718 SDValue Mask = Op.getOperand(3);
26719
26720 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26721 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26722
26723 // Break false dependency.
26724 if (PassThru.isUndef())
26725 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26726
26727 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26728 Mask);
26729 }
26730 default:
26731 break;
26732 }
26733 }
26734
26735 switch (IntNo) {
26736 default: return SDValue(); // Don't custom lower most intrinsics.
26737
26738 // ptest and testp intrinsics. The intrinsic these come from are designed to
26739 // return an integer value, not just an instruction so lower it to the ptest
26740 // or testp pattern and a setcc for the result.
26741 case Intrinsic::x86_avx512_ktestc_b:
26742 case Intrinsic::x86_avx512_ktestc_w:
26743 case Intrinsic::x86_avx512_ktestc_d:
26744 case Intrinsic::x86_avx512_ktestc_q:
26745 case Intrinsic::x86_avx512_ktestz_b:
26746 case Intrinsic::x86_avx512_ktestz_w:
26747 case Intrinsic::x86_avx512_ktestz_d:
26748 case Intrinsic::x86_avx512_ktestz_q:
26749 case Intrinsic::x86_sse41_ptestz:
26750 case Intrinsic::x86_sse41_ptestc:
26751 case Intrinsic::x86_sse41_ptestnzc:
26752 case Intrinsic::x86_avx_ptestz_256:
26753 case Intrinsic::x86_avx_ptestc_256:
26754 case Intrinsic::x86_avx_ptestnzc_256:
26755 case Intrinsic::x86_avx_vtestz_ps:
26756 case Intrinsic::x86_avx_vtestc_ps:
26757 case Intrinsic::x86_avx_vtestnzc_ps:
26758 case Intrinsic::x86_avx_vtestz_pd:
26759 case Intrinsic::x86_avx_vtestc_pd:
26760 case Intrinsic::x86_avx_vtestnzc_pd:
26761 case Intrinsic::x86_avx_vtestz_ps_256:
26762 case Intrinsic::x86_avx_vtestc_ps_256:
26763 case Intrinsic::x86_avx_vtestnzc_ps_256:
26764 case Intrinsic::x86_avx_vtestz_pd_256:
26765 case Intrinsic::x86_avx_vtestc_pd_256:
26766 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26767 unsigned TestOpc = X86ISD::PTEST;
26768 X86::CondCode X86CC;
26769 switch (IntNo) {
26770 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26771 case Intrinsic::x86_avx512_ktestc_b:
26772 case Intrinsic::x86_avx512_ktestc_w:
26773 case Intrinsic::x86_avx512_ktestc_d:
26774 case Intrinsic::x86_avx512_ktestc_q:
26775 // CF = 1
26776 TestOpc = X86ISD::KTEST;
26777 X86CC = X86::COND_B;
26778 break;
26779 case Intrinsic::x86_avx512_ktestz_b:
26780 case Intrinsic::x86_avx512_ktestz_w:
26781 case Intrinsic::x86_avx512_ktestz_d:
26782 case Intrinsic::x86_avx512_ktestz_q:
26783 TestOpc = X86ISD::KTEST;
26784 X86CC = X86::COND_E;
26785 break;
26786 case Intrinsic::x86_avx_vtestz_ps:
26787 case Intrinsic::x86_avx_vtestz_pd:
26788 case Intrinsic::x86_avx_vtestz_ps_256:
26789 case Intrinsic::x86_avx_vtestz_pd_256:
26790 TestOpc = X86ISD::TESTP;
26791 [[fallthrough]];
26792 case Intrinsic::x86_sse41_ptestz:
26793 case Intrinsic::x86_avx_ptestz_256:
26794 // ZF = 1
26795 X86CC = X86::COND_E;
26796 break;
26797 case Intrinsic::x86_avx_vtestc_ps:
26798 case Intrinsic::x86_avx_vtestc_pd:
26799 case Intrinsic::x86_avx_vtestc_ps_256:
26800 case Intrinsic::x86_avx_vtestc_pd_256:
26801 TestOpc = X86ISD::TESTP;
26802 [[fallthrough]];
26803 case Intrinsic::x86_sse41_ptestc:
26804 case Intrinsic::x86_avx_ptestc_256:
26805 // CF = 1
26806 X86CC = X86::COND_B;
26807 break;
26808 case Intrinsic::x86_avx_vtestnzc_ps:
26809 case Intrinsic::x86_avx_vtestnzc_pd:
26810 case Intrinsic::x86_avx_vtestnzc_ps_256:
26811 case Intrinsic::x86_avx_vtestnzc_pd_256:
26812 TestOpc = X86ISD::TESTP;
26813 [[fallthrough]];
26814 case Intrinsic::x86_sse41_ptestnzc:
26815 case Intrinsic::x86_avx_ptestnzc_256:
26816 // ZF and CF = 0
26817 X86CC = X86::COND_A;
26818 break;
26819 }
26820
26821 SDValue LHS = Op.getOperand(1);
26822 SDValue RHS = Op.getOperand(2);
26823 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26824 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26825 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26826 }
26827
26828 case Intrinsic::x86_sse42_pcmpistria128:
26829 case Intrinsic::x86_sse42_pcmpestria128:
26830 case Intrinsic::x86_sse42_pcmpistric128:
26831 case Intrinsic::x86_sse42_pcmpestric128:
26832 case Intrinsic::x86_sse42_pcmpistrio128:
26833 case Intrinsic::x86_sse42_pcmpestrio128:
26834 case Intrinsic::x86_sse42_pcmpistris128:
26835 case Intrinsic::x86_sse42_pcmpestris128:
26836 case Intrinsic::x86_sse42_pcmpistriz128:
26837 case Intrinsic::x86_sse42_pcmpestriz128: {
26838 unsigned Opcode;
26839 X86::CondCode X86CC;
26840 switch (IntNo) {
26841 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26842 case Intrinsic::x86_sse42_pcmpistria128:
26843 Opcode = X86ISD::PCMPISTR;
26844 X86CC = X86::COND_A;
26845 break;
26846 case Intrinsic::x86_sse42_pcmpestria128:
26847 Opcode = X86ISD::PCMPESTR;
26848 X86CC = X86::COND_A;
26849 break;
26850 case Intrinsic::x86_sse42_pcmpistric128:
26851 Opcode = X86ISD::PCMPISTR;
26852 X86CC = X86::COND_B;
26853 break;
26854 case Intrinsic::x86_sse42_pcmpestric128:
26855 Opcode = X86ISD::PCMPESTR;
26856 X86CC = X86::COND_B;
26857 break;
26858 case Intrinsic::x86_sse42_pcmpistrio128:
26859 Opcode = X86ISD::PCMPISTR;
26860 X86CC = X86::COND_O;
26861 break;
26862 case Intrinsic::x86_sse42_pcmpestrio128:
26863 Opcode = X86ISD::PCMPESTR;
26864 X86CC = X86::COND_O;
26865 break;
26866 case Intrinsic::x86_sse42_pcmpistris128:
26867 Opcode = X86ISD::PCMPISTR;
26868 X86CC = X86::COND_S;
26869 break;
26870 case Intrinsic::x86_sse42_pcmpestris128:
26871 Opcode = X86ISD::PCMPESTR;
26872 X86CC = X86::COND_S;
26873 break;
26874 case Intrinsic::x86_sse42_pcmpistriz128:
26875 Opcode = X86ISD::PCMPISTR;
26876 X86CC = X86::COND_E;
26877 break;
26878 case Intrinsic::x86_sse42_pcmpestriz128:
26879 Opcode = X86ISD::PCMPESTR;
26880 X86CC = X86::COND_E;
26881 break;
26882 }
26884 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26885 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26886 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26887 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26888 }
26889
26890 case Intrinsic::x86_sse42_pcmpistri128:
26891 case Intrinsic::x86_sse42_pcmpestri128: {
26892 unsigned Opcode;
26893 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26894 Opcode = X86ISD::PCMPISTR;
26895 else
26896 Opcode = X86ISD::PCMPESTR;
26897
26899 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26900 return DAG.getNode(Opcode, dl, VTs, NewOps);
26901 }
26902
26903 case Intrinsic::x86_sse42_pcmpistrm128:
26904 case Intrinsic::x86_sse42_pcmpestrm128: {
26905 unsigned Opcode;
26906 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26907 Opcode = X86ISD::PCMPISTR;
26908 else
26909 Opcode = X86ISD::PCMPESTR;
26910
26912 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26913 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26914 }
26915
26916 case Intrinsic::eh_sjlj_lsda: {
26918 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26919 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26920 auto &Context = MF.getContext();
26921 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26922 Twine(MF.getFunctionNumber()));
26923 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26924 DAG.getMCSymbol(S, PtrVT));
26925 }
26926
26927 case Intrinsic::x86_seh_lsda: {
26928 // Compute the symbol for the LSDA. We know it'll get emitted later.
26930 SDValue Op1 = Op.getOperand(1);
26931 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26934
26935 // Generate a simple absolute symbol reference. This intrinsic is only
26936 // supported on 32-bit Windows, which isn't PIC.
26937 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26938 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26939 }
26940
26941 case Intrinsic::eh_recoverfp: {
26942 SDValue FnOp = Op.getOperand(1);
26943 SDValue IncomingFPOp = Op.getOperand(2);
26944 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26945 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26946 if (!Fn)
26948 "llvm.eh.recoverfp must take a function as the first argument");
26949 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26950 }
26951
26952 case Intrinsic::localaddress: {
26953 // Returns one of the stack, base, or frame pointer registers, depending on
26954 // which is used to reference local variables.
26956 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26957 unsigned Reg;
26958 if (RegInfo->hasBasePointer(MF))
26959 Reg = RegInfo->getBaseRegister();
26960 else { // Handles the SP or FP case.
26961 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26962 if (CantUseFP)
26963 Reg = RegInfo->getPtrSizedStackRegister(MF);
26964 else
26965 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26966 }
26967 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26968 }
26969 case Intrinsic::x86_avx512_vp2intersect_q_512:
26970 case Intrinsic::x86_avx512_vp2intersect_q_256:
26971 case Intrinsic::x86_avx512_vp2intersect_q_128:
26972 case Intrinsic::x86_avx512_vp2intersect_d_512:
26973 case Intrinsic::x86_avx512_vp2intersect_d_256:
26974 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26975 MVT MaskVT = Op.getSimpleValueType();
26976
26977 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26978 SDLoc DL(Op);
26979
26982 Op->getOperand(1), Op->getOperand(2));
26983
26984 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26985 MaskVT, Operation);
26986 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26987 MaskVT, Operation);
26988 return DAG.getMergeValues({Result0, Result1}, DL);
26989 }
26990 case Intrinsic::x86_mmx_pslli_w:
26991 case Intrinsic::x86_mmx_pslli_d:
26992 case Intrinsic::x86_mmx_pslli_q:
26993 case Intrinsic::x86_mmx_psrli_w:
26994 case Intrinsic::x86_mmx_psrli_d:
26995 case Intrinsic::x86_mmx_psrli_q:
26996 case Intrinsic::x86_mmx_psrai_w:
26997 case Intrinsic::x86_mmx_psrai_d: {
26998 SDLoc DL(Op);
26999 SDValue ShAmt = Op.getOperand(2);
27000 // If the argument is a constant, convert it to a target constant.
27001 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27002 // Clamp out of bounds shift amounts since they will otherwise be masked
27003 // to 8-bits which may make it no longer out of bounds.
27004 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27005 if (ShiftAmount == 0)
27006 return Op.getOperand(1);
27007
27008 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27009 Op.getOperand(0), Op.getOperand(1),
27010 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27011 }
27012
27013 unsigned NewIntrinsic;
27014 switch (IntNo) {
27015 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27016 case Intrinsic::x86_mmx_pslli_w:
27017 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27018 break;
27019 case Intrinsic::x86_mmx_pslli_d:
27020 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27021 break;
27022 case Intrinsic::x86_mmx_pslli_q:
27023 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27024 break;
27025 case Intrinsic::x86_mmx_psrli_w:
27026 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27027 break;
27028 case Intrinsic::x86_mmx_psrli_d:
27029 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27030 break;
27031 case Intrinsic::x86_mmx_psrli_q:
27032 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27033 break;
27034 case Intrinsic::x86_mmx_psrai_w:
27035 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27036 break;
27037 case Intrinsic::x86_mmx_psrai_d:
27038 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27039 break;
27040 }
27041
27042 // The vector shift intrinsics with scalars uses 32b shift amounts but
27043 // the sse2/mmx shift instructions reads 64 bits. Copy the 32 bits to an
27044 // MMX register.
27045 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27046 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27047 DAG.getTargetConstant(NewIntrinsic, DL,
27049 Op.getOperand(1), ShAmt);
27050 }
27051 case Intrinsic::thread_pointer: {
27052 if (Subtarget.isTargetELF()) {
27053 SDLoc dl(Op);
27054 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27055 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27057 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27058 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27060 }
27062 "Target OS doesn't support __builtin_thread_pointer() yet.");
27063 }
27064 }
27065}
27066
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  // The scale operand must be an immediate; bail out and let other lowering
  // paths handle the intrinsic otherwise.
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Re-emit the scale as a pointer-sized target constant.
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
                                        TLI.getPointerTy(DAG.getDataLayout()));
  // Keep the mask as a full-width vector, but with integer elements.
  EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
  // Node produces {gathered vector, output chain}.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let BreakFalseDeps deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);

  // Cast mask to an integer type.
  Mask = DAG.getBitcast(MaskVT, Mask);

  MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);

  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
  SDValue Res =
      DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
  // Return {result, chain} as two merged values.
  return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
27098
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  // The scale operand must be an immediate.
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
                                        TLI.getPointerTy(DAG.getDataLayout()));
  // The i1 mask covers the narrower of the index and data vectors.
  unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
  MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);

  // We support two versions of the gather intrinsics. One with scalar mask and
  // one with vXi1 mask. Convert scalar to vXi1 if necessary.
  if (Mask.getValueType() != MaskVT)
    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  // Node produces {gathered vector, output chain}.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let BreakFalseDeps deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);

  MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);

  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
  SDValue Res =
      DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
  // Return {result, chain} as two merged values.
  return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
27136
27137static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27138 SDValue Src, SDValue Mask, SDValue Base,
27139 SDValue Index, SDValue ScaleOp, SDValue Chain,
27140 const X86Subtarget &Subtarget) {
27141 SDLoc dl(Op);
27142 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27143 // Scale must be constant.
27144 if (!C)
27145 return SDValue();
27146 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27147 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27148 TLI.getPointerTy(DAG.getDataLayout()));
27149 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27150 Src.getSimpleValueType().getVectorNumElements());
27151 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27152
27153 // We support two versions of the scatter intrinsics. One with scalar mask and
27154 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27155 if (Mask.getValueType() != MaskVT)
27156 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27157
27158 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27159
27160 SDVTList VTs = DAG.getVTList(MVT::Other);
27161 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27162 SDValue Res =
27163 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27164 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27165 return Res;
27166}
27167
27168static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27169 SDValue Mask, SDValue Base, SDValue Index,
27170 SDValue ScaleOp, SDValue Chain,
27171 const X86Subtarget &Subtarget) {
27172 SDLoc dl(Op);
27173 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27174 // Scale must be constant.
27175 if (!C)
27176 return SDValue();
27177 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27178 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27179 TLI.getPointerTy(DAG.getDataLayout()));
27180 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27181 SDValue Segment = DAG.getRegister(0, MVT::i32);
27182 MVT MaskVT =
27183 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27184 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27185 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27186 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27187 return SDValue(Res, 0);
27188}
27189
/// Handles the lowering of builtin intrinsics with chain that return their
/// value into registers EDX:EAX.
/// If operand SrcReg is a valid register identifier, then operand 2 of N is
/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
/// TargetOpcode.
/// Returns a Glue value which can be used to add an extra copy-from-reg if
/// the expanded intrinsic implicitly defines extra registers (i.e. not just
/// EDX:EAX).
                                           SelectionDAG &DAG,
                                           unsigned TargetOpcode,
                                           unsigned SrcReg,
                                           const X86Subtarget &Subtarget,
  SDValue Chain = N->getOperand(0);
  SDValue Glue;

  // Optionally pre-load SrcReg with operand 2. The machine opcode reads the
  // register implicitly, so the dependency is threaded through via glue.
  if (SrcReg) {
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
    Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
    Glue = Chain.getValue(1);
  }

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue N1Ops[] = {Chain, Glue};
  // Only pass the glue operand if the copy-to-reg above produced one.
  SDNode *N1 = DAG.getMachineNode(
      TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
  Chain = SDValue(N1, 0);

  // Read the result back from EDX:EAX (RAX/RDX in 64-bit mode, where the
  // halves are read as full 64-bit registers).
  SDValue LO, HI;
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);
  Glue = HI.getValue(2);

  if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return Glue;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
  return Glue;
}
27249
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
  // and the EAX register is loaded with the low-order 32 bits.
  SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
                                             /* NoRegister */0, Subtarget,
                                             Results);
  // For plain RDTSC (and READCYCLECOUNTER) we are done: Results now holds
  // {value, chain}.
  if (Opcode != X86::RDTSCP)
    return;

  SDValue Chain = Results[1];
  // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
  // the ECX register. Add 'ecx' explicitly to the chain.
  SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
  // Results becomes {value, ecx, chain}.
  Results[1] = ecx;
  Results.push_back(ecx.getValue(1));
}
27273
                                     SelectionDAG &DAG) {
  SDLoc DL(Op);
  // READCYCLECOUNTER is lowered via RDTSC; the helper fills Results with
  // {value, chain}, which we merge into a single node.
  getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}
27282
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  // The intrinsic is only meaningful inside functions using WinEH.
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  // Record the registration node's slot for later WinEH frame lowering.
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}
27300
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  // The intrinsic is only meaningful inside functions using WinEH.
  if (!EHInfo)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  // Record the guard slot for later WinEH frame lowering.
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}
27318
27319/// Emit Truncating Store with signed or unsigned saturation.
27320static SDValue
27321EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27322 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27323 SelectionDAG &DAG) {
27324 SDVTList VTs = DAG.getVTList(MVT::Other);
27325 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27326 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27327 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27328 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27329}
27330
27331/// Emit Masked Truncating Store with signed or unsigned saturation.
27332static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27333 const SDLoc &DL,
27334 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27335 MachineMemOperand *MMO, SelectionDAG &DAG) {
27336 SDVTList VTs = DAG.getVTList(MVT::Other);
27337 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27338 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27339 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27340}
27341
                                             const MachineFunction &MF) {
  // Extended Swift async frames are only implemented for 64-bit targets.
  if (!Subtarget.is64Bit())
    return false;
  // 64-bit targets support extended Swift async frame setup,
  // except for targets that use the windows 64 prologue.
  return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
}
27350
27352 SelectionDAG &DAG) {
27353 unsigned IntNo = Op.getConstantOperandVal(1);
27354 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27355 if (!IntrData) {
27356 switch (IntNo) {
27357
27358 case Intrinsic::swift_async_context_addr: {
27359 SDLoc dl(Op);
27360 auto &MF = DAG.getMachineFunction();
27361 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27362 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27364 X86FI->setHasSwiftAsyncContext(true);
27365 SDValue Chain = Op->getOperand(0);
27366 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27367 SDValue Result =
27368 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27369 DAG.getTargetConstant(8, dl, MVT::i32)),
27370 0);
27371 // Return { result, chain }.
27372 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27373 CopyRBP.getValue(1));
27374 } else {
27375 // No special extended frame, create or reuse an existing stack slot.
27376 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27377 if (!X86FI->getSwiftAsyncContextFrameIdx())
27378 X86FI->setSwiftAsyncContextFrameIdx(
27379 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27380 false));
27381 SDValue Result =
27382 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27383 PtrSize == 8 ? MVT::i64 : MVT::i32);
27384 // Return { result, chain }.
27385 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27386 Op->getOperand(0));
27387 }
27388 }
27389
27390 case llvm::Intrinsic::x86_seh_ehregnode:
27391 return MarkEHRegistrationNode(Op, DAG);
27392 case llvm::Intrinsic::x86_seh_ehguard:
27393 return MarkEHGuard(Op, DAG);
27394 case llvm::Intrinsic::x86_rdpkru: {
27395 SDLoc dl(Op);
27396 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27397 // Create a RDPKRU node and pass 0 to the ECX parameter.
27398 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27399 DAG.getConstant(0, dl, MVT::i32));
27400 }
27401 case llvm::Intrinsic::x86_wrpkru: {
27402 SDLoc dl(Op);
27403 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27404 // to the EDX and ECX parameters.
27405 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27406 Op.getOperand(0), Op.getOperand(2),
27407 DAG.getConstant(0, dl, MVT::i32),
27408 DAG.getConstant(0, dl, MVT::i32));
27409 }
27410 case llvm::Intrinsic::asan_check_memaccess: {
27411 // Mark this as adjustsStack because it will be lowered to a call.
27413 // Don't do anything here, we will expand these intrinsics out later.
27414 return Op;
27415 }
27416 case llvm::Intrinsic::x86_flags_read_u32:
27417 case llvm::Intrinsic::x86_flags_read_u64:
27418 case llvm::Intrinsic::x86_flags_write_u32:
27419 case llvm::Intrinsic::x86_flags_write_u64: {
27420 // We need a frame pointer because this will get lowered to a PUSH/POP
27421 // sequence.
27424 // Don't do anything here, we will expand these intrinsics out later
27425 // during FinalizeISel in EmitInstrWithCustomInserter.
27426 return Op;
27427 }
27428 case Intrinsic::x86_lwpins32:
27429 case Intrinsic::x86_lwpins64:
27430 case Intrinsic::x86_umwait:
27431 case Intrinsic::x86_tpause: {
27432 SDLoc dl(Op);
27433 SDValue Chain = Op->getOperand(0);
27434 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27435 unsigned Opcode;
27436
27437 switch (IntNo) {
27438 default: llvm_unreachable("Impossible intrinsic");
27439 case Intrinsic::x86_umwait:
27440 Opcode = X86ISD::UMWAIT;
27441 break;
27442 case Intrinsic::x86_tpause:
27443 Opcode = X86ISD::TPAUSE;
27444 break;
27445 case Intrinsic::x86_lwpins32:
27446 case Intrinsic::x86_lwpins64:
27447 Opcode = X86ISD::LWPINS;
27448 break;
27449 }
27450
27452 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27453 Op->getOperand(3), Op->getOperand(4));
27454 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27455 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27456 Operation.getValue(1));
27457 }
27458 case Intrinsic::x86_enqcmd:
27459 case Intrinsic::x86_enqcmds: {
27460 SDLoc dl(Op);
27461 SDValue Chain = Op.getOperand(0);
27462 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27463 unsigned Opcode;
27464 switch (IntNo) {
27465 default: llvm_unreachable("Impossible intrinsic!");
27466 case Intrinsic::x86_enqcmd:
27467 Opcode = X86ISD::ENQCMD;
27468 break;
27469 case Intrinsic::x86_enqcmds:
27470 Opcode = X86ISD::ENQCMDS;
27471 break;
27472 }
27473 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27474 Op.getOperand(3));
27475 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27476 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27477 Operation.getValue(1));
27478 }
27479 case Intrinsic::x86_aesenc128kl:
27480 case Intrinsic::x86_aesdec128kl:
27481 case Intrinsic::x86_aesenc256kl:
27482 case Intrinsic::x86_aesdec256kl: {
27483 SDLoc DL(Op);
27484 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27485 SDValue Chain = Op.getOperand(0);
27486 unsigned Opcode;
27487
27488 switch (IntNo) {
27489 default: llvm_unreachable("Impossible intrinsic");
27490 case Intrinsic::x86_aesenc128kl:
27491 Opcode = X86ISD::AESENC128KL;
27492 break;
27493 case Intrinsic::x86_aesdec128kl:
27494 Opcode = X86ISD::AESDEC128KL;
27495 break;
27496 case Intrinsic::x86_aesenc256kl:
27497 Opcode = X86ISD::AESENC256KL;
27498 break;
27499 case Intrinsic::x86_aesdec256kl:
27500 Opcode = X86ISD::AESDEC256KL;
27501 break;
27502 }
27503
27504 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27505 MachineMemOperand *MMO = MemIntr->getMemOperand();
27506 EVT MemVT = MemIntr->getMemoryVT();
27508 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27509 MMO);
27510 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27511
27512 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27513 {ZF, Operation.getValue(0), Operation.getValue(2)});
27514 }
27515 case Intrinsic::x86_aesencwide128kl:
27516 case Intrinsic::x86_aesdecwide128kl:
27517 case Intrinsic::x86_aesencwide256kl:
27518 case Intrinsic::x86_aesdecwide256kl: {
27519 SDLoc DL(Op);
27520 SDVTList VTs = DAG.getVTList(
27521 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27522 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27523 SDValue Chain = Op.getOperand(0);
27524 unsigned Opcode;
27525
27526 switch (IntNo) {
27527 default: llvm_unreachable("Impossible intrinsic");
27528 case Intrinsic::x86_aesencwide128kl:
27529 Opcode = X86ISD::AESENCWIDE128KL;
27530 break;
27531 case Intrinsic::x86_aesdecwide128kl:
27532 Opcode = X86ISD::AESDECWIDE128KL;
27533 break;
27534 case Intrinsic::x86_aesencwide256kl:
27535 Opcode = X86ISD::AESENCWIDE256KL;
27536 break;
27537 case Intrinsic::x86_aesdecwide256kl:
27538 Opcode = X86ISD::AESDECWIDE256KL;
27539 break;
27540 }
27541
27542 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27543 MachineMemOperand *MMO = MemIntr->getMemOperand();
27544 EVT MemVT = MemIntr->getMemoryVT();
27546 Opcode, DL, VTs,
27547 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27548 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27549 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27550 MemVT, MMO);
27551 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27552
27553 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27554 {ZF, Operation.getValue(1), Operation.getValue(2),
27555 Operation.getValue(3), Operation.getValue(4),
27556 Operation.getValue(5), Operation.getValue(6),
27557 Operation.getValue(7), Operation.getValue(8),
27558 Operation.getValue(9)});
27559 }
27560 case Intrinsic::x86_testui: {
27561 SDLoc dl(Op);
27562 SDValue Chain = Op.getOperand(0);
27563 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27564 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27565 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27566 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27567 Operation.getValue(1));
27568 }
27569 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27570 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27571 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27572 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27573 case Intrinsic::x86_t2rpntlvwz0_internal:
27574 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27575 case Intrinsic::x86_t2rpntlvwz1_internal:
27576 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27577 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27579 unsigned IntNo = Op.getConstantOperandVal(1);
27580 unsigned Opc = 0;
27581 switch (IntNo) {
27582 default:
27583 llvm_unreachable("Unexpected intrinsic!");
27584 case Intrinsic::x86_t2rpntlvwz0_internal:
27585 Opc = X86::PT2RPNTLVWZ0V;
27586 break;
27587 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27588 Opc = X86::PT2RPNTLVWZ0T1V;
27589 break;
27590 case Intrinsic::x86_t2rpntlvwz1_internal:
27591 Opc = X86::PT2RPNTLVWZ1V;
27592 break;
27593 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27594 Opc = X86::PT2RPNTLVWZ1T1V;
27595 break;
27596 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27597 Opc = X86::PT2RPNTLVWZ0RSV;
27598 break;
27599 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27600 Opc = X86::PT2RPNTLVWZ0RST1V;
27601 break;
27602 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27603 Opc = X86::PT2RPNTLVWZ1RSV;
27604 break;
27605 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27606 Opc = X86::PT2RPNTLVWZ1RST1V;
27607 break;
27608 }
27609
27610 SDLoc DL(Op);
27611 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27612
27613 SDValue Ops[] = {Op.getOperand(2), // Row
27614 Op.getOperand(3), // Col0
27615 Op.getOperand(4), // Col1
27616 Op.getOperand(5), // Base
27617 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27618 Op.getOperand(6), // Index
27619 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27620 DAG.getRegister(0, MVT::i16), // Segment
27621 Op.getOperand(0)}; // Chain
27622
27623 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27624 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27625 SDValue(Res, 0));
27626 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27627 SDValue(Res, 0));
27628 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27629 }
27630 case Intrinsic::x86_atomic_bts_rm:
27631 case Intrinsic::x86_atomic_btc_rm:
27632 case Intrinsic::x86_atomic_btr_rm: {
27633 SDLoc DL(Op);
27634 MVT VT = Op.getSimpleValueType();
27635 SDValue Chain = Op.getOperand(0);
27636 SDValue Op1 = Op.getOperand(2);
27637 SDValue Op2 = Op.getOperand(3);
27638 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27639 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27641 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27642 SDValue Res =
27643 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27644 {Chain, Op1, Op2}, VT, MMO);
27645 Chain = Res.getValue(1);
27646 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27647 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27648 }
27649 case Intrinsic::x86_atomic_bts:
27650 case Intrinsic::x86_atomic_btc:
27651 case Intrinsic::x86_atomic_btr: {
27652 SDLoc DL(Op);
27653 MVT VT = Op.getSimpleValueType();
27654 SDValue Chain = Op.getOperand(0);
27655 SDValue Op1 = Op.getOperand(2);
27656 SDValue Op2 = Op.getOperand(3);
27657 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
27658 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
27659 : X86ISD::LBTR;
27660 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
27661 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27662 SDValue Res =
27663 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27664 {Chain, Op1, Op2, Size}, VT, MMO);
27665 Chain = Res.getValue(1);
27666 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27667 unsigned Imm = Op2->getAsZExtVal();
27668 if (Imm)
27669 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
27670 DAG.getShiftAmountConstant(Imm, VT, DL));
27671 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27672 }
27673 case Intrinsic::x86_cmpccxadd32:
27674 case Intrinsic::x86_cmpccxadd64: {
27675 SDLoc DL(Op);
27676 SDValue Chain = Op.getOperand(0);
27677 SDValue Addr = Op.getOperand(2);
27678 SDValue Src1 = Op.getOperand(3);
27679 SDValue Src2 = Op.getOperand(4);
27680 SDValue CC = Op.getOperand(5);
27681 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27683 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27684 MVT::i32, MMO);
27685 return Operation;
27686 }
27687 case Intrinsic::x86_aadd32:
27688 case Intrinsic::x86_aadd64:
27689 case Intrinsic::x86_aand32:
27690 case Intrinsic::x86_aand64:
27691 case Intrinsic::x86_aor32:
27692 case Intrinsic::x86_aor64:
27693 case Intrinsic::x86_axor32:
27694 case Intrinsic::x86_axor64: {
27695 SDLoc DL(Op);
27696 SDValue Chain = Op.getOperand(0);
27697 SDValue Op1 = Op.getOperand(2);
27698 SDValue Op2 = Op.getOperand(3);
27699 MVT VT = Op2.getSimpleValueType();
27700 unsigned Opc = 0;
27701 switch (IntNo) {
27702 default:
27703 llvm_unreachable("Unknown Intrinsic");
27704 case Intrinsic::x86_aadd32:
27705 case Intrinsic::x86_aadd64:
27706 Opc = X86ISD::AADD;
27707 break;
27708 case Intrinsic::x86_aand32:
27709 case Intrinsic::x86_aand64:
27710 Opc = X86ISD::AAND;
27711 break;
27712 case Intrinsic::x86_aor32:
27713 case Intrinsic::x86_aor64:
27714 Opc = X86ISD::AOR;
27715 break;
27716 case Intrinsic::x86_axor32:
27717 case Intrinsic::x86_axor64:
27718 Opc = X86ISD::AXOR;
27719 break;
27720 }
27721 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27722 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27723 {Chain, Op1, Op2}, VT, MMO);
27724 }
27725 case Intrinsic::x86_atomic_add_cc:
27726 case Intrinsic::x86_atomic_sub_cc:
27727 case Intrinsic::x86_atomic_or_cc:
27728 case Intrinsic::x86_atomic_and_cc:
27729 case Intrinsic::x86_atomic_xor_cc: {
27730 SDLoc DL(Op);
27731 SDValue Chain = Op.getOperand(0);
27732 SDValue Op1 = Op.getOperand(2);
27733 SDValue Op2 = Op.getOperand(3);
27734 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
27735 MVT VT = Op2.getSimpleValueType();
27736 unsigned Opc = 0;
27737 switch (IntNo) {
27738 default:
27739 llvm_unreachable("Unknown Intrinsic");
27740 case Intrinsic::x86_atomic_add_cc:
27741 Opc = X86ISD::LADD;
27742 break;
27743 case Intrinsic::x86_atomic_sub_cc:
27744 Opc = X86ISD::LSUB;
27745 break;
27746 case Intrinsic::x86_atomic_or_cc:
27747 Opc = X86ISD::LOR;
27748 break;
27749 case Intrinsic::x86_atomic_and_cc:
27750 Opc = X86ISD::LAND;
27751 break;
27752 case Intrinsic::x86_atomic_xor_cc:
27753 Opc = X86ISD::LXOR;
27754 break;
27755 }
27756 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27757 SDValue LockArith =
27758 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27759 {Chain, Op1, Op2}, VT, MMO);
27760 Chain = LockArith.getValue(1);
27761 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
27762 }
27763 }
27764 return SDValue();
27765 }
27766
27767 SDLoc dl(Op);
27768 switch(IntrData->Type) {
27769 default: llvm_unreachable("Unknown Intrinsic Type");
27770 case RDSEED:
27771 case RDRAND: {
27772 // Emit the node with the right value type.
27773 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27774 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27775
27776 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27777 // Otherwise return the value from Rand, which is always 0, casted to i32.
27778 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27779 DAG.getConstant(1, dl, Op->getValueType(1)),
27780 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27781 SDValue(Result.getNode(), 1)};
27782 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27783
27784 // Return { result, isValid, chain }.
27785 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27786 SDValue(Result.getNode(), 2));
27787 }
27788 case GATHER_AVX2: {
27789 SDValue Chain = Op.getOperand(0);
27790 SDValue Src = Op.getOperand(2);
27791 SDValue Base = Op.getOperand(3);
27792 SDValue Index = Op.getOperand(4);
27793 SDValue Mask = Op.getOperand(5);
27794 SDValue Scale = Op.getOperand(6);
27795 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27796 Scale, Chain, Subtarget);
27797 }
27798 case GATHER: {
27799 //gather(v1, mask, index, base, scale);
27800 SDValue Chain = Op.getOperand(0);
27801 SDValue Src = Op.getOperand(2);
27802 SDValue Base = Op.getOperand(3);
27803 SDValue Index = Op.getOperand(4);
27804 SDValue Mask = Op.getOperand(5);
27805 SDValue Scale = Op.getOperand(6);
27806 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27807 Chain, Subtarget);
27808 }
27809 case SCATTER: {
27810 //scatter(base, mask, index, v1, scale);
27811 SDValue Chain = Op.getOperand(0);
27812 SDValue Base = Op.getOperand(2);
27813 SDValue Mask = Op.getOperand(3);
27814 SDValue Index = Op.getOperand(4);
27815 SDValue Src = Op.getOperand(5);
27816 SDValue Scale = Op.getOperand(6);
27817 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27818 Scale, Chain, Subtarget);
27819 }
27820 case PREFETCH: {
27821 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27822 assert((HintVal == 2 || HintVal == 3) &&
27823 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27824 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27825 SDValue Chain = Op.getOperand(0);
27826 SDValue Mask = Op.getOperand(2);
27827 SDValue Index = Op.getOperand(3);
27828 SDValue Base = Op.getOperand(4);
27829 SDValue Scale = Op.getOperand(5);
27830 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27831 Subtarget);
27832 }
27833 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27834 case RDTSC: {
27836 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27837 Results);
27838 return DAG.getMergeValues(Results, dl);
27839 }
27840 // Read Performance Monitoring Counters.
27841 case RDPMC:
27842 // Read Processor Register.
27843 case RDPRU:
27844 // GetExtended Control Register.
27845 case XGETBV: {
27847
27848 // RDPMC uses ECX to select the index of the performance counter to read.
27849 // RDPRU uses ECX to select the processor register to read.
27850 // XGETBV uses ECX to select the index of the XCR register to return.
27851 // The result is stored into registers EDX:EAX.
27852 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27853 Subtarget, Results);
27854 return DAG.getMergeValues(Results, dl);
27855 }
27856 // XTEST intrinsics.
27857 case XTEST: {
27858 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27859 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27860
27861 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27862 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27863 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27864 Ret, SDValue(InTrans.getNode(), 1));
27865 }
27868 case TRUNCATE_TO_MEM_VI32: {
27869 SDValue Mask = Op.getOperand(4);
27870 SDValue DataToTruncate = Op.getOperand(3);
27871 SDValue Addr = Op.getOperand(2);
27872 SDValue Chain = Op.getOperand(0);
27873
27874 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27875 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27876
27877 EVT MemVT = MemIntr->getMemoryVT();
27878
27879 uint16_t TruncationOp = IntrData->Opc0;
27880 switch (TruncationOp) {
27881 case X86ISD::VTRUNC: {
27882 if (isAllOnesConstant(Mask)) // return just a truncate store
27883 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27884 MemIntr->getMemOperand());
27885
27886 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27887 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27888 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27889
27890 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27891 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27892 true /* truncating */);
27893 }
27894 case X86ISD::VTRUNCUS:
27895 case X86ISD::VTRUNCS: {
27896 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27897 if (isAllOnesConstant(Mask))
27898 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27899 MemIntr->getMemOperand(), DAG);
27900
27901 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27902 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27903
27904 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27905 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27906 }
27907 default:
27908 llvm_unreachable("Unsupported truncstore intrinsic");
27909 }
27910 }
27911 case INTR_TYPE_CAST_MMX:
27912 return SDValue(); // handled in combineINTRINSIC_*
27913 }
27914}
27915
// Lower ISD::RETURNADDR: produce the return address of the current frame
// (Depth == 0) or walk the frame-pointer chain for an outer frame
// (Depth > 0).
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MFI.setReturnAddressIsTaken(true);

    return SDValue();

  // Operand 0 is the constant frame depth requested by the intrinsic.
  unsigned Depth = Op.getConstantOperandVal(0);
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    // For parent frames: the return address sits one pointer slot above the
    // frame address computed by LowerFRAMEADDR, so load from FrameAddr +
    // SlotSize.
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
}
27942
// Lower ISD::ADDROFRETURNADDR: return the address of the stack slot that
// holds the return address (not the return address value itself).
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  return getReturnAddressFrameIndex(DAG);
}
27948
// Lower ISD::FRAMEADDR. On Windows-CFI targets a fixed frame object is
// returned instead of crawling the stack; elsewhere the frame pointer is
// read and, for Depth > 0, dereferenced repeatedly.
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      // The index is cached in FuncInfo so repeated queries reuse one object.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = Op.getConstantOperandVal(0);
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  // Each saved frame pointer is the first slot of its frame, so one load per
  // level of depth follows the chain outward.
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
  return FrameAddr;
}
27986
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
// Resolve a named-register string (read_register/write_register style) to an
// X86 physical register. Only esp/rsp, ebp/rbp, r14 and r15 are accepted;
// the frame pointer is additionally rejected when the function has no FP.
                                     const MachineFunction &MF) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

      .Case("esp", X86::ESP)
      .Case("rsp", X86::RSP)
      .Case("ebp", X86::EBP)
      .Case("rbp", X86::RBP)
      .Case("r14", X86::R14)
      .Case("r15", X86::R15)
      .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    // Naming the frame pointer is only meaningful when the function actually
    // keeps one; otherwise the register is allocatable and the request fails.
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}
28021
28022SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28023 SelectionDAG &DAG) const {
28024 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28025 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28026}
28027
// Pick the register carrying the exception pointer for the given EH
// personality: RDX/EDX for CoreCLR, RAX/EAX otherwise (64-bit register when
// targeting LP64).
                              const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
28035
// Pick the register carrying the exception selector: NoRegister for funclet
// personalities, otherwise RDX/EDX.
                              const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
    return X86::NoRegister;
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}
28043
  // Catch objects need fixed frame offsets exactly on Win64 targets.
  return Subtarget.isTargetWin64();
}
28047
28048SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28049 SDValue Chain = Op.getOperand(0);
28050 SDValue Offset = Op.getOperand(1);
28051 SDValue Handler = Op.getOperand(2);
28052 SDLoc dl (Op);
28053
28054 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28055 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28056 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28057 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28058 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28059 "Invalid Frame Register!");
28060 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28061 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28062
28063 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28064 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28065 dl));
28066 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28067 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28068 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28069
28070 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28071 DAG.getRegister(StoreAddrReg, PtrVT));
28072}
28073
28074SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28075 SelectionDAG &DAG) const {
28076 SDLoc DL(Op);
28077 // If the subtarget is not 64bit, we may need the global base reg
28078 // after isel expand pseudo, i.e., after CGBR pass ran.
28079 // Therefore, ask for the GlobalBaseReg now, so that the pass
28080 // inserts the code for us in case we need it.
28081 // Otherwise, we will end up in a situation where we will
28082 // reference a virtual register that is not defined!
28083 if (!Subtarget.is64Bit()) {
28084 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28085 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28086 }
28087 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28088 DAG.getVTList(MVT::i32, MVT::Other),
28089 Op.getOperand(0), Op.getOperand(1));
28090}
28091
28092SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28093 SelectionDAG &DAG) const {
28094 SDLoc DL(Op);
28095 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28096 Op.getOperand(0), Op.getOperand(1));
28097}
28098
28099SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28100 SelectionDAG &DAG) const {
28101 SDLoc DL(Op);
28102 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28103 Op.getOperand(0));
28104}
28105
  // No adjustment is needed on x86: the trampoline pointer is returned
  // unchanged. (NOTE(review): signature line lost in extraction — presumably
  // ADJUST_TRAMPOLINE lowering; confirm against upstream.)
  return Op.getOperand(0);
}
28109
// Lower ISD::INIT_TRAMPOLINE: write out the machine code of a trampoline
// that loads the 'nest' parameter into its dedicated register and jumps to
// the nested function. Emits a series of stores into the memory addressed by
// operand 1 and returns a TokenFactor of their chains.
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl (Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    // Low 3 bits of the register encodings, used to build opcode/ModRM bytes.
    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    // 8-byte immediate (the function pointer) at offset 2.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2), Align(2));

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    // 8-byte immediate (the nest value) at offset 12.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12), Align(2));

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    // ModRM byte selecting register-direct jmp through r11.
    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 0;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
            const DataLayout &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::Fast:
    case CallingConv::Tail:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    // Relative displacement for the trailing jmp: target minus the address
    // of the byte following the 10-byte trampoline body.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
      DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                   Trmp, MachinePointerInfo(TrmpAddr));

    // 4-byte immediate (the nest value) at offset 1.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1), Align(1));

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] =
      DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
                   MachinePointerInfo(TrmpAddr, 5), Align(1));

    // 4-byte relative displacement at offset 6.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6), Align(1));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
28254
// Lower ISD::GET_ROUNDING: spill the x87 FP control word to the stack,
// extract the rounding-mode field (FPSR bits 11:10), and translate it to the
// generic GET_ROUNDING encoding via a packed 8-bit lookup table.
SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  GET_ROUNDING, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we use a packed lookup table of the four 2-bit
  values that we can index by FPSP[11:10]
    0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]

    (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
  */

  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));


  SDValue Chain = Op.getOperand(0);
  SDValue Ops[] = {Chain, StackSlot};
      DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
  Chain = CWD.getValue(1);

  // Mask and turn the control bits into a shift for the lookup table.
  // Shift by 9 (not 10) so the 2-bit field arrives pre-multiplied by 2,
  // matching the 2-bit stride of the packed table.
  SDValue Shift =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
                  DAG.getConstant(9, DL, MVT::i8));
  Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);

  // Index the packed table and keep the low two bits of the result.
  SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i32,
                  DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
                  DAG.getConstant(3, DL, MVT::i32));

  RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);

  return DAG.getMergeValues({RetVal, Chain}, DL);
}
28318
/// Lower ISD::SET_ROUNDING: change the dynamic FP rounding mode at run time.
///
/// The x87 control word can only be written from memory, so a 4-byte stack
/// slot is allocated for an FNSTCW / modify / FLDCW sequence; the rounding
/// mode occupies control-word bits 11:10. If SSE is available, the same
/// update is applied to MXCSR (rounding mode in bits 14:13) via
/// stmxcsr/ldmxcsr on the same stack slot.
28319SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28320                                             SelectionDAG &DAG) const {
28322  SDLoc DL(Op);
28323  SDValue Chain = Op.getNode()->getOperand(0);
28324
28325  // FP control word may be set only from data in memory. So we need to allocate
28326  // stack space to save/load FP control word.
28327  int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28328  SDValue StackSlot =
28329      DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28331  MachineMemOperand *MMO =
28333
28334  // Store FP control word into memory.
28335  SDValue Ops[] = {Chain, StackSlot};
28336  Chain = DAG.getMemIntrinsicNode(
28337      X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28338
28339  // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28340  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28341  Chain = CWD.getValue(1);
28342  CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28343                    DAG.getConstant(0xf3ff, DL, MVT::i16));
28344
28345  // Calculate new rounding mode.
28346  SDValue NewRM = Op.getNode()->getOperand(1);
28347  SDValue RMBits;
28348  if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
    // Constant rounding mode: translate the llvm::RoundingMode value
    // directly to the x87 RM-field encoding.
28349    uint64_t RM = CVal->getZExtValue();
28350    int FieldVal;
28351    switch (static_cast<RoundingMode>(RM)) {
28352    // clang-format off
28353    case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28354    case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28355    case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28356    case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28357    default:
28358      llvm_unreachable("rounding mode is not supported by X86 hardware");
28359    // clang-format on
28360    }
28361    RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28362  } else {
28363    // Need to convert argument into bits of control word:
28364    //    0 Round to 0       -> 11
28365    //    1 Round to nearest -> 00
28366    //    2 Round to +inf    -> 10
28367    //    3 Round to -inf    -> 01
28368    // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
28369    // To make the conversion, put all these values into a value 0xc9 and shift
28370    // it left depending on the rounding mode:
28371    //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28372    //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
28373    //    ...
28374    // (0xc9 << (2 * NewRM + 4)) & 0xc00
    // Compute the shift amount (2 * NewRM + 4) as an i8.
28375    SDValue ShiftValue =
28376        DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28377                    DAG.getNode(ISD::ADD, DL, MVT::i32,
28378                                DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28379                                            DAG.getConstant(1, DL, MVT::i8)),
28380                                DAG.getConstant(4, DL, MVT::i32)));
28381    SDValue Shifted =
28382        DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28383                    ShiftValue);
28384    RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28385                         DAG.getConstant(0xc00, DL, MVT::i16));
28386  }
28387
28388  // Update rounding mode bits and store the new FP Control Word into stack.
28389  CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28390  Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28391
28392  // Load FP control word from the slot.
28393  SDValue OpsLD[] = {Chain, StackSlot};
28394  MachineMemOperand *MMOL =
28396  Chain = DAG.getMemIntrinsicNode(
28397      X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28398
28399  // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28400  // same way but in bits 14:13.
28401  if (Subtarget.hasSSE1()) {
28402    // Store MXCSR into memory.
28403    Chain = DAG.getNode(
28404        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28405        DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28406        StackSlot);
28407
28408    // Load MXCSR from stack slot and clear RM field (bits 14:13).
28409    SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28410    Chain = CWD.getValue(1);
28411    CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28412                      DAG.getConstant(0xffff9fff, DL, MVT::i32));
28413
28414    // Shift X87 RM bits from 11:10 to 14:13.
28415    RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28416    RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28417                         DAG.getConstant(3, DL, MVT::i8));
28418
28419    // Update rounding mode bits and store the new FP Control Word into stack.
28420    CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28421    Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28422
28423    // Load MXCSR from the slot.
28424    Chain = DAG.getNode(
28425        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28426        DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28427        StackSlot);
28428  }
28429
28430  return Chain;
28431}
28432
// Size in bytes of the x87 FPU environment image used by FNSTENV/FLDENV.
28433const unsigned X87StateSize = 28;
// Size in bytes of the full FP environment: the x87 image plus MXCSR
// (MXCSR is stored immediately after the 28-byte x87 state).
28434const unsigned FPStateSize = 32;
28435[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28436
/// Lower ISD::GET_FPENV_MEM: write the current FP environment to the memory
/// pointed to by the node's pointer operand — the 28-byte x87 image via
/// FNSTENV (when x87 is available), followed by MXCSR at offset X87StateSize
/// (when SSE is available).
28437SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28438                                              SelectionDAG &DAG) const {
28440  SDLoc DL(Op);
28441  SDValue Chain = Op->getOperand(0);
28442  SDValue Ptr = Op->getOperand(1);
28443  auto *Node = cast<FPStateAccessSDNode>(Op);
28444  EVT MemVT = Node->getMemoryVT();
28446  MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28447
28448  // Get x87 state, if it presents.
28449  if (Subtarget.hasX87()) {
28450    Chain =
28451        DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28452                                {Chain, Ptr}, MemVT, MMO);
28453
28454    // FNSTENV changes the exception mask, so load back the stored environment.
    // The reload uses a load-flavored copy of the original memory operand.
28455    MachineMemOperand::Flags NewFlags =
28457        (MMO->getFlags() & ~MachineMemOperand::MOStore);
28458    MMO = MF.getMachineMemOperand(MMO, NewFlags);
28459    Chain =
28460        DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28461                                {Chain, Ptr}, MemVT, MMO);
28462  }
28463
28464  // If target supports SSE, get MXCSR as well.
28465  if (Subtarget.hasSSE1()) {
28466    // Get pointer to the MXCSR location in memory.
28468    SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28469                                    DAG.getConstant(X87StateSize, DL, PtrVT));
28470    // Store MXCSR into memory.
28471    Chain = DAG.getNode(
28472        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28473        DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28474        MXCSRAddr);
28475  }
28476
28477  return Chain;
28478}
28479
28481                                 EVT MemVT, MachineMemOperand *MMO,
28482                                 SelectionDAG &DAG,
28483                                 const X86Subtarget &Subtarget) {
  // Shared helper for SET_FPENV_MEM and RESET_FPENV lowering: reload the FP
  // environment from memory at Ptr — the x87 image via FLDENV, then MXCSR
  // from offset X87StateSize via ldmxcsr. Returns the updated chain.
28484  // Set x87 state, if it presents.
28485  if (Subtarget.hasX87())
28486    Chain =
28487        DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28488                                {Chain, Ptr}, MemVT, MMO);
28489  // If target supports SSE, set MXCSR as well.
28490  if (Subtarget.hasSSE1()) {
28491    // Get pointer to the MXCSR location in memory.
28493    SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28494                                    DAG.getConstant(X87StateSize, DL, PtrVT));
28495    // Load MXCSR from memory.
28496    Chain = DAG.getNode(
28497        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28498        DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28499        MXCSRAddr);
28500  }
28501  return Chain;
28502}
28503
/// Lower ISD::SET_FPENV_MEM: restore the FP environment (x87 state and, with
/// SSE, MXCSR) from the memory pointed to by the node's pointer operand.
/// All the real work is done by createSetFPEnvNodes.
28504SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28505                                              SelectionDAG &DAG) const {
28506  SDLoc DL(Op);
28507  SDValue Chain = Op->getOperand(0);
28508  SDValue Ptr = Op->getOperand(1);
28509  auto *Node = cast<FPStateAccessSDNode>(Op);
28510  EVT MemVT = Node->getMemoryVT();
28512  MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28513  return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28514}
28515
/// Lower ISD::RESET_FPENV: restore the default FP environment by loading a
/// constant-pool image of it (8 i32 words: x87 control word, zeroed x87
/// fields, then MXCSR) through createSetFPEnvNodes.
28516SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28517                                            SelectionDAG &DAG) const {
28519  SDLoc DL(Op);
28520  SDValue Chain = Op.getNode()->getOperand(0);
28521
28522  IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28523  ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28525
28526  // x87 FPU Control Word: mask all floating-point exceptions, sets rounding to
28527  // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28528  // for compatibility with glibc.
28529  unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28530  FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
  // The remaining six words of the x87 environment image are zero.
28531  Constant *Zero = ConstantInt::get(ItemTy, 0);
28532  for (unsigned I = 0; I < 6; ++I)
28533    FPEnvVals.push_back(Zero);
28534
28535  // MXCSR: mask all floating-point exceptions, sets rounding to nearest, clear
28536  // all exceptions, sets DAZ and FTZ to 0.
28537  FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28538  Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28540  SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28541  MachinePointerInfo MPI =
28545
28546  return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28547}
28548
28549// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28550uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28551 assert((Amt < 8) && "Shift/Rotation amount out of range");
28552 switch (Opcode) {
28553 case ISD::BITREVERSE:
28554 return 0x8040201008040201ULL;
28555 case ISD::SHL:
28556 return ((0x0102040810204080ULL >> (Amt)) &
28557 (0x0101010101010101ULL * (0xFF >> (Amt))));
28558 case ISD::SRL:
28559 return ((0x0102040810204080ULL << (Amt)) &
28560 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28561 case ISD::SRA:
28562 return (getGFNICtrlImm(ISD::SRL, Amt) |
28563 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28564 case ISD::ROTL:
28565 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28566 case ISD::ROTR:
28567 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28568 }
28569 llvm_unreachable("Unsupported GFNI opcode");
28570}
28571
28572// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28573SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28574 MVT VT, unsigned Amt = 0) {
28575 assert(VT.getVectorElementType() == MVT::i8 &&
28576 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28577 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28578 SmallVector<SDValue> MaskBits;
28579 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28580 uint64_t Bits = (Imm >> (I % 64)) & 255;
28581 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28582 }
28583 return DAG.getBuildVector(VT, DL, MaskBits);
28584}
28585
28586/// Lower a vector CTLZ using native supported vector CTLZ instruction.
28587//
28588// i8/i16 vector implemented using dword LZCNT vector instruction
28589// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28590// split the vector, perform operation on it's Lo a Hi part and
28591// concatenate the results.
28593                                       const X86Subtarget &Subtarget) {
28594  assert(Op.getOpcode() == ISD::CTLZ);
28595  SDLoc dl(Op);
28596  MVT VT = Op.getSimpleValueType();
28597  MVT EltVT = VT.getVectorElementType();
28598  unsigned NumElems = VT.getVectorNumElements();
28599
28600  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28601         "Unsupported element type");
28602
28603  // Split vector, it's Lo and Hi parts will be handled in next iteration.
28604  if (NumElems > 16 ||
28605      (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28606    return splitVectorIntUnary(Op, DAG, dl);
28607
28608  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28609  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28610         "Unsupported value type for operation");
28611
28612  // Use native supported vector instruction vplzcntd.
28613  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28614  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28615  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  // Zero-extending to i32 introduced (32 - EltBits) extra leading zeros;
  // subtract them so the count refers to the original element width.
28616  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28617
28618  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28619}
28620
28621// Lower CTLZ using a PSHUFB lookup table implementation.
28623                                       const X86Subtarget &Subtarget,
28624                                       SelectionDAG &DAG) {
  // Strategy: compute CTLZ per byte with two nibble LUT lookups, then widen
  // the per-byte counts up to the element width by repeatedly combining
  // adjacent halves (low half's count only matters when the high half is 0).
28625  MVT VT = Op.getSimpleValueType();
28626  int NumElts = VT.getVectorNumElements();
28627  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28628  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28629
28630  // Per-nibble leading zero PSHUFB lookup table.
  // Entry i holds the number of leading zeros of the 4-bit value i
  // (4 for i == 0).
28631  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28632                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28633                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28634                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28635
28637  for (int i = 0; i < NumBytes; ++i)
28638    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28639  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28640
28641  // Begin by bitcasting the input to byte vector, then split those bytes
28642  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28643  // If the hi input nibble is zero then we add both results together, otherwise
28644  // we just take the hi result (by masking the lo result to zero before the
28645  // add).
28646  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28647  SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28648
28649  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28650  SDValue Lo = Op0;
28651  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28652  SDValue HiZ;
28653  if (CurrVT.is512BitVector()) {
    // 512-bit byte compares produce a k-mask; sign-extend it back into a
    // vector of all-ones/all-zero bytes.
28654    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28655    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28656    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28657  } else {
28658    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28659  }
28660
28661  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
28662  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
28663  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
28664  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
28665
28666  // Merge result back from vXi8 back to VT, working on the lo/hi halves
28667  // of the current vector width in the same way we did for the nibbles.
28668  // If the upper half of the input element is zero then add the halves'
28669  // leading zero counts together, otherwise just use the upper half's.
28670  // Double the width of the result until we are at target width.
28671  while (CurrVT != VT) {
28672    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
28673    int CurrNumElts = CurrVT.getVectorNumElements();
28674    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
28675    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
28676    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
28677
28678    // Check if the upper half of the input element is zero.
28679    if (CurrVT.is512BitVector()) {
28680      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28681      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
28682                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28683      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28684    } else {
28685      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
28686                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28687    }
28688    HiZ = DAG.getBitcast(NextVT, HiZ);
28689
28690    // Move the upper/lower halves to the lower bits as we'll be extending to
28691    // NextVT. Mask the lower result to zero if HiZ is true and add the results
28692    // together.
28693    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
28694    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
28695    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
28696    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
28697    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
28698    CurrVT = NextVT;
28699  }
28700
28701  return Res;
28702}
28703
28705                               const X86Subtarget &Subtarget,
28706                               SelectionDAG &DAG) {
  // Dispatch vector CTLZ: prefer AVX-512 CDI's native lzcnt when usable,
  // split types that exceed the legal register width, and otherwise fall
  // back to the PSHUFB lookup-table implementation.
28707  MVT VT = Op.getSimpleValueType();
28708
28709  if (Subtarget.hasCDI() &&
28710      // vXi8 vectors need to be promoted to 512-bits for vXi32.
28711      (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28712    return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28713
28714  // Decompose 256-bit ops into smaller 128-bit ops.
28715  if (VT.is256BitVector() && !Subtarget.hasInt256())
28716    return splitVectorIntUnary(Op, DAG, DL);
28717
28718  // Decompose 512-bit ops into smaller 256-bit ops.
28719  if (VT.is512BitVector() && !Subtarget.hasBWI())
28720    return splitVectorIntUnary(Op, DAG, DL);
28721
28722  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28723  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28724}
28725
28726static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28727 SelectionDAG &DAG) {
28728 MVT VT = Op.getSimpleValueType();
28729 MVT OpVT = VT;
28730 unsigned NumBits = VT.getSizeInBits();
28731 SDLoc dl(Op);
28732 unsigned Opc = Op.getOpcode();
28733
28734 if (VT.isVector())
28735 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28736
28737 Op = Op.getOperand(0);
28738 if (VT == MVT::i8) {
28739 // Zero extend to i32 since there is not an i8 bsr.
28740 OpVT = MVT::i32;
28741 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28742 }
28743
28744 // Check if we can safely pass a result though BSR for zero sources.
28745 SDValue PassThru = DAG.getUNDEF(OpVT);
28746 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
28747 !DAG.isKnownNeverZero(Op))
28748 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
28749
28750 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28751 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28752 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
28753
28754 // Skip CMOV if we're using a pass through value.
28755 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
28756 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28757 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28758 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28759 Op.getValue(1)};
28760 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28761 }
28762
28763 // Finally xor with NumBits-1.
28764 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28765 DAG.getConstant(NumBits - 1, dl, OpVT));
28766
28767 if (VT == MVT::i8)
28768 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28769 return Op;
28770}
28771
28772static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28773 SelectionDAG &DAG) {
28774 MVT VT = Op.getSimpleValueType();
28775 unsigned NumBits = VT.getScalarSizeInBits();
28776 SDValue N0 = Op.getOperand(0);
28777 SDLoc dl(Op);
28778 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
28779
28780 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28781 "Only scalar CTTZ requires custom lowering");
28782
28783 // Check if we can safely pass a result though BSF for zero sources.
28784 SDValue PassThru = DAG.getUNDEF(VT);
28785 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
28786 PassThru = DAG.getConstant(NumBits, dl, VT);
28787
28788 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28789 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28790 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
28791
28792 // Skip CMOV if src is never zero or we're using a pass through value.
28793 if (NonZeroSrc || !PassThru.isUndef())
28794 return Op;
28795
28796 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28797 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28798 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28799 Op.getValue(1)};
28800 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28801}
28802
28804                           const X86Subtarget &Subtarget) {
  // Scalar i16/i32 forms are turned into a horizontal-add style operation;
  // 512-bit byte/word vectors and 256-bit integer vectors are split into
  // two half-width operations.
28805  MVT VT = Op.getSimpleValueType();
28806  SDLoc DL(Op);
28807
28808  if (VT == MVT::i16 || VT == MVT::i32)
28809    return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
28810
28811  if (VT == MVT::v32i16 || VT == MVT::v64i8)
28812    return splitVectorIntBinary(Op, DAG, DL);
28813
28814  assert(Op.getSimpleValueType().is256BitVector() &&
28815         Op.getSimpleValueType().isInteger() &&
28816         "Only handle AVX 256-bit vector integer operation");
28817  return splitVectorIntBinary(Op, DAG, DL);
28818}
28819
28821                                  const X86Subtarget &Subtarget) {
  // Custom lowering for saturating add/sub: split illegal wide vectors,
  // lower USUBSAT without PMAXU* via a bit-hack or compare+select, and
  // expand scalar / v2i64 signed saturation through [SU]{ADD,SUB}O.
28822  MVT VT = Op.getSimpleValueType();
28823  SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28824  unsigned Opcode = Op.getOpcode();
28825  SDLoc DL(Op);
28826
28827  if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28828      (VT.is256BitVector() && !Subtarget.hasInt256())) {
28829    assert(Op.getSimpleValueType().isInteger() &&
28830           "Only handle AVX vector integer operation");
28831    return splitVectorIntBinary(Op, DAG, DL);
28832  }
28833
28834  // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28835  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28836  EVT SetCCResultType =
28837      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28838
28839  unsigned BitWidth = VT.getScalarSizeInBits();
28840  if (Opcode == ISD::USUBSAT) {
28841    if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28842      // Handle a special-case with a bit-hack instead of cmp+select:
28843      // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28844      // If the target can use VPTERNLOG, DAGToDAG will match this as
28845      // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28846      // "broadcast" constant load.
28847      if (C && C->getAPIntValue().isSignMask()) {
28848        SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28849        SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28850        SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28851        SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28852        return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28853      }
28854    }
28855    if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28856      // usubsat X, Y --> (X >u Y) ? X - Y : 0
28857      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28858      SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28859      // TODO: Move this to DAGCombiner?
      // When the setcc produces all-ones/all-zero lanes of the same type,
      // the select can be replaced by a plain AND.
28860      if (SetCCResultType == VT &&
28861          DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28862        return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28863      return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28864    }
28865  }
28866
28867  if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28868      (!VT.isVector() || VT == MVT::v2i64)) {
28870    SDValue Zero = DAG.getConstant(0, DL, VT);
    // Compute the overflowing sum/diff; on overflow, saturate based on the
    // wrapped result's sign: a negative wrapped value means the true result
    // overflowed upward (pick SatMax), otherwise downward (pick SatMin).
28871    SDValue Result =
28872        DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28873                    DAG.getVTList(VT, SetCCResultType), X, Y);
28874    SDValue SumDiff = Result.getValue(0);
28875    SDValue Overflow = Result.getValue(1);
28876    SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28877    SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28878    SDValue SumNeg =
28879        DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28880    Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28881    return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28882  }
28883
28884  // Use default expansion.
28885  return SDValue();
28886}
28889
28890static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28891 SelectionDAG &DAG) {
28892 MVT VT = Op.getSimpleValueType();
28893 SDLoc DL(Op);
28894
28895 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28896 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28897 // 8-bit integer abs to NEG and CMOV.
28898 SDValue N0 = Op.getOperand(0);
28899 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28900 DAG.getConstant(0, DL, VT), N0);
28901 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28902 SDValue(Neg.getNode(), 1)};
28903 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28904 }
28905
28906 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28907 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28908 SDValue Src = Op.getOperand(0);
28909 SDValue Neg = DAG.getNegative(Src, DL, VT);
28910 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
28911 }
28912
28913 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28914 assert(VT.isInteger() &&
28915 "Only handle AVX 256-bit vector integer operation");
28916 return splitVectorIntUnary(Op, DAG, DL);
28917 }
28918
28919 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28920 return splitVectorIntUnary(Op, DAG, DL);
28921
28922 // Default to expand.
28923 return SDValue();
28924}
28925
28926static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28927 SelectionDAG &DAG) {
28928 MVT VT = Op.getSimpleValueType();
28929 SDLoc DL(Op);
28930
28931 // For AVX1 cases, split to use legal ops.
28932 if (VT.is256BitVector() && !Subtarget.hasInt256())
28933 return splitVectorIntBinary(Op, DAG, DL);
28934
28935 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28936 return splitVectorIntBinary(Op, DAG, DL);
28937
28938 // Default to expand.
28939 return SDValue();
28940}
28941
28942static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
28943 SelectionDAG &DAG) {
28944 MVT VT = Op.getSimpleValueType();
28945 SDLoc DL(Op);
28946
28947 // For AVX1 cases, split to use legal ops.
28948 if (VT.is256BitVector() && !Subtarget.hasInt256())
28949 return splitVectorIntBinary(Op, DAG, DL);
28950
28951 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28952 return splitVectorIntBinary(Op, DAG, DL);
28953
28954 // Default to expand.
28955 return SDValue();
28956}
28957
28959                                      SelectionDAG &DAG) {
  // Lower FMINIMUM/FMAXIMUM/FMINIMUMNUM/FMAXIMUMNUM. X86's FMIN/FMAX are not
  // symmetric w.r.t. NaN and signed zeros, so the operands may be reordered
  // and a NaN fixup appended; AVX10.2 targets use native VMINMAX* forms.
28960  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28961  EVT VT = Op.getValueType();
28962  SDValue X = Op.getOperand(0);
28963  SDValue Y = Op.getOperand(1);
28964  SDLoc DL(Op);
28965  bool IsMaxOp =
28966      Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
28967  bool IsNum =
28968      Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
28969  if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
    // AVX10.2 provides dedicated minmax instructions; encode op kind and
    // number-preference in the immediate.
28970    unsigned Opc = 0;
28971    if (VT.isVector())
28972      Opc = X86ISD::VMINMAX;
28973    else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
28974      Opc = X86ISD::VMINMAXS;
28975
28976    if (Opc) {
28977      SDValue Imm =
28978          DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
28979      return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
28980    }
28981  }
28982
28983  uint64_t SizeInBits = VT.getScalarSizeInBits();
28984  APInt PreferredZero = APInt::getZero(SizeInBits);
28985  APInt OppositeZero = PreferredZero;
28986  EVT IVT = VT.changeTypeToInteger();
28987  X86ISD::NodeType MinMaxOp;
28988  if (IsMaxOp) {
28989    MinMaxOp = X86ISD::FMAX;
28990    OppositeZero.setSignBit();
28991  } else {
28992    PreferredZero.setSignBit();
28993    MinMaxOp = X86ISD::FMIN;
28994  }
28995  EVT SetCCType =
28996      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28997
28998  // The tables below show the expected result of Max in cases of NaN and
28999  // signed zeros.
29000  //
29001  //                 Y               Y
29002  //             Num   xNaN      +0    -0
29003  //          ---------------   ---------------
29004  //     Num  |  Max |   Y  |  | +0  |  +0 |
29005  //  X       ---------------  X---------------
29006  //    xNaN  |   X  |  X/Y |  | -0  |  -0 |
29007  //          ---------------   ---------------
29008  //
29009  // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29010  // reordering.
29011  //
29012  // We check if any of operands is NaN and return NaN. Then we check if any of
29013  // operands is zero or negative zero (for fmaximum and fminimum respectively)
29014  // to ensure the correct zero is returned.
29015  auto MatchesZero = [](SDValue Op, APInt Zero) {
29017    if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29018      return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29019    if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29020      return CstOp->getAPIntValue() == Zero;
29021    if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29022        Op->getOpcode() == ISD::SPLAT_VECTOR) {
29023      for (const SDValue &OpVal : Op->op_values()) {
29024        if (OpVal.isUndef())
29025          continue;
29026        auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29027        if (!CstOp)
29028          return false;
29029        if (!CstOp->getValueAPF().isZero())
29030          continue;
29031        if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29032          return false;
29033      }
29034      return true;
29035    }
29036    return false;
29037  };
29038
29039  bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29040  bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29041  bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29042                          Op->getFlags().hasNoSignedZeros() ||
29043                          DAG.isKnownNeverZeroFloat(X) ||
29045  SDValue NewX, NewY;
29046  if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29047      MatchesZero(X, OppositeZero)) {
29048    // Operands are already in right order or order does not matter.
29049    NewX = X;
29050    NewY = Y;
29051  } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29052    NewX = Y;
29053    NewY = X;
29054  } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29055             (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
    // At least one operand is NaN-free: classify the other with VFPCLASSS
    // and swap operands when it is a NaN or the wrong-signed zero.
29056    if (IsXNeverNaN)
29057      std::swap(X, Y);
29058    // VFPCLASSS consumes a vector type. So provide a minimal one corresponded
29059    // xmm register.
29060    MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29062    // Bits of classes:
29063    // Bits  Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4]  Imm8[5]  Imm8[6] Imm8[7]
29064    // Class    QNAN PosZero NegZero  PosINF  NegINF Denormal Negative    SNAN
29065    SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29066                                        DL, MVT::i32);
29067    SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29068    SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29069                              DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29070                              DAG.getVectorIdxConstant(0, DL));
29071    SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29072    NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29073    NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29074    return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29075  } else {
    // No compile-time zero information: choose the operand order from X's
    // sign bit (see the tables above for the intended zero semantics).
29076    SDValue IsXSigned;
29077    if (Subtarget.is64Bit() || VT != MVT::f64) {
29078      SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29079      SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29080      IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29081    } else {
      // 32-bit targets can't bitcast f64 to i64; inspect the high 32 bits
      // through a vector insert/extract instead.
29082      assert(VT == MVT::f64);
29083      SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29084                                DAG.getConstantFP(0, DL, MVT::v2f64), X,
29085                                DAG.getVectorIdxConstant(0, DL));
29086      SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29087      SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29088                               DAG.getVectorIdxConstant(1, DL));
29089      Hi = DAG.getBitcast(MVT::i32, Hi);
29090      SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29091      EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29092                                             *DAG.getContext(), MVT::i32);
29093      IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29094    }
29095    if (MinMaxOp == X86ISD::FMAX) {
29096      NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29097      NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29098    } else {
29099      NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29100      NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29101    }
29102  }
29103
29104  bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29105                   Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29106
29107  // If we did no ordering operands for signed zero handling and we need
29108  // to process NaN and we know that the second operand is not NaN then put
29109  // it in first operand and we will not need to post handle NaN after max/min.
29110  if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
29111    std::swap(NewX, NewY);
29112
29113  SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29114
29115  if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
29116    return MinMax;
29117
  // Post-process NaN inputs: for the *NUM forms prefer the ordered result,
  // otherwise propagate the NaN operand.
29118  SDValue IsNaN =
29119      DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29120
29121  return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29122}
29123
29124static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29125 SelectionDAG &DAG) {
29126 MVT VT = Op.getSimpleValueType();
29127 SDLoc dl(Op);
29128
29129 // For AVX1 cases, split to use legal ops.
29130 if (VT.is256BitVector() && !Subtarget.hasInt256())
29131 return splitVectorIntBinary(Op, DAG, dl);
29132
29133 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29134 return splitVectorIntBinary(Op, DAG, dl);
29135
29136 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29137 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29138
29139 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29140 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29141 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29142
29143 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29144 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29145 if (VT.bitsGE(MVT::i32)) {
29146 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29147 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29148 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29149 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29150 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29151 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29152 DAG.getTargetConstant(CC, dl, MVT::i8),
29153 Diff1.getValue(1));
29154 }
29155
29156 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29157 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29158 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29159 MVT WideVT = MVT::getIntegerVT(WideBits);
29160 if (TLI.isTypeLegal(WideVT)) {
29161 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29162 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29163 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29164 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29165 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29166 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29167 DAG.getTargetConstant(CC, dl, MVT::i8),
29168 Diff1.getValue(1));
29169 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29170 }
29171 }
29172
29173 // Default to expand.
29174 return SDValue();
29175}
29176
29177static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29178 SelectionDAG &DAG) {
29179 SDLoc dl(Op);
29180 MVT VT = Op.getSimpleValueType();
29181
29182 // Decompose 256-bit ops into 128-bit ops.
29183 if (VT.is256BitVector() && !Subtarget.hasInt256())
29184 return splitVectorIntBinary(Op, DAG, dl);
29185
29186 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29187 return splitVectorIntBinary(Op, DAG, dl);
29188
29189 SDValue A = Op.getOperand(0);
29190 SDValue B = Op.getOperand(1);
29191
29192 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29193 // vector pairs, multiply and truncate.
29194 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29195 unsigned NumElts = VT.getVectorNumElements();
29196 unsigned NumLanes = VT.getSizeInBits() / 128;
29197 unsigned NumEltsPerLane = NumElts / NumLanes;
29198
29199 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29200 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29201 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29202 return DAG.getNode(
29203 ISD::TRUNCATE, dl, VT,
29204 DAG.getNode(ISD::MUL, dl, ExVT,
29205 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29206 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29207 }
29208
29209 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29210
29211 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29212 // Don't do this if we only need to unpack one half.
29213 if (Subtarget.hasSSSE3()) {
29214 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29215 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29216 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29217 if (BIsBuildVector) {
29218 for (auto [Idx, Val] : enumerate(B->ops())) {
29219 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29220 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29221 else
29222 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29223 }
29224 }
29225 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29226 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29227 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29228 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29229 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29230 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29231 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29232 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29233 DAG.getTargetConstant(8, dl, MVT::i8));
29234 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29235 }
29236 }
29237
29238 // Extract the lo/hi parts to any extend to i16.
29239 // We're going to mask off the low byte of each result element of the
29240 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29241 // element.
29242 SDValue Undef = DAG.getUNDEF(VT);
29243 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29244 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29245
29246 SDValue BLo, BHi;
29247 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29248 // If the RHS is a constant, manually unpackl/unpackh.
29249 SmallVector<SDValue, 16> LoOps, HiOps;
29250 for (unsigned i = 0; i != NumElts; i += 16) {
29251 for (unsigned j = 0; j != 8; ++j) {
29252 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29253 MVT::i16));
29254 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29255 MVT::i16));
29256 }
29257 }
29258
29259 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29260 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29261 } else {
29262 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29263 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29264 }
29265
29266 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29267 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29268 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29269 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29270 }
29271
29272 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29273 if (VT == MVT::v4i32) {
29274 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29275 "Should not custom lower when pmulld is available!");
29276
29277 // Extract the odd parts.
29278 static const int UnpackMask[] = {1, 1, 3, 3};
29279 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29280 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29281
29282 // Multiply the even parts.
29283 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29284 DAG.getBitcast(MVT::v2i64, A),
29285 DAG.getBitcast(MVT::v2i64, B));
29286 // Now multiply odd parts.
29287 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29288 DAG.getBitcast(MVT::v2i64, Aodds),
29289 DAG.getBitcast(MVT::v2i64, Bodds));
29290
29291 Evens = DAG.getBitcast(VT, Evens);
29292 Odds = DAG.getBitcast(VT, Odds);
29293
29294 // Merge the two vectors back together with a shuffle. This expands into 2
29295 // shuffles.
29296 static const int ShufMask[] = { 0, 4, 2, 6 };
29297 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29298 }
29299
29300 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29301 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29302 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29303
29304 // Ahi = psrlqi(a, 32);
29305 // Bhi = psrlqi(b, 32);
29306 //
29307 // AloBlo = pmuludq(a, b);
29308 // AloBhi = pmuludq(a, Bhi);
29309 // AhiBlo = pmuludq(Ahi, b);
29310 //
29311 // Hi = psllqi(AloBhi + AhiBlo, 32);
29312 // return AloBlo + Hi;
29313 KnownBits AKnown = DAG.computeKnownBits(A);
29314 KnownBits BKnown = DAG.computeKnownBits(B);
29315
29316 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29317 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29318 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29319
29320 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29321 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29322 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29323
29324 SDValue Zero = DAG.getConstant(0, dl, VT);
29325
29326 // Only multiply lo/hi halves that aren't known to be zero.
29327 SDValue AloBlo = Zero;
29328 if (!ALoIsZero && !BLoIsZero)
29329 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29330
29331 SDValue AloBhi = Zero;
29332 if (!ALoIsZero && !BHiIsZero) {
29333 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29334 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29335 }
29336
29337 SDValue AhiBlo = Zero;
29338 if (!AHiIsZero && !BLoIsZero) {
29339 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29340 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29341 }
29342
29343 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29344 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29345
29346 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29347}
29348
29350 MVT VT, bool IsSigned,
29351 const X86Subtarget &Subtarget,
29352 SelectionDAG &DAG,
29353 SDValue *Low = nullptr) {
29354 unsigned NumElts = VT.getVectorNumElements();
29355
29356 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29357 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29358 // lane results back together.
29359
29360 // We'll take different approaches for signed and unsigned.
29361 // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
29362 // and use pmullw to calculate the full 16-bit product.
29363 // For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
29364 // shift them left into the upper byte of each word. This allows us to use
29365 // pmulhw to calculate the full 16-bit product. This trick means we don't
29366 // need to sign extend the bytes to use pmullw.
29367
29368 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29369 SDValue Zero = DAG.getConstant(0, dl, VT);
29370
29371 SDValue ALo, AHi;
29372 if (IsSigned) {
29373 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29374 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29375 } else {
29376 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29377 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29378 }
29379
29380 SDValue BLo, BHi;
29381 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29382 // If the RHS is a constant, manually unpackl/unpackh and extend.
29383 SmallVector<SDValue, 16> LoOps, HiOps;
29384 for (unsigned i = 0; i != NumElts; i += 16) {
29385 for (unsigned j = 0; j != 8; ++j) {
29386 SDValue LoOp = B.getOperand(i + j);
29387 SDValue HiOp = B.getOperand(i + j + 8);
29388
29389 if (IsSigned) {
29390 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29391 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29392 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29393 DAG.getConstant(8, dl, MVT::i16));
29394 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29395 DAG.getConstant(8, dl, MVT::i16));
29396 } else {
29397 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29398 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29399 }
29400
29401 LoOps.push_back(LoOp);
29402 HiOps.push_back(HiOp);
29403 }
29404 }
29405
29406 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29407 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29408 } else if (IsSigned) {
29409 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29410 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29411 } else {
29412 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29413 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29414 }
29415
29416 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29417 // pack back to vXi8.
29418 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29419 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29420 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29421
29422 if (Low)
29423 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29424
29425 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29426}
29427
29428static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29429 SelectionDAG &DAG) {
29430 SDLoc dl(Op);
29431 MVT VT = Op.getSimpleValueType();
29432 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29433 unsigned NumElts = VT.getVectorNumElements();
29434 SDValue A = Op.getOperand(0);
29435 SDValue B = Op.getOperand(1);
29436
29437 // Decompose 256-bit ops into 128-bit ops.
29438 if (VT.is256BitVector() && !Subtarget.hasInt256())
29439 return splitVectorIntBinary(Op, DAG, dl);
29440
29441 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29442 return splitVectorIntBinary(Op, DAG, dl);
29443
29444 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29445 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29446 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29447 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29448
29449 // PMULxD operations multiply each even value (starting at 0) of LHS with
29450 // the related value of RHS and produce a widen result.
29451 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29452 // => <2 x i64> <ae|cg>
29453 //
29454 // In other word, to have all the results, we need to perform two PMULxD:
29455 // 1. one with the even values.
29456 // 2. one with the odd values.
29457 // To achieve #2, with need to place the odd values at an even position.
29458 //
29459 // Place the odd value at an even position (basically, shift all values 1
29460 // step to the left):
29461 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29462 9, -1, 11, -1, 13, -1, 15, -1};
29463 // <a|b|c|d> => <b|undef|d|undef>
29464 SDValue Odd0 =
29465 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29466 // <e|f|g|h> => <f|undef|h|undef>
29467 SDValue Odd1 =
29468 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29469
29470 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29471 // ints.
29472 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29473 unsigned Opcode =
29474 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29475 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29476 // => <2 x i64> <ae|cg>
29477 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29478 DAG.getBitcast(MulVT, A),
29479 DAG.getBitcast(MulVT, B)));
29480 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29481 // => <2 x i64> <bf|dh>
29482 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29483 DAG.getBitcast(MulVT, Odd0),
29484 DAG.getBitcast(MulVT, Odd1)));
29485
29486 // Shuffle it back into the right order.
29487 SmallVector<int, 16> ShufMask(NumElts);
29488 for (int i = 0; i != (int)NumElts; ++i)
29489 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29490
29491 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29492
29493 // If we have a signed multiply but no PMULDQ fix up the result of an
29494 // unsigned multiply.
29495 if (IsSigned && !Subtarget.hasSSE41()) {
29496 SDValue Zero = DAG.getConstant(0, dl, VT);
29497 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29498 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29499 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29500 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29501
29502 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29503 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29504 }
29505
29506 return Res;
29507 }
29508
29509 // Only i8 vectors should need custom lowering after this.
29510 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29511 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29512 "Unsupported vector type");
29513
29514 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29515 // logical shift down the upper half and pack back to i8.
29516
29517 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29518 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29519
29520 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29521 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29522 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29523 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29524 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29525 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29526 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29527 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29528 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29529 }
29530
29531 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29532}
29533
29534// Custom lowering for SMULO/UMULO.
29535static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29536 SelectionDAG &DAG) {
29537 MVT VT = Op.getSimpleValueType();
29538
29539 // Scalars defer to LowerXALUO.
29540 if (!VT.isVector())
29541 return LowerXALUO(Op, DAG);
29542
29543 SDLoc dl(Op);
29544 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29545 SDValue A = Op.getOperand(0);
29546 SDValue B = Op.getOperand(1);
29547 EVT OvfVT = Op->getValueType(1);
29548
29549 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29550 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29551 // Extract the LHS Lo/Hi vectors
29552 SDValue LHSLo, LHSHi;
29553 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29554
29555 // Extract the RHS Lo/Hi vectors
29556 SDValue RHSLo, RHSHi;
29557 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29558
29559 EVT LoOvfVT, HiOvfVT;
29560 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29561 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29562 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29563
29564 // Issue the split operations.
29565 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29566 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29567
29568 // Join the separate data results and the overflow results.
29569 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29570 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29571 Hi.getValue(1));
29572
29573 return DAG.getMergeValues({Res, Ovf}, dl);
29574 }
29575
29576 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29577 EVT SetccVT =
29578 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29579
29580 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29581 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29582 unsigned NumElts = VT.getVectorNumElements();
29583 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29584 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29585 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29586 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29587 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29588
29589 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29590
29591 SDValue Ovf;
29592 if (IsSigned) {
29593 SDValue High, LowSign;
29594 if (OvfVT.getVectorElementType() == MVT::i1 &&
29595 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29596 // Rather the truncating try to do the compare on vXi16 or vXi32.
29597 // Shift the high down filling with sign bits.
29598 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29599 // Fill all 16 bits with the sign bit from the low.
29600 LowSign =
29601 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29602 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29603 15, DAG);
29604 SetccVT = OvfVT;
29605 if (!Subtarget.hasBWI()) {
29606 // We can't do a vXi16 compare so sign extend to v16i32.
29607 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29608 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29609 }
29610 } else {
29611 // Otherwise do the compare at vXi8.
29612 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29613 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29614 LowSign =
29615 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29616 }
29617
29618 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29619 } else {
29620 SDValue High =
29621 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29622 if (OvfVT.getVectorElementType() == MVT::i1 &&
29623 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29624 // Rather the truncating try to do the compare on vXi16 or vXi32.
29625 SetccVT = OvfVT;
29626 if (!Subtarget.hasBWI()) {
29627 // We can't do a vXi16 compare so sign extend to v16i32.
29628 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
29629 }
29630 } else {
29631 // Otherwise do the compare at vXi8.
29632 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29633 }
29634
29635 Ovf =
29636 DAG.getSetCC(dl, SetccVT, High,
29637 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
29638 }
29639
29640 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29641
29642 return DAG.getMergeValues({Low, Ovf}, dl);
29643 }
29644
29645 SDValue Low;
29646 SDValue High =
29647 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
29648
29649 SDValue Ovf;
29650 if (IsSigned) {
29651 // SMULO overflows if the high bits don't match the sign of the low.
29652 SDValue LowSign =
29653 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29654 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29655 } else {
29656 // UMULO overflows if the high bits are non-zero.
29657 Ovf =
29658 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
29659 }
29660
29661 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29662
29663 return DAG.getMergeValues({Low, Ovf}, dl);
29664}
29665
29666SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
29667 assert(Subtarget.isTargetWin64() && "Unexpected target");
29668 EVT VT = Op.getValueType();
29669 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29670 "Unexpected return type for lowering");
29671
29672 if (isa<ConstantSDNode>(Op->getOperand(1))) {
29674 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
29675 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
29676 }
29677
29678 RTLIB::Libcall LC;
29679 bool isSigned;
29680 switch (Op->getOpcode()) {
29681 // clang-format off
29682 default: llvm_unreachable("Unexpected request for libcall!");
29683 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
29684 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
29685 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
29686 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
29687 // clang-format on
29688 }
29689
29690 SDLoc dl(Op);
29691 SDValue InChain = DAG.getEntryNode();
29692
29695 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
29696 EVT ArgVT = Op->getOperand(i).getValueType();
29697 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29698 "Unexpected argument type for lowering");
29699 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29700 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29701 MachinePointerInfo MPI =
29703 Entry.Node = StackPtr;
29704 InChain =
29705 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29706 Entry.Ty = PointerType::get(*DAG.getContext(), 0);
29707 Entry.IsSExt = false;
29708 Entry.IsZExt = false;
29709 Args.push_back(Entry);
29710 }
29711
29714
29716 CLI.setDebugLoc(dl)
29717 .setChain(InChain)
29718 .setLibCallee(
29720 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
29721 std::move(Args))
29722 .setInRegister()
29723 .setSExtResult(isSigned)
29724 .setZExtResult(!isSigned);
29725
29726 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
29727 return DAG.getBitcast(VT, CallInfo.first);
29728}
29729
29730SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
29731 SelectionDAG &DAG,
29732 SDValue &Chain) const {
29733 assert(Subtarget.isTargetWin64() && "Unexpected target");
29734 EVT VT = Op.getValueType();
29735 bool IsStrict = Op->isStrictFPOpcode();
29736
29737 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29738 EVT ArgVT = Arg.getValueType();
29739
29740 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29741 "Unexpected return type for lowering");
29742
29743 RTLIB::Libcall LC;
29744 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29745 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29746 LC = RTLIB::getFPTOSINT(ArgVT, VT);
29747 else
29748 LC = RTLIB::getFPTOUINT(ArgVT, VT);
29749 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29750
29751 SDLoc dl(Op);
29752 MakeLibCallOptions CallOptions;
29753 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29754
29756 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
29757 // expected VT (i128).
29758 std::tie(Result, Chain) =
29759 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
29760 Result = DAG.getBitcast(VT, Result);
29761 return Result;
29762}
29763
29764SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
29765 SelectionDAG &DAG) const {
29766 assert(Subtarget.isTargetWin64() && "Unexpected target");
29767 EVT VT = Op.getValueType();
29768 bool IsStrict = Op->isStrictFPOpcode();
29769
29770 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29771 EVT ArgVT = Arg.getValueType();
29772
29773 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29774 "Unexpected argument type for lowering");
29775
29776 RTLIB::Libcall LC;
29777 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29778 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29779 LC = RTLIB::getSINTTOFP(ArgVT, VT);
29780 else
29781 LC = RTLIB::getUINTTOFP(ArgVT, VT);
29782 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29783
29784 SDLoc dl(Op);
29785 MakeLibCallOptions CallOptions;
29786 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29787
29788 // Pass the i128 argument as an indirect argument on the stack.
29789 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29790 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29791 MachinePointerInfo MPI =
29793 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29794
29796 std::tie(Result, Chain) =
29797 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29798 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29799}
29800
29801// Return true if the required (according to Opcode) shift-imm form is natively
29802// supported by the Subtarget
29803static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
29804 unsigned Opcode) {
29805 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29806 "Unexpected shift opcode");
29807
29808 if (!VT.isSimple())
29809 return false;
29810
29811 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29812 return false;
29813
29814 if (VT.getScalarSizeInBits() < 16)
29815 return false;
29816
29817 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29818 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29819 return true;
29820
29821 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29822 (VT.is256BitVector() && Subtarget.hasInt256());
29823
29824 bool AShift = LShift && (Subtarget.hasAVX512() ||
29825 (VT != MVT::v2i64 && VT != MVT::v4i64));
29826 return (Opcode == ISD::SRA) ? AShift : LShift;
29827}
29828
29829// The shift amount is a variable, but it is the same for all vector lanes.
29830// These instructions are defined together with shift-immediate.
29831static
29833 unsigned Opcode) {
29834 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29835}
29836
29837// Return true if the required (according to Opcode) variable-shift form is
29838// natively supported by the Subtarget
29839static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
29840 unsigned Opcode) {
29841 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29842 "Unexpected shift opcode");
29843
29844 if (!VT.isSimple())
29845 return false;
29846
29847 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29848 return false;
29849
29850 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29851 return false;
29852
29853 // vXi16 supported only on AVX-512, BWI
29854 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29855 return false;
29856
29857 if (Subtarget.hasAVX512() &&
29858 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29859 return true;
29860
29861 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29862 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29863 return (Opcode == ISD::SRA) ? AShift : LShift;
29864}
29865
29867 const X86Subtarget &Subtarget) {
29868 MVT VT = Op.getSimpleValueType();
29869 SDLoc dl(Op);
29870 SDValue R = Op.getOperand(0);
29871 SDValue Amt = Op.getOperand(1);
29872 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29873 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29874
29875 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29876 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29877 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29878 SDValue Ex = DAG.getBitcast(ExVT, R);
29879
29880 // ashr(R, 63) === cmp_slt(R, 0)
29881 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29882 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29883 "Unsupported PCMPGT op");
29884 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29885 }
29886
29887 if (ShiftAmt >= 32) {
29888 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29889 SDValue Upper =
29890 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29892 ShiftAmt - 32, DAG);
29893 if (VT == MVT::v2i64)
29894 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29895 if (VT == MVT::v4i64)
29896 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29897 {9, 1, 11, 3, 13, 5, 15, 7});
29898 } else {
29899 // SRA upper i32, SRL whole i64 and select lower i32.
29901 ShiftAmt, DAG);
29902 SDValue Lower =
29903 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29904 Lower = DAG.getBitcast(ExVT, Lower);
29905 if (VT == MVT::v2i64)
29906 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29907 if (VT == MVT::v4i64)
29908 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29909 {8, 1, 10, 3, 12, 5, 14, 7});
29910 }
29911 return DAG.getBitcast(VT, Ex);
29912 };
29913
29914 // Optimize shl/srl/sra with constant shift amount.
29915 APInt APIntShiftAmt;
29916 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29917 return SDValue();
29918
29919 // If the shift amount is out of range, return undef.
29920 if (APIntShiftAmt.uge(EltSizeInBits))
29921 return DAG.getUNDEF(VT);
29922
29923 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29924
29925 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
29926 // Hardware support for vector shifts is sparse which makes us scalarize the
29927 // vector operations in many cases. Also, on sandybridge ADD is faster than
29928 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29929 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29930 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29931 // must be 0). (add undef, undef) however can be any value. To make this
29932 // safe, we must freeze R to ensure that register allocation uses the same
29933 // register for an undefined value. This ensures that the result will
29934 // still be even and preserves the original semantics.
29935 R = DAG.getFreeze(R);
29936 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29937 }
29938
29939 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29940 }
29941
29942 // i64 SRA needs to be performed as partial shifts.
29943 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29944 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29945 Op.getOpcode() == ISD::SRA)
29946 return ArithmeticShiftRight64(ShiftAmt);
29947
29948 // If we're logical shifting an all-signbits value then we can just perform as
29949 // a mask.
29950 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
29951 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
29952 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
29953 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
29954 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
29955 }
29956
29957 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29958 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29959 unsigned NumElts = VT.getVectorNumElements();
29960 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29961
29962 // Simple i8 add case
29963 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29964 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29965 // must be 0). (add undef, undef) however can be any value. To make this
29966 // safe, we must freeze R to ensure that register allocation uses the same
29967 // register for an undefined value. This ensures that the result will
29968 // still be even and preserves the original semantics.
29969 R = DAG.getFreeze(R);
29970 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29971 }
29972
29973 // ashr(R, 7) === cmp_slt(R, 0)
29974 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29975 SDValue Zeros = DAG.getConstant(0, dl, VT);
29976 if (VT.is512BitVector()) {
29977 assert(VT == MVT::v64i8 && "Unexpected element type!");
29978 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29979 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29980 }
29981 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29982 }
29983
29984 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29985 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29986 return SDValue();
29987
29988 if (Subtarget.hasGFNI()) {
29989 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
29990 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
29991 DAG.getTargetConstant(0, dl, MVT::i8));
29992 }
29993
29994 if (Op.getOpcode() == ISD::SHL) {
29995 // Make a large shift.
29996 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29997 ShiftAmt, DAG);
29998 SHL = DAG.getBitcast(VT, SHL);
29999 // Zero out the rightmost bits.
30000 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30001 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30002 }
30003 if (Op.getOpcode() == ISD::SRL) {
30004 // Make a large shift.
30005 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30006 ShiftAmt, DAG);
30007 SRL = DAG.getBitcast(VT, SRL);
30008 // Zero out the leftmost bits.
30009 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30010 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30011 }
30012 if (Op.getOpcode() == ISD::SRA) {
30013 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
30014 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30015
30016 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30017 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30018 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30019 return Res;
30020 }
30021 llvm_unreachable("Unknown shift opcode.");
30022 }
30023
30024 return SDValue();
30025}
30026
// Lower a vector shift (SHL/SRL/SRA) whose per-lane shift amount is a splat
// of a single, non-constant scalar element, so the whole vector can be
// shifted by one base amount. Returns an empty SDValue if the amount is not
// a uniform splat or no profitable lowering exists here.
30028 const X86Subtarget &Subtarget) {
30029 MVT VT = Op.getSimpleValueType();
30030 SDLoc dl(Op);
30031 SDValue R = Op.getOperand(0);
30032 SDValue Amt = Op.getOperand(1);
30033 unsigned Opcode = Op.getOpcode();
// Uniform (non-immediate) X86 shift opcode matching the generic ISD opcode.
30034 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30035
// Find the source vector and element index that Amt is a splat of; if Amt is
// not a uniform splat we fall through and return an empty SDValue.
30036 int BaseShAmtIdx = -1;
30037 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
// Preferred path: the target supports shifting this type directly by a
// scalar base amount.
30038 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30039 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30040 Subtarget, DAG);
30041
30042 // vXi8 shifts - shift as v8i16 + mask result.
// There are no native x86 vXi8 shifts (outside XOP), so perform the shift in
// vXi16 lanes and mask away the bits that crossed byte boundaries. The size
// checks avoid this when a widened 512-bit lowering is available instead.
30043 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30044 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30045 VT == MVT::v64i8) &&
30046 !Subtarget.hasXOP()) {
30047 unsigned NumElts = VT.getVectorNumElements();
30048 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30049 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
// SRA is emulated on top of a logical right shift (sign-fixup below), so the
// actual hardware shift used is always SHL or SRL.
30050 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30051 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30052
30053 // Create the mask using vXi16 shifts. For shift-rights we need to move
30054 // the upper byte down before splatting the vXi8 mask.
30055 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30056 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30057 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30058 if (Opcode != ISD::SHL)
30059 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30060 8, DAG);
// Broadcast byte 0 of the mask to every byte lane.
30061 BitMask = DAG.getBitcast(VT, BitMask);
30062 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30063 SmallVector<int, 64>(NumElts, 0));
30064
// Shift the input in vXi16 lanes, then mask off the bits that leaked across
// byte boundaries.
30065 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30066 DAG.getBitcast(ExtVT, R), BaseShAmt,
30067 BaseShAmtIdx, Subtarget, DAG);
30068 Res = DAG.getBitcast(VT, Res);
30069 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30070
30071 if (Opcode == ISD::SRA) {
30072 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30073 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
// 0x8080 puts the sign bit of both bytes within each i16 lane.
30074 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30075 SignMask =
30076 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30077 BaseShAmtIdx, Subtarget, DAG);
30078 SignMask = DAG.getBitcast(VT, SignMask);
30079 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30080 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30081 }
30082 return Res;
30083 }
30084 }
30085 }
30086
// Not a splat amount (or an unsupported type) - defer to other lowerings.
30087 return SDValue();
30088}
30089
30090// Convert a shift/rotate left amount to a multiplication scale factor.
// i.e. produce a vector where lane I holds (1 << Amt[I]) so that
// 'shl X, Amt' can instead be lowered as 'mul X, Scale'. Returns an empty
// SDValue for unsupported types/subtargets or when no cheap form is found.
30092 const X86Subtarget &Subtarget,
30093 SelectionDAG &DAG) {
30094 MVT VT = Amt.getSimpleValueType();
// Only handle types for which the caller has a profitable multiply lowering.
30095 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30096 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30097 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30098 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30099 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30100 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30101 return SDValue();
30102
30103 MVT SVT = VT.getVectorElementType();
30104 unsigned SVTBits = SVT.getSizeInBits();
30105 unsigned NumElems = VT.getVectorNumElements();
30106
// Constant amounts: materialize (1 << amt) directly per lane. Undef or
// out-of-range lanes are left undef in the resulting build vector.
30107 APInt UndefElts;
30108 SmallVector<APInt> EltBits;
30109 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30110 APInt One(SVTBits, 1);
30111 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30112 for (unsigned I = 0; I != NumElems; ++I) {
30113 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30114 continue;
30115 uint64_t ShAmt = EltBits[I].getZExtValue();
30116 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30117 }
30118 return DAG.getBuildVector(VT, dl, Elts);
30119 }
30120
30121 // If the target doesn't support variable shifts, use either FP conversion
30122 // or integer multiplication to avoid shifting each element individually.
// v4i32: place Amt into the IEEE-754 single-precision exponent field
// (Amt << 23) and add the bit pattern of 1.0f (0x3f800000); the result is
// the float 2^Amt, which FP_TO_SINT converts back to the integer 1 << Amt.
30123 if (VT == MVT::v4i32) {
30124 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30125 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30126 DAG.getConstant(0x3f800000U, dl, VT));
30127 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30128 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30129 }
30130
30131 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
// Pre-AVX2 v8i16: zero-unpack into two v4i32 halves, recurse to compute each
// half's scale via the FP trick above, then pack the halves back to v8i16.
30132 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30133 SDValue Z = DAG.getConstant(0, dl, VT);
30134 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30135 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30136 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30137 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30138 if (Subtarget.hasSSE41())
30139 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30140 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30141 }
30142
30143 return SDValue();
30144}
30145
30146static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30147 SelectionDAG &DAG) {
30148 MVT VT = Op.getSimpleValueType();
30149 SDLoc dl(Op);
30150 SDValue R = Op.getOperand(0);
30151 SDValue Amt = Op.getOperand(1);
30152 unsigned NumElts = VT.getVectorNumElements();
30153 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30154 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30155
30156 unsigned Opc = Op.getOpcode();
30157 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30158 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30159
30160 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30161 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30162
30163 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30164 return V;
30165
30166 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30167 return V;
30168
30169 if (supportedVectorVarShift(VT, Subtarget, Opc))
30170 return Op;
30171
30172 // i64 vector arithmetic shift can be emulated with the transform:
30173 // M = lshr(SIGN_MASK, Amt)
30174 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30175 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30176 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30177 Opc == ISD::SRA) {
30178 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30179 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30180 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30181 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30182 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30183 return R;
30184 }
30185
30186 // XOP has 128-bit variable logical/arithmetic shifts.
30187 // +ve/-ve Amt = shift left/right.
30188 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30189 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30190 if (Opc == ISD::SRL || Opc == ISD::SRA)
30191 Amt = DAG.getNegative(Amt, dl, VT);
30192 if (Opc == ISD::SHL || Opc == ISD::SRL)
30193 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30194 if (Opc == ISD::SRA)
30195 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30196 }
30197
30198 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30199 // shifts per-lane and then shuffle the partial results back together.
30200 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30201 // Splat the shift amounts so the scalar shifts above will catch it.
30202 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30203 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30204 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30205 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30206 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30207 }
30208
30209 // Build a map of inrange constant amounts with element mask where they occur.
30211 if (ConstantAmt) {
30212 for (unsigned I = 0; I != NumElts; ++I) {
30213 SDValue A = Amt.getOperand(I);
30214 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30215 continue;
30216 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30217 if (UniqueCstAmt.count(CstAmt)) {
30218 UniqueCstAmt[CstAmt].setBit(I);
30219 continue;
30220 }
30221 UniqueCstAmt[CstAmt] = APInt::getOneBitSet(NumElts, I);
30222 }
30223 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30224 }
30225
30226 // If possible, lower this shift as a sequence of two shifts by
30227 // constant plus a BLENDing shuffle instead of scalarizing it.
30228 // Example:
30229 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30230 //
30231 // Could be rewritten as:
30232 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30233 //
30234 // The advantage is that the two shifts from the example would be
30235 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30236 if (UniqueCstAmt.size() == 2 &&
30237 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30238 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30239 unsigned AmtA = UniqueCstAmt.begin()->first;
30240 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30241 const APInt &MaskA = UniqueCstAmt.begin()->second;
30242 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30243 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30244 for (unsigned I = 0; I != NumElts; ++I) {
30245 if (MaskA[I])
30246 ShuffleMask[I] = I;
30247 if (MaskB[I])
30248 ShuffleMask[I] = I + NumElts;
30249 }
30250
30251 // Only perform this blend if we can perform it without loading a mask.
30252 if ((VT != MVT::v16i16 ||
30253 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30254 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30255 canWidenShuffleElements(ShuffleMask))) {
30256 SDValue Shift1 =
30257 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30258 SDValue Shift2 =
30259 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30260 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30261 }
30262 }
30263
30264 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30265 // using vYiM vector operations where X*N == Y*M and M > N.
30266 if (ConstantAmt &&
30267 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30268 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30269 !Subtarget.hasXOP()) {
30270 MVT NarrowScalarVT = VT.getScalarType();
30271 // We can do this extra fast if each pair of narrow elements is shifted by
30272 // the same amount by doing this SWAR style: use a shift to move the valid
30273 // bits to the right position, mask out any bits which crossed from one
30274 // element to the other.
30275 // This optimized lowering is only valid if the elements in a pair can
30276 // be treated identically.
30277 SmallVector<SDValue, 32> AmtWideElts(Amt->op_begin(), Amt->op_end());
30278 SmallVector<SDValue, 32> TmpAmtWideElts;
30279 int WideEltSizeInBits = EltSizeInBits;
30280 while (WideEltSizeInBits < 32) {
30281 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30282 // unprofitable.
30283 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30284 break;
30285 }
30286 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30287 bool SameShifts = true;
30288 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30289 unsigned DstI = SrcI / 2;
30290 // Both elements are undef? Make a note and keep going.
30291 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30292 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30293 continue;
30294 }
30295 // Even element is undef? We will shift it by the same shift amount as
30296 // the odd element.
30297 if (AmtWideElts[SrcI].isUndef()) {
30298 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30299 continue;
30300 }
30301 // Odd element is undef? We will shift it by the same shift amount as
30302 // the even element.
30303 if (AmtWideElts[SrcI + 1].isUndef()) {
30304 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30305 continue;
30306 }
30307 // Both elements are equal.
30308 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30309 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30310 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30311 continue;
30312 }
30313 // One of the provisional wide elements will not have the same shift
30314 // amount. Let's bail.
30315 SameShifts = false;
30316 break;
30317 }
30318 if (!SameShifts) {
30319 break;
30320 }
30321 WideEltSizeInBits *= 2;
30322 std::swap(TmpAmtWideElts, AmtWideElts);
30323 }
30324 APInt APIntShiftAmt;
30325 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30326 bool Profitable = WidenShift;
30327 // AVX512BW brings support for vpsllvw.
30328 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30329 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30330 Profitable = false;
30331 }
30332 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30333 // fairly cheaply in other ways.
30334 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30335 Profitable = false;
30336 }
30337 // Leave it up to GFNI if we have it around.
30338 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30339 // is probably a win to use other strategies in some cases.
30340 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30341 Profitable = false;
30342 }
30343
30344 // AVX1 does not have vpand which makes our masking impractical. It does
30345 // have vandps but that is an FP instruction and crossing FP<->int typically
30346 // has some cost.
30347 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30348 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30349 Profitable = false;
30350 }
30351 unsigned WideNumElts = AmtWideElts.size();
30352 // We are only dealing with identical pairs.
30353 if (Profitable && WideNumElts != NumElts) {
30354 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30355 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30356 // Cast the operand to vXiM.
30357 SDValue RWide = DAG.getBitcast(WideVT, R);
30358 // Create our new vector of shift amounts.
30359 SDValue AmtWide = DAG.getBuildVector(
30360 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30361 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30362 // Perform the actual shift.
30363 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30364 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30365 // Now we need to construct a mask which will "drop" bits that get
30366 // shifted past the LSB/MSB. For a logical shift left, it will look
30367 // like:
30368 // FullMask = (1 << EltSizeInBits) - 1
30369 // Mask = FullMask << Amt
30370 //
30371 // This masking ensures that bits cannot migrate from one narrow lane to
30372 // another. The construction of this mask will be constant folded.
30373 // The mask for a logical right shift is nearly identical, the only
30374 // difference is that the all ones mask is shifted right instead of left.
30375 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30376 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30377 Mask = DAG.getBitcast(WideVT, Mask);
30378 // Finally, we mask the shifted vector with the SWAR mask.
30379 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30380 Masked = DAG.getBitcast(VT, Masked);
30381 if (Opc != ISD::SRA) {
30382 // Logical shifts are complete at this point.
30383 return Masked;
30384 }
30385 // At this point, we have done a *logical* shift right. We now need to
30386 // sign extend the result so that we get behavior equivalent to an
30387 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30388 // are `EltSizeInBits-AmtWide` bits wide.
30389 //
30390 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30391 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30392 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30393 // can use the following trick to accomplish this:
30394 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30395 // (Masked ^ SignBitMask) - SignBitMask
30396 //
30397 // When the sign bit is already clear, this will compute:
30398 // Masked + SignBitMask - SignBitMask
30399 //
30400 // This is equal to Masked which is what we want: the sign bit was clear
30401 // so sign extending should be a no-op.
30402 //
30403 // When the sign bit is set, this will compute:
30404 // Masked - SignBitmask - SignBitMask
30405 //
30406 // This is equal to Masked - 2*SignBitMask which will correctly sign
30407 // extend our result.
30408 SDValue SplatHighBit =
30409 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30410 // This does not induce recursion, all operands are constants.
30411 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30412 SDValue FlippedSignBit =
30413 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30414 SDValue Subtraction =
30415 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30416 return Subtraction;
30417 }
30418 }
30419
30420 // If possible, lower this packed shift into a vector multiply instead of
30421 // expanding it into a sequence of scalar shifts.
30422 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30423 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30424 Subtarget.canExtendTo512BW())))
30425 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30426 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30427
30428 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30429 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30430 if (Opc == ISD::SRL && ConstantAmt &&
30431 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30432 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30433 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30434 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30435 SDValue Zero = DAG.getConstant(0, dl, VT);
30436 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30437 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30438 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30439 }
30440 }
30441
30442 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30443 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30444 // TODO: Special case handling for shift by 0/1, really we can afford either
30445 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30446 if (Opc == ISD::SRA && ConstantAmt &&
30447 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30448 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30449 !Subtarget.hasAVX512()) ||
30450 DAG.isKnownNeverZero(Amt))) {
30451 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30452 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30453 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30454 SDValue Amt0 =
30455 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30456 SDValue Amt1 =
30457 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30458 SDValue Sra1 =
30459 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30460 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30461 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30462 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30463 }
30464 }
30465
30466 // v4i32 Non Uniform Shifts.
30467 // If the shift amount is constant we can shift each lane using the SSE2
30468 // immediate shifts, else we need to zero-extend each lane to the lower i64
30469 // and shift using the SSE2 variable shifts.
30470 // The separate results can then be blended together.
30471 if (VT == MVT::v4i32) {
30472 SDValue Amt0, Amt1, Amt2, Amt3;
30473 if (ConstantAmt) {
30474 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30475 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30476 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30477 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30478 } else {
30479 // The SSE2 shifts use the lower i64 as the same shift amount for
30480 // all lanes and the upper i64 is ignored. On AVX we're better off
30481 // just zero-extending, but for SSE just duplicating the top 16-bits is
30482 // cheaper and has the same effect for out of range values.
30483 if (Subtarget.hasAVX()) {
30484 SDValue Z = DAG.getConstant(0, dl, VT);
30485 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30486 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30487 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30488 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30489 } else {
30490 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30491 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30492 {4, 5, 6, 7, -1, -1, -1, -1});
30493 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30494 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30495 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30496 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30497 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30498 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30499 }
30500 }
30501
30502 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30503 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30504 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30505 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30506 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30507
30508 // Merge the shifted lane results optimally with/without PBLENDW.
30509 // TODO - ideally shuffle combining would handle this.
30510 if (Subtarget.hasSSE41()) {
30511 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30512 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30513 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30514 }
30515 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30516 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30517 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30518 }
30519
30520 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30521 // look up the pre-computed shift values.
30522 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30523 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30524 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30525 unsigned NumLanes = VT.getSizeInBits() / 128u;
30526 unsigned NumEltsPerLane = NumElts / NumLanes;
30528 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30529 unsigned LoElt = Lane * NumEltsPerLane;
30530 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30531 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30532 if (!KnownLane.isConstant())
30533 break;
30534 const APInt &LaneSplat = KnownLane.getConstant();
30535 for (unsigned I = 0; I != 8; ++I) {
30536 if (Opc == ISD::SHL)
30537 LUT.push_back(LaneSplat.shl(I));
30538 else if (Opc == ISD::SRL)
30539 LUT.push_back(LaneSplat.lshr(I));
30540 else if (Opc == ISD::SRA)
30541 LUT.push_back(LaneSplat.ashr(I));
30542 }
30543 LUT.append(8, APInt::getZero(8));
30544 }
30545 if (LUT.size() == NumElts) {
30546 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30547 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30548 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30549 }
30550 }
30551
30552 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30553 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30554 // make the existing SSE solution better.
30555 // NOTE: We honor prefered vector width before promoting to 512-bits.
30556 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30557 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30558 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30559 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30560 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30561 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30562 "Unexpected vector type");
30563 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30564 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30565 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30566 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30567 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30568 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30569 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30570 }
30571
30572 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30573 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30574 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30575 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30576 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30577 !Subtarget.hasXOP()) {
30578 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30579 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30580
30581 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30582 // isn't legal).
30583 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30584 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30585 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30586 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30588 "Constant build vector expected");
30589
30590 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30591 bool IsSigned = Opc == ISD::SRA;
30592 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30593 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30594 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30595 return DAG.getZExtOrTrunc(R, dl, VT);
30596 }
30597
30598 SmallVector<SDValue, 16> LoAmt, HiAmt;
30599 for (unsigned i = 0; i != NumElts; i += 16) {
30600 for (int j = 0; j != 8; ++j) {
30601 LoAmt.push_back(Amt.getOperand(i + j));
30602 HiAmt.push_back(Amt.getOperand(i + j + 8));
30603 }
30604 }
30605
30606 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30607 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30608
30609 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30610 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30611 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30612 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30613 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30614 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30615 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30616 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30617 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30618 }
30619
30620 if (VT == MVT::v16i8 ||
30621 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30622 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30623 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30624
30625 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30626 if (VT.is512BitVector()) {
30627 // On AVX512BW targets we make use of the fact that VSELECT lowers
30628 // to a masked blend which selects bytes based just on the sign bit
30629 // extracted to a mask.
30630 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30631 V0 = DAG.getBitcast(VT, V0);
30632 V1 = DAG.getBitcast(VT, V1);
30633 Sel = DAG.getBitcast(VT, Sel);
30634 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
30635 ISD::SETGT);
30636 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
30637 } else if (Subtarget.hasSSE41()) {
30638 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30639 // on the sign bit.
30640 V0 = DAG.getBitcast(VT, V0);
30641 V1 = DAG.getBitcast(VT, V1);
30642 Sel = DAG.getBitcast(VT, Sel);
30643 return DAG.getBitcast(SelVT,
30644 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
30645 }
30646 // On pre-SSE41 targets we test for the sign bit by comparing to
30647 // zero - a negative value will set all bits of the lanes to true
30648 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30649 SDValue Z = DAG.getConstant(0, dl, SelVT);
30650 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
30651 return DAG.getSelect(dl, SelVT, C, V0, V1);
30652 };
30653
30654 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30655 // We can safely do this using i16 shifts as we're only interested in
30656 // the 3 lower bits of each byte.
30657 Amt = DAG.getBitcast(ExtVT, Amt);
30658 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
30659 Amt = DAG.getBitcast(VT, Amt);
30660
30661 if (Opc == ISD::SHL || Opc == ISD::SRL) {
30662 // r = VSELECT(r, shift(r, 4), a);
30663 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
30664 R = SignBitSelect(VT, Amt, M, R);
30665
30666 // a += a
30667 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30668
30669 // r = VSELECT(r, shift(r, 2), a);
30670 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
30671 R = SignBitSelect(VT, Amt, M, R);
30672
30673 // a += a
30674 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30675
30676 // return VSELECT(r, shift(r, 1), a);
30677 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
30678 R = SignBitSelect(VT, Amt, M, R);
30679 return R;
30680 }
30681
30682 if (Opc == ISD::SRA) {
30683 // For SRA we need to unpack each byte to the higher byte of a i16 vector
30684 // so we can correctly sign extend. We don't care what happens to the
30685 // lower byte.
30686 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30687 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30688 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
30689 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
30690 ALo = DAG.getBitcast(ExtVT, ALo);
30691 AHi = DAG.getBitcast(ExtVT, AHi);
30692 RLo = DAG.getBitcast(ExtVT, RLo);
30693 RHi = DAG.getBitcast(ExtVT, RHi);
30694
30695 // r = VSELECT(r, shift(r, 4), a);
30696 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
30697 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
30698 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30699 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30700
30701 // a += a
30702 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30703 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30704
30705 // r = VSELECT(r, shift(r, 2), a);
30706 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
30707 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
30708 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30709 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30710
30711 // a += a
30712 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30713 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30714
30715 // r = VSELECT(r, shift(r, 1), a);
30716 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
30717 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
30718 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30719 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30720
30721 // Logical shift the result back to the lower byte, leaving a zero upper
30722 // byte meaning that we can safely pack with PACKUSWB.
30723 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
30724 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
30725 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
30726 }
30727 }
30728
30729 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
30730 MVT ExtVT = MVT::v8i32;
30731 SDValue Z = DAG.getConstant(0, dl, VT);
30732 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
30733 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
30734 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
30735 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
30736 ALo = DAG.getBitcast(ExtVT, ALo);
30737 AHi = DAG.getBitcast(ExtVT, AHi);
30738 RLo = DAG.getBitcast(ExtVT, RLo);
30739 RHi = DAG.getBitcast(ExtVT, RHi);
30740 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
30741 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
30742 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
30743 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
30744 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30745 }
30746
30747 if (VT == MVT::v8i16) {
30748 // If we have a constant shift amount, the non-SSE41 path is best as
30749 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
30750 bool UseSSE41 = Subtarget.hasSSE41() &&
30752
30753 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
30754 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
30755 // the sign bit.
30756 if (UseSSE41) {
30757 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
30758 V0 = DAG.getBitcast(ExtVT, V0);
30759 V1 = DAG.getBitcast(ExtVT, V1);
30760 Sel = DAG.getBitcast(ExtVT, Sel);
30761 return DAG.getBitcast(
30762 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
30763 }
30764 // On pre-SSE41 targets we splat the sign bit - a negative value will
30765 // set all bits of the lanes to true and VSELECT uses that in
30766 // its OR(AND(V0,C),AND(V1,~C)) lowering.
30767 SDValue C =
30768 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
30769 return DAG.getSelect(dl, VT, C, V0, V1);
30770 };
30771
30772 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
30773 if (UseSSE41) {
30774 // On SSE41 targets we need to replicate the shift mask in both
30775 // bytes for PBLENDVB.
30776 Amt = DAG.getNode(
30777 ISD::OR, dl, VT,
30778 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
30779 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
30780 } else {
30781 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
30782 }
30783
30784 // r = VSELECT(r, shift(r, 8), a);
30785 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
30786 R = SignBitSelect(Amt, M, R);
30787
30788 // a += a
30789 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30790
30791 // r = VSELECT(r, shift(r, 4), a);
30792 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
30793 R = SignBitSelect(Amt, M, R);
30794
30795 // a += a
30796 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30797
30798 // r = VSELECT(r, shift(r, 2), a);
30799 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
30800 R = SignBitSelect(Amt, M, R);
30801
30802 // a += a
30803 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30804
30805 // return VSELECT(r, shift(r, 1), a);
30806 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
30807 R = SignBitSelect(Amt, M, R);
30808 return R;
30809 }
30810
30811 // Decompose 256-bit shifts into 128-bit shifts.
30812 if (VT.is256BitVector())
30813 return splitVectorIntBinary(Op, DAG, dl);
30814
30815 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30816 return splitVectorIntBinary(Op, DAG, dl);
30817
30818 return SDValue();
30819}
30820
// Custom lowering for ISD::FSHL / ISD::FSHR (funnel shifts) on x86, covering
// vector types (VBMI2 VSHLD/VSHRD, constant-splat shift+OR expansion,
// unpack/pack widening tricks, wide-element promotion) and scalar
// i8/i16/i32/i64 (SHLD/SHRD or manual bit-twiddling when SHLD is slow).
// NOTE(review): the opening signature line (doc line 30821, presumably
// "static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,")
// was dropped by the page extraction -- restore from upstream before compiling.
30822                              SelectionDAG &DAG) {
30823   MVT VT = Op.getSimpleValueType();
30824   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
30825          "Unexpected funnel shift opcode!");
30826
30827   SDLoc DL(Op);
30828   SDValue Op0 = Op.getOperand(0);
30829   SDValue Op1 = Op.getOperand(1);
30830   SDValue Amt = Op.getOperand(2);
30831   unsigned EltSizeInBits = VT.getScalarSizeInBits();
30832   bool IsFSHR = Op.getOpcode() == ISD::FSHR;
30833
30834   if (VT.isVector()) {
30835     APInt APIntShiftAmt;
30836     bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30837     unsigned NumElts = VT.getVectorNumElements();
30838
// VBMI2 provides native funnel-shift instructions for 16/32/64-bit elements.
30839     if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
30840       if (IsFSHR)
30841         std::swap(Op0, Op1);
30842
30843       if (IsCstSplat) {
30844         uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30845         SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
30846         return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
30847                              {Op0, Op1, Imm}, DAG, Subtarget);
30848       }
30849       return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
30850                            {Op0, Op1, Amt}, DAG, Subtarget);
30851     }
30852     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30853             VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
30854             VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
30855            "Unexpected funnel shift type!");
30856
30857     // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
30858     // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
30859     if (IsCstSplat) {
30860       // TODO: Can't use generic expansion as UNDEF amt elements can be
30861       // converted to other values when folded to shift amounts, losing the
30862       // splat.
30863       uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30864       uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
30865       uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
30866       assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
30867       MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30868
30869       if (EltSizeInBits == 8 &&
30870           (Subtarget.hasXOP() ||
30871            (useVPTERNLOG(Subtarget, VT) &&
30872             supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
30873         // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
30874         // bit-select - lower using vXi16 shifts and then perform the bitmask at
30875         // the original vector width to handle cases where we split.
30876         APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
30877         APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
30878         SDValue ShX =
30879             DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
30880                         DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
30881         SDValue ShY =
30882             DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
30883                         DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
30884         ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
30885                           DAG.getConstant(MaskX, DL, VT));
30886         ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
30887                           DAG.getConstant(MaskY, DL, VT));
30888         return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30889       }
30890
// General constant-splat case: fsh(x,y,z) == (x << ShX) | (y >> ShY).
30891       SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
30892                                 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
30893       SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
30894                                 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
30895       return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30896     }
30897
30898     SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30899     SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30900     bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
30901
30902     // Constant vXi16 funnel shifts can be efficiently handled by default.
30903     if (IsCst && EltSizeInBits == 16)
30904       return SDValue();
30905
30906     unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
30907     MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30908     MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30909
30910     // Split 256-bit integers on XOP/pre-AVX2 targets.
30911     // Split 512-bit integers on non 512-bit BWI targets.
30912     if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
30913                                  !Subtarget.hasAVX2())) ||
30914         (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
30915          EltSizeInBits < 32)) {
30916       // Pre-mask the amount modulo using the wider vector.
30917       Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
30918       return splitVectorOp(Op, DAG, DL);
30919     }
30920
30921     // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
30922     if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
30923       int ScalarAmtIdx = -1;
30924       if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
30925         // Uniform vXi16 funnel shifts can be efficiently handled by default.
30926         if (EltSizeInBits == 16)
30927           return SDValue();
30928
30929         SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30930         SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30931         Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
30932                                  ScalarAmtIdx, Subtarget, DAG);
30933         Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
30934                                  ScalarAmtIdx, Subtarget, DAG);
30935         return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30936       }
30937     }
30938
30939     MVT WideSVT = MVT::getIntegerVT(
30940         std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
30941     MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
30942
30943     // If per-element shifts are legal, fallback to generic expansion.
30944     if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
30945       return SDValue();
30946
30947     // Attempt to fold as:
30948     // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30949     // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30950     if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30951         supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30952       Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
30953       Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
30954       AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30955       Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
30956                                        EltSizeInBits, DAG);
30957       SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
30958       Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
30959       if (!IsFSHR)
30960         Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
30961                                          EltSizeInBits, DAG);
30962       return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
30963     }
30964
30965     // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30966     if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
30967         supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30968       SDValue Z = DAG.getConstant(0, DL, VT);
30969       SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30970       SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30971       SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30972       SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30973       SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30974       SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30975       return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30976     }
30977
30978     // Fallback to generic expansion.
30979     return SDValue();
30980   }
// Scalar path below: only the four legal scalar integer widths are expected.
30981   assert(
30982       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
30983       "Unexpected funnel shift type!");
30984
30985   // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30986   bool OptForSize = DAG.shouldOptForSize();
30987   bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30988
30989   // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30990   // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30991   if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30992       !isa<ConstantSDNode>(Amt)) {
30993     SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30994     SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30995     Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30996     Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30997     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30998     SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30999     Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31000     if (IsFSHR) {
31001       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31002     } else {
31003       Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31004       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31005     }
31006     return DAG.getZExtOrTrunc(Res, DL, VT);
31007   }
31008
31009   if (VT == MVT::i8 || ExpandFunnel)
31010     return SDValue();
31011
31012   // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31013   if (VT == MVT::i16) {
31014     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31015                       DAG.getConstant(15, DL, Amt.getValueType()));
31016     unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31017     return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31018   }
31019
// i32/i64: keep the node as-is; it selects directly to SHLD/SHRD.
31020   return Op;
31021}
31022
// Custom lowering for vector ISD::ROTL/ISD::ROTR. Strategies are tried in
// order: AVX512 VPROL/VPROR(V), VBMI2 funnel shifts, GFNI gf2p8affine for
// constant vXi8 rotates, XOP VPROT, shift+OR expansion of constant splats,
// unpack/pack widening, per-bit VSELECT staging for vXi8, and finally a
// multiply-based expansion (PMULLW/PMULHUW or PMULUDQ).
31023 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31024                            SelectionDAG &DAG) {
31025   MVT VT = Op.getSimpleValueType();
31026   assert(VT.isVector() && "Custom lowering only for vector rotates!");
31027
31028   SDLoc DL(Op);
31029   SDValue R = Op.getOperand(0);
31030   SDValue Amt = Op.getOperand(1);
31031   unsigned Opcode = Op.getOpcode();
31032   unsigned EltSizeInBits = VT.getScalarSizeInBits();
31033   int NumElts = VT.getVectorNumElements();
31034   bool IsROTL = Opcode == ISD::ROTL;
31035
31036   // Check for constant splat rotation amount.
31037   APInt CstSplatValue;
31038   bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31039
31040   // Check for splat rotate by zero.
31041   if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31042     return R;
31043
31044   // AVX512 implicitly uses modulo rotation amounts.
31045   if ((Subtarget.hasVLX() ||
31046        (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
31047       32 <= EltSizeInBits) {
31048     // Attempt to rotate by immediate.
31049     if (IsCstSplat) {
31050       unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31051       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31052       return DAG.getNode(RotOpc, DL, VT, R,
31053                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31054     }
31055
31056     // Else, fall-back on VPROLV/VPRORV.
31057     return Op;
31058   }
31059
31060   // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31061   if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31062     unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31063     return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31064   }
31065
31066   SDValue Z = DAG.getConstant(0, DL, VT);
31067
31068   if (!IsROTL) {
31069     // If the ISD::ROTR amount is constant, we're always better converting to
31070     // ISD::ROTL.
31071     if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31072       return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31073
31074     // XOP targets always prefers ISD::ROTL.
31075     if (Subtarget.hasXOP())
31076       return DAG.getNode(ISD::ROTL, DL, VT, R,
31077                          DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31078   }
31079
31080   // Attempt to use GFNI gf2p8affine to rotate vXi8 by an uniform constant.
31081   if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
// NOTE(review): the continuation of this condition (doc line 31082) was
// dropped by the page extraction -- upstream guards this with
// "DAG.getTargetLoweringInfo().isTypeLegal(VT)) {". Confirm against upstream.
31083     uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31084     SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31085     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31086                        DAG.getTargetConstant(0, DL, MVT::i8));
31087   }
31088
31089   // Split 256-bit integers on XOP/pre-AVX2 targets.
31090   if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31091     return splitVectorIntBinary(Op, DAG, DL);
31092
31093   // XOP has 128-bit vector variable + immediate rotates.
31094   // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31095   // XOP implicitly uses modulo rotation amounts.
31096   if (Subtarget.hasXOP()) {
31097     assert(IsROTL && "Only ROTL expected");
31098     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31099
31100     // Attempt to rotate by immediate.
31101     if (IsCstSplat) {
31102       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31103       return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31104                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31105     }
31106
31107     // Use general rotate by variable (per-element).
31108     return Op;
31109   }
31110
31111   // Rotate by an uniform constant - expand back to shifts.
31112   // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31113   // to other values when folded to shift amounts, losing the splat.
31114   if (IsCstSplat) {
31115     uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31116     uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31117     uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31118     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31119                               DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31120     SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31121                               DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31122     return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31123   }
31124
31125   // Split 512-bit integers on non 512-bit BWI targets.
31126   if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31127     return splitVectorIntBinary(Op, DAG, DL);
31128
31129   assert(
31130       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31131        ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31132         Subtarget.hasAVX2()) ||
31133        ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31134       "Only vXi32/vXi16/vXi8 vector rotates supported");
31135
31136   MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31137   MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31138
31139   SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31140   SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31141
31142   // Attempt to fold as unpack(x,x) << zext(splat(y)):
31143   // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31144   // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31145   if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31146     int BaseRotAmtIdx = -1;
31147     if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31148       if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31149         unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31150         return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31151       }
31152       unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31153       SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31154       SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31155       Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31156                                BaseRotAmtIdx, Subtarget, DAG);
31157       Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31158                                BaseRotAmtIdx, Subtarget, DAG);
31159       return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31160     }
31161   }
31162
31163   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31164   unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31165
31166   // Attempt to fold as unpack(x,x) << zext(y):
31167   // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31168   // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31169   // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31170   if (!(ConstantAmt && EltSizeInBits != 8) &&
31171       !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31172       (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31173     SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31174     SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31175     SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31176     SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31177     SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31178     SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31179     return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31180   }
31181
31182   // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31183   // the amount bit.
31184   // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31185   if (EltSizeInBits == 8) {
31186     MVT WideVT =
31187         MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31188
31189     // Attempt to fold as:
31190     // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31191     // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31192     if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31193         supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31194       // If we're rotating by constant, just use default promotion.
31195       if (ConstantAmt)
31196         return SDValue();
31197       // See if we can perform this by widening to vXi16 or vXi32.
31198       R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31199       R = DAG.getNode(
31200           ISD::OR, DL, WideVT, R,
31201           getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31202       Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31203       R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31204       if (IsROTL)
31205         R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31206       return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31207     }
31208
31209     // We don't need ModuloAmt here as we just peek at individual bits.
31210     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31211       if (Subtarget.hasSSE41()) {
31212         // On SSE41 targets we can use PBLENDVB which selects bytes based just
31213         // on the sign bit.
31214         V0 = DAG.getBitcast(VT, V0);
31215         V1 = DAG.getBitcast(VT, V1);
31216         Sel = DAG.getBitcast(VT, Sel);
31217         return DAG.getBitcast(SelVT,
31218                               DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31219       }
31220       // On pre-SSE41 targets we test for the sign bit by comparing to
31221       // zero - a negative value will set all bits of the lanes to true
31222       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31223       SDValue Z = DAG.getConstant(0, DL, SelVT);
31224       SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31225       return DAG.getSelect(DL, SelVT, C, V0, V1);
31226     };
31227
31228     // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31229     if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31230       Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31231       IsROTL = true;
31232     }
31233
// A rotate is a left shift OR'd with the complementary right shift; the stage
// widths below (4/4, 2/6, 1/7) always sum to 8 bits per byte lane.
31234     unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31235     unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31236
31237     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31238     // We can safely do this using i16 shifts as we're only interested in
31239     // the 3 lower bits of each byte.
31240     Amt = DAG.getBitcast(ExtVT, Amt);
31241     Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31242     Amt = DAG.getBitcast(VT, Amt);
31243
31244     // r = VSELECT(r, rot(r, 4), a);
31245     SDValue M;
31246     M = DAG.getNode(
31247         ISD::OR, DL, VT,
31248         DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31249         DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31250     R = SignBitSelect(VT, Amt, M, R);
31251
31252     // a += a
31253     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31254
31255     // r = VSELECT(r, rot(r, 2), a);
31256     M = DAG.getNode(
31257         ISD::OR, DL, VT,
31258         DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31259         DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31260     R = SignBitSelect(VT, Amt, M, R);
31261
31262     // a += a
31263     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31264
31265     // return VSELECT(r, rot(r, 1), a);
31266     M = DAG.getNode(
31267         ISD::OR, DL, VT,
31268         DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31269         DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31270     return SignBitSelect(VT, Amt, M, R);
31271   }
31272
31273   bool IsSplatAmt = DAG.isSplatValue(Amt);
31274   bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31275                         supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31276
31277   // Fallback for splats + all supported variable shifts.
31278   // Fallback for non-constants AVX2 vXi16 as well.
31279   if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31280     Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31281     SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31282     AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31283     SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31284     SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31285     return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31286   }
31287
31288   // Everything below assumes ISD::ROTL.
31289   if (!IsROTL) {
31290     Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31291     IsROTL = true;
31292   }
31293
31294   // ISD::ROT* uses modulo rotate amounts.
31295   Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31296
31297   assert(IsROTL && "Only ROTL supported");
31298
31299   // As with shifts, attempt to convert the rotation amount to a multiplication
31300   // factor, fallback to general expansion.
31301   SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31302   if (!Scale)
31303     return SDValue();
31304
31305   // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31306   if (EltSizeInBits == 16) {
31307     SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31308     SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31309     return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31310   }
31311
31312   // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31313   // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31314   // that can then be OR'd with the lower 32-bits.
31315   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31316   static const int OddMask[] = {1, 1, 3, 3};
31317   SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31318   SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31319
31320   SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31321                               DAG.getBitcast(MVT::v2i64, R),
31322                               DAG.getBitcast(MVT::v2i64, Scale));
31323   SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31324                               DAG.getBitcast(MVT::v2i64, R13),
31325                               DAG.getBitcast(MVT::v2i64, Scale13));
31326   Res02 = DAG.getBitcast(VT, Res02);
31327   Res13 = DAG.getBitcast(VT, Res13);
31328
// Interleave the even/odd lane products back together: low halves give the
// shifted bits, high halves give the wrapped-around bits.
31329   return DAG.getNode(ISD::OR, DL, VT,
31330                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31331                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31332}
31333
31334/// Returns true if the operand type is exactly twice the native width, and
31335/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31336/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31337/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31338bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31339 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31340
31341 if (OpWidth == 64)
31342 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31343 if (OpWidth == 128)
31344 return Subtarget.canUseCMPXCHG16B();
31345
31346 return false;
31347}
31348
// Decide how AtomicExpandPass should handle an atomic store: no expansion when
// a single wide FP/SSE move is atomic, otherwise cmpxchg-loop expansion for
// double-native-width stores.
// NOTE(review): the page extraction dropped several lines of this function:
// the return-type line "TargetLowering::AtomicExpansionKind" (doc line 31349),
// the two "return AtomicExpansionKind::None;" results of the special cases
// (doc lines 31357 and 31361), and the trailing ": AtomicExpansionKind::None;"
// of the final conditional (doc line 31365). Restore from upstream.
31350 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31351   Type *MemType = SI->getValueOperand()->getType();
31352
31353   if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31354       !Subtarget.useSoftFloat()) {
31355     if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31356         (Subtarget.hasSSE1() || Subtarget.hasX87()))
31358
31359     if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31360         Subtarget.hasAVX())
31362   }
31363
31364   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31366}
31367
// Decide how AtomicExpandPass should handle an atomic load; loads that are
// twice the native width are turned into lock cmpxchg8b/16b loops.
// NOTE(review): the page extraction dropped the return-type line
// "TargetLowering::AtomicExpansionKind" (doc line 31369), the two
// "return AtomicExpansionKind::None;" results of the special cases (doc lines
// 31380 and 31385), and the trailing ": AtomicExpansionKind::None;" of the
// final conditional (doc line 31389). Restore from upstream.
31368 // Note: this turns large loads into lock cmpxchg8b/16b.
31370 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31371   Type *MemType = LI->getType();
31372
31373   if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31374       !Subtarget.useSoftFloat()) {
31375     // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
31376     // can use movq to do the load. If we have X87 we can load into an 80-bit
31377     // X87 register and store it to a stack temporary.
31378     if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31379         (Subtarget.hasSSE1() || Subtarget.hasX87()))
31381
31382     // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31383     if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31384         Subtarget.hasAVX())
31386   }
31387
31388   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31390}
31391
// Classifies how an atomicrmw operand encodes a single-bit change; used by
// FindSingleBitChange below to select BTS/BTR/BTC lowering.
// NOTE(review): the enumerator lines and closing "};" (doc lines 31393-31398:
// UndefBit, ConstantBit, NotConstantBit, ShiftBit, NotShiftBit) were dropped
// by the page extraction -- restore from upstream.
31392 enum BitTestKind : unsigned {
31399
// Classify \p V as a single-bit-change operand for an atomicrmw: returns the
// bit-index Value (for shift patterns) or the original constant, paired with a
// BitTestKind saying whether V is a power-of-2 constant, an inverted
// power-of-2 constant, a "1 << X" shift, or its NOT -- so the caller can form
// a BTS/BTR/BTC bit-test instruction. Returns {nullptr, UndefBit} when no
// usable pattern is found.
31400 static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31401   using namespace llvm::PatternMatch;
31402   BitTestKind BTK = UndefBit;
31403   if (auto *C = dyn_cast<ConstantInt>(V)) {
31404     // Check if V is a power of 2 or NOT power of 2.
31405     if (isPowerOf2_64(C->getZExtValue()))
31406       BTK = ConstantBit;
31407     else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31408       BTK = NotConstantBit;
31409     return {V, BTK};
31410   }
31411
31412   // Check if V is some power of 2 pattern known to be non-zero
31413   if (auto *I = dyn_cast<Instruction>(V)) {
31414     bool Not = false;
31415     // Check if we have a NOT
31416     Value *PeekI;
// Matches both xor-with-allones and (-1 - X) forms of NOT.
31417     if (match(I, m_Not(m_Value(PeekI))) ||
31418         match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31419       Not = true;
31420       I = dyn_cast<Instruction>(PeekI);
31421
31422       // If I is constant, it will fold and we can evaluate later. If its an
31423       // argument or something of that nature, we can't analyze.
31424       if (I == nullptr)
31425         return {nullptr, UndefBit};
31426     }
31427     // We can only use 1 << X without more sophisticated analysis. C << X where
31428     // C is a power of 2 but not 1 can result in zero which cannot be translated
31429     // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31430     if (I->getOpcode() == Instruction::Shl) {
31431       // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31432       // -X` and some other provable power of 2 patterns that we can use CTZ on
31433       // may be profitable.
31434       // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31435       // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31436       // be provably a non-zero power of 2.
31437       // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31438       // transformable to bittest.
31439       auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31440       if (!ShiftVal)
31441         return {nullptr, UndefBit};
31442       if (ShiftVal->equalsInt(1))
31443         BTK = Not ? NotShiftBit : ShiftBit;
31444
31445       if (BTK == UndefBit)
31446         return {nullptr, UndefBit};
31447
31448       Value *BitV = I->getOperand(1);
31449
31450       // Read past a shiftmask instruction to find count
// BT* instructions implicitly mask the bit index, so an explicit
// "X & (bitwidth-1)" on the shift amount is redundant and can be skipped.
31451       Value *AndOp;
31452       uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31453       if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31454         BitV = AndOp;
31455
31456       return {BitV, BTK};
31457     }
31458   }
31459   return {nullptr, UndefBit};
31460}
31461
// Decide how an atomicrmw OR/AND/XOR should be expanded: result-unused RMWs
// can become a plain LOCK'ed instruction, XOR with the sign mask can become
// XADD, and a result consumed by a single-bit AND can use BTS/BTR/BTC.
// NOTE(review): the return-type line and several `return AtomicExpansionKind
// ...` result lines were lost in extraction — verify against upstream.
31463X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31464 using namespace llvm::PatternMatch;
31465 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31466 // prefix to a normal instruction for these operations.
31467 if (AI->use_empty())
31469
31470 if (AI->getOperation() == AtomicRMWInst::Xor) {
31471 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31472 // preferable to both `cmpxchg` and `btc`.
31473 if (match(AI->getOperand(1), m_SignMask()))
31475 }
31476
31477 // If the atomicrmw's result is used by a single bit AND, we may use
31478 // bts/btr/btc instruction for these operations.
31479 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31480 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31481 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31482 // detect it.
31483 Instruction *I = AI->user_back();
31484 auto BitChange = FindSingleBitChange(AI->getValOperand());
// Bail out unless the RMW's single user is an AND in the same basic block
// and the access is wider than 8 bits (there is no 8-bit BT form).
31485 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31486 I->getOpcode() != Instruction::And ||
31487 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31488 AI->getParent() != I->getParent())
31490
// Index of the AND operand that is not the atomicrmw result.
31491 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31492
31493 // This is a redundant AND, it should get cleaned up elsewhere.
31494 if (AI == I->getOperand(OtherIdx))
31496
31497 // The following instruction must be a AND single bit.
31498 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31499 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31500 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31501 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31503 }
31504 if (AI->getOperation() == AtomicRMWInst::And) {
31505 return ~C1->getValue() == C2->getValue()
31508 }
31511 }
31512
31513 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31514
// Both the changed bit and the tested bit must come from shift patterns.
31515 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31516 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31518
31519 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31520
31521 // If shift amounts are not the same we can't use BitTestIntrinsic.
31522 if (BitChange.first != BitTested.first)
31524
31525 // If atomic AND need to be masking all be one bit and testing the one bit
31526 // unset in the mask.
31527 if (AI->getOperation() == AtomicRMWInst::And)
31528 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31531
31532 // If atomic XOR/OR need to be setting and testing the same bit.
31533 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31536}
31537
// Replace an atomicrmw whose result feeds a single-bit AND with the x86
// bts/btr/btc intrinsics: the immediate (_C) form for constant masks, the
// register (_rm) form for `1 << X` masks; then rewrite the AND's users.
// NOTE(review): the IID_C/IID_I declarations (31541-31542) and the
// pointer-cast destination-type argument (31562) were lost in extraction —
// verify against upstream.
31538void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31539 IRBuilder<> Builder(AI);
31540 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
// Pick the constant-immediate (IID_C) / register (IID_I) intrinsic pair
// matching the RMW operation: Or->bts, Xor->btc, And->btr.
31543 switch (AI->getOperation()) {
31544 default:
31545 llvm_unreachable("Unknown atomic operation");
31546 case AtomicRMWInst::Or:
31547 IID_C = Intrinsic::x86_atomic_bts;
31548 IID_I = Intrinsic::x86_atomic_bts_rm;
31549 break;
31550 case AtomicRMWInst::Xor:
31551 IID_C = Intrinsic::x86_atomic_btc;
31552 IID_I = Intrinsic::x86_atomic_btc_rm;
31553 break;
31554 case AtomicRMWInst::And:
31555 IID_C = Intrinsic::x86_atomic_btr;
31556 IID_I = Intrinsic::x86_atomic_btr_rm;
31557 break;
31558 }
31559 Instruction *I = AI->user_back();
31560 LLVMContext &Ctx = AI->getContext();
31561 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31563 Value *Result = nullptr;
31564 auto BitTested = FindSingleBitChange(AI->getValOperand());
31565 assert(BitTested.first != nullptr);
31566
31567 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
// Constant mask: take the bit index from the AND's other (power-of-2)
// operand and use the immediate form.
31568 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31569
31570 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31571 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31572 {Addr, Builder.getInt8(Imm)});
31573 } else {
31574 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31575
31576 Value *SI = BitTested.first;
31577 assert(SI != nullptr);
31578
31579 // BT{S|R|C} on memory operand don't modulo bit position so we need to
31580 // mask it.
31581 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31582 Value *BitPos =
31583 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31584 // Todo(1): In many cases it may be provable that SI is less than
31585 // ShiftBits in which case this mask is unnecessary
31586 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31587 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31588 // favor of just a raw BT{S|R|C}.
31589
31590 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31591 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31592
31593 // If the result is only used for zero/non-zero status then we don't need to
31594 // shift value back. Otherwise do so.
31595 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31596 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31597 if (ICmp->isEquality()) {
31598 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31599 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31600 if (C0 || C1) {
31601 assert(C0 == nullptr || C1 == nullptr);
31602 if ((C0 ? C0 : C1)->isZero())
31603 continue;
31604 }
31605 }
31606 }
// Some user inspects more than zero/non-zero: restore bit position.
31607 Result = Builder.CreateShl(Result, BitPos);
31608 break;
31609 }
31610 }
31611
31612 I->replaceAllUsesWith(Result);
31613 I->eraseFromParent();
31614 AI->eraseFromParent();
31615}
31616
// Body of a predicate returning true when the atomicrmw's single use is a
// compare pattern that can be lowered to a LOCK'ed arithmetic op plus a flag
// test (see emitCmpArithAtomicRMWIntrinsic below).
// NOTE(review): the function's signature line, the initialization of `Opc`,
// and several multi-line match() pattern arguments were lost in extraction —
// verify against upstream.
31618 using namespace llvm::PatternMatch;
31619 if (!AI->hasOneUse())
31620 return false;
31621
31622 Value *Op = AI->getOperand(1);
31623 CmpPredicate Pred;
31624 Instruction *I = AI->user_back();
// Each operation below accepts either a direct compare of the RMW result or
// a compare reached through one single-use arithmetic instruction.
31626 if (Opc == AtomicRMWInst::Add) {
31627 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31628 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31629 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31630 if (match(I->user_back(),
31632 return true;
31633 if (match(I->user_back(),
31635 return true;
31636 }
31637 return false;
31638 }
31639 if (Opc == AtomicRMWInst::Sub) {
31640 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31641 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31642 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
31643 if (match(I->user_back(),
31645 return true;
31646 if (match(I->user_back(),
31648 return true;
31649 }
31650 return false;
31651 }
31652 if ((Opc == AtomicRMWInst::Or &&
31654 (Opc == AtomicRMWInst::And &&
// OR/AND: also allow signed-less-than-zero (sign-flag) tests.
31656 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31657 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
31658 Pred == CmpInst::ICMP_SLT;
31659 if (match(I->user_back(),
31661 return true;
31662 return false;
31663 }
31664 if (Opc == AtomicRMWInst::Xor) {
31665 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31666 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31667 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
31668 if (match(I->user_back(),
31670 return true;
31671 if (match(I->user_back(),
31673 return true;
31674 }
31675 return false;
31676 }
31677
31678 return false;
31679}
31680
// Lower an atomicrmw whose result is consumed (possibly through one
// intermediate instruction) by an EQ/NE/SLT/SGT compare into the x86
// *_cc intrinsics, which perform the LOCK'ed op and return the flag.
// NOTE(review): the declarations of `CC` (31693) and `IID` (31711) and the
// pointer-cast destination-type argument (31732) were lost in extraction —
// verify against upstream.
31681void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
31682 AtomicRMWInst *AI) const {
31683 IRBuilder<> Builder(AI);
31684 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31685 Instruction *TempI = nullptr;
31686 LLVMContext &Ctx = AI->getContext();
// Find the compare: either the RMW's direct user or, failing that, the
// single user of that (single-use) intermediate instruction.
31687 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
31688 if (!ICI) {
31689 TempI = AI->user_back();
31690 assert(TempI->hasOneUse() && "Must have one use");
31691 ICI = cast<ICmpInst>(TempI->user_back());
31692 }
// Map the IR predicate onto the x86 condition code the intrinsic encodes.
31694 ICmpInst::Predicate Pred = ICI->getPredicate();
31695 switch (Pred) {
31696 default:
31697 llvm_unreachable("Not supported Pred");
31698 case CmpInst::ICMP_EQ:
31699 CC = X86::COND_E;
31700 break;
31701 case CmpInst::ICMP_NE:
31702 CC = X86::COND_NE;
31703 break;
31704 case CmpInst::ICMP_SLT:
31705 CC = X86::COND_S;
31706 break;
31707 case CmpInst::ICMP_SGT:
31708 CC = X86::COND_NS;
31709 break;
31710 }
31712 switch (AI->getOperation()) {
31713 default:
31714 llvm_unreachable("Unknown atomic operation");
31715 case AtomicRMWInst::Add:
31716 IID = Intrinsic::x86_atomic_add_cc;
31717 break;
31718 case AtomicRMWInst::Sub:
31719 IID = Intrinsic::x86_atomic_sub_cc;
31720 break;
31721 case AtomicRMWInst::Or:
31722 IID = Intrinsic::x86_atomic_or_cc;
31723 break;
31724 case AtomicRMWInst::And:
31725 IID = Intrinsic::x86_atomic_and_cc;
31726 break;
31727 case AtomicRMWInst::Xor:
31728 IID = Intrinsic::x86_atomic_xor_cc;
31729 break;
31730 }
31731 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31733 Value *Call = Builder.CreateIntrinsic(
31734 IID, AI->getType(),
31735 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
31736 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
31737 ICI->replaceAllUsesWith(Result);
31738 ICI->eraseFromParent();
31739 if (TempI)
31740 TempI->eraseFromParent();
31741 AI->eraseFromParent();
31742}
31743
// Top-level expansion policy for atomicrmw: accesses wider than the native
// register width go through cmpxchg8/16b (or libcalls); logic ops defer to
// shouldExpandLogicAtomicRMWInIR; everything not natively supported expands
// to a cmpxchg loop.
// NOTE(review): the return-type line, the `Op` initialization, several case
// labels and the returned AtomicExpansionKind values were lost in
// extraction — verify against upstream.
31745X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
31746 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31747 Type *MemType = AI->getType();
31748
31749 // If the operand is too big, we must see if cmpxchg8/16b is available
31750 // and default to library calls otherwise.
31751 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
31752 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31754 }
31755
31757 switch (Op) {
31760 case AtomicRMWInst::Add:
31761 case AtomicRMWInst::Sub:
31764 // It's better to use xadd, xsub or xchg for these in other cases.
31766 case AtomicRMWInst::Or:
31767 case AtomicRMWInst::And:
31768 case AtomicRMWInst::Xor:
31771 return shouldExpandLogicAtomicRMWInIR(AI);
31773 case AtomicRMWInst::Max:
31774 case AtomicRMWInst::Min:
31785 default:
31786 // These always require a non-trivial set of data operations on x86. We must
31787 // use a cmpxchg loop.
31789 }
31790}
31791
// Lower an idempotent atomicrmw (one that does not change memory) into
// MFENCE + an atomic load when legal and profitable; returns nullptr to
// decline and let the normal expansion run.
// NOTE(review): the line initializing `Order` (31816) was lost in
// extraction — verify against upstream.
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
31794 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31795 Type *MemType = AI->getType();
31796 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
31797 // there is no benefit in turning such RMWs into loads, and it is actually
31798 // harmful as it introduces a mfence.
31799 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
31800 return nullptr;
31801
31802 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
31803 // lowering available in lowerAtomicArith.
31804 // TODO: push more cases through this path.
31805 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
31806 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
31807 AI->use_empty())
31808 return nullptr;
31809
31810 IRBuilder<> Builder(AI);
31811 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31812 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
31813 auto SSID = AI->getSyncScopeID();
31814 // We must restrict the ordering to avoid generating loads with Release or
31815 // ReleaseAcquire orderings.
31817
31818 // Before the load we need a fence. Here is an example lifted from
31819 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
31820 // is required:
31821 // Thread 0:
31822 // x.store(1, relaxed);
31823 // r1 = y.fetch_add(0, release);
31824 // Thread 1:
31825 // y.fetch_add(42, acquire);
31826 // r2 = x.load(relaxed);
31827 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
31828 // lowered to just a load without a fence. A mfence flushes the store buffer,
31829 // making the optimization clearly correct.
31830 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
31831 // otherwise, we might be able to be more aggressive on relaxed idempotent
31832 // rmw. In practice, they do not look useful, so we don't try to be
31833 // especially clever.
31834 if (SSID == SyncScope::SingleThread)
31835 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
31836 // the IR level, so we must wrap it in an intrinsic.
31837 return nullptr;
31838
31839 if (!Subtarget.hasMFence())
31840 // FIXME: it might make sense to use a locked operation here but on a
31841 // different cache-line to prevent cache-line bouncing. In practice it
31842 // is probably a small win, and x86 processors without mfence are rare
31843 // enough that we do not bother.
31844 return nullptr;
31845
// Emit the fence followed by the atomic load that replaces the RMW.
31846 Function *MFence =
31847 llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence);
31848 Builder.CreateCall(MFence, {});
31849
31850 // Finally we can emit the atomic load.
31851 LoadInst *Loaded = Builder.CreateAlignedLoad(
31852 AI->getType(), AI->getPointerOperand(), AI->getAlign());
31853 Loaded->setAtomic(Order, SSID);
31854 AI->replaceAllUsesWith(Loaded);
31855 AI->eraseFromParent();
31856 return Loaded;
31857}
31858
/// Emit a locked operation on a stack location which does not change any
/// memory location, but does involve a lock prefix. Location is chosen to be
/// a) very likely accessed only by a single thread to minimize cache traffic,
/// and b) definitely dereferenceable. Returns the new Chain result.
// NOTE(review): the first line of the function signature (31863,
// presumably `static SDValue emitLockedStackOp(SelectionDAG &DAG,`) was
// lost in extraction — verify against upstream.
31864 const X86Subtarget &Subtarget, SDValue Chain,
31865 const SDLoc &DL) {
31866 // Implementation notes:
31867 // 1) LOCK prefix creates a full read/write reordering barrier for memory
31868 // operations issued by the current processor. As such, the location
31869 // referenced is not relevant for the ordering properties of the instruction.
31870 // See: Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
31871 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
31872 // 2) Using an immediate operand appears to be the best encoding choice
31873 // here since it doesn't require an extra register.
31874 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
31875 // is small enough it might just be measurement noise.)
31876 // 4) When choosing offsets, there are several contributing factors:
31877 // a) If there's no redzone, we default to TOS. (We could allocate a cache
31878 // line aligned stack object to improve this case.)
31879 // b) To minimize our chances of introducing a false dependence, we prefer
31880 // to offset the stack usage from TOS slightly.
31881 // c) To minimize concerns about cross thread stack usage - in particular,
31882 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
31883 // captures state in the TOS frame and accesses it from many threads -
31884 // we want to use an offset such that the offset is in a distinct cache
31885 // line from the TOS frame.
31886 //
31887 // For a general discussion of the tradeoffs and benchmark results, see:
31888 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
31889
31890 auto &MF = DAG.getMachineFunction();
31891 auto &TFL = *Subtarget.getFrameLowering();
// Offset into the redzone (when present) per note 4 above; otherwise TOS.
31892 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31893
// 64-bit: `lock or dword ptr [rsp + SPOffset], 0`.
31894 if (Subtarget.is64Bit()) {
31895 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31896 SDValue Ops[] = {
31897 DAG.getRegister(X86::RSP, MVT::i64), // Base
31898 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31899 DAG.getRegister(0, MVT::i64), // Index
31900 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31901 DAG.getRegister(0, MVT::i16), // Segment.
31902 Zero,
31903 Chain};
31904 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31905 MVT::Other, Ops);
31906 return SDValue(Res, 1);
31907 }
31908
// 32-bit: same operation based off ESP.
31909 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31910 SDValue Ops[] = {
31911 DAG.getRegister(X86::ESP, MVT::i32), // Base
31912 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31913 DAG.getRegister(0, MVT::i32), // Index
31914 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31915 DAG.getRegister(0, MVT::i16), // Segment.
31916 Zero,
31917 Chain
31918 };
31919 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31920 MVT::Other, Ops);
31921 return SDValue(Res, 1);
31922}
31923
// Lower ISD::ATOMIC_FENCE: only a sequentially-consistent, cross-thread
// fence needs an instruction (MFENCE, or a locked stack op when MFENCE is
// unavailable); all other fences become a compiler-only MEMBARRIER.
// NOTE(review): the first line of the function signature (31924) was lost
// in extraction — verify against upstream.
31925 SelectionDAG &DAG) {
31926 SDLoc dl(Op);
31927 AtomicOrdering FenceOrdering =
31928 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
31929 SyncScope::ID FenceSSID =
31930 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
31931
31932 // The only fence that needs an instruction is a sequentially-consistent
31933 // cross-thread fence.
31934 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
31935 FenceSSID == SyncScope::System) {
31936 if (Subtarget.hasMFence())
31937 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
31938
31939 SDValue Chain = Op.getOperand(0);
31940 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
31941 }
31942
31943 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31944 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
31945}
31946
// Lower CMP_SWAP through X86's LCMPXCHG machinery: the expected value is
// placed in the size-appropriate accumulator (AL/AX/EAX/RAX), the result is
// copied back out of the same register, and success is derived from EFLAGS
// via SETE.
// NOTE(review): the first line of the function signature (31947) and the
// getMemIntrinsicNode call line (31972) were lost in extraction — verify
// against upstream.
31948 SelectionDAG &DAG) {
31949 MVT T = Op.getSimpleValueType();
31950 SDLoc DL(Op);
31951 unsigned Reg = 0;
31952 unsigned size = 0;
// Choose the accumulator register and operand size from the value type.
31953 switch(T.SimpleTy) {
31954 default: llvm_unreachable("Invalid value type!");
31955 case MVT::i8: Reg = X86::AL; size = 1; break;
31956 case MVT::i16: Reg = X86::AX; size = 2; break;
31957 case MVT::i32: Reg = X86::EAX; size = 4; break;
31958 case MVT::i64:
31959 assert(Subtarget.is64Bit() && "Node not type legal!");
31960 Reg = X86::RAX; size = 8;
31961 break;
31962 }
31963 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
31964 Op.getOperand(2), SDValue());
31965 SDValue Ops[] = { cpIn.getValue(0),
31966 Op.getOperand(1),
31967 Op.getOperand(3),
31968 DAG.getTargetConstant(size, DL, MVT::i8),
31969 cpIn.getValue(1) };
31970 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31971 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
31973 Ops, T, MMO);
31974
31975 SDValue cpOut =
31976 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
// EFLAGS carries the compare result; COND_E means the exchange succeeded.
31977 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
31978 MVT::i32, cpOut.getValue(2));
31979 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
31980
31981 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
31982 cpOut, Success, EFLAGS.getValue(1));
31983}
31984
// Create MOVMSKB, taking into account whether we need to split for AVX1.
// v64i8 is handled by two recursive halves combined into an i64; v32i8
// without AVX2 integer support is split into two 128-bit MOVMSKs.
// NOTE(review): the first line of the function signature (31986) was lost
// in extraction — verify against upstream.
31987 const X86Subtarget &Subtarget) {
31988 MVT InVT = V.getSimpleValueType();
31989
31990 if (InVT == MVT::v64i8) {
31991 SDValue Lo, Hi;
31992 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31993 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
31994 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
31995 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
31996 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
31997 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
31998 DAG.getConstant(32, DL, MVT::i8));
31999 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32000 }
32001 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32002 SDValue Lo, Hi;
32003 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32004 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32005 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32006 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32007 DAG.getConstant(16, DL, MVT::i8));
32008 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32009 }
32010
32011 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32012}
32013
// Custom-lower the BITCAST cases X86 handles directly: i64 -> v64i1 on
// 32-bit BWI targets, mask-vector -> scalar via MOVMSK, and scalar/small
// vector -> f64/x86mmx through a v2i64/v2f64 round trip.
// NOTE(review): one line (32056, the widened vector type computation for
// the CONCAT_VECTORS below) was lost in extraction — verify against
// upstream.
32014static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32015 SelectionDAG &DAG) {
32016 SDValue Src = Op.getOperand(0);
32017 MVT SrcVT = Src.getSimpleValueType();
32018 MVT DstVT = Op.getSimpleValueType();
32019
32020 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32021 // half to v32i1 and concatenating the result.
32022 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32023 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32024 assert(Subtarget.hasBWI() && "Expected BWI target");
32025 SDLoc dl(Op);
32026 SDValue Lo, Hi;
32027 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32028 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32029 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32030 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32031 }
32032
32033 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32034 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32035 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32036 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32037 SDLoc DL(Op);
32038 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32039 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32040 return DAG.getZExtOrTrunc(V, DL, DstVT);
32041 }
32042
32043 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32044 SrcVT == MVT::i64) && "Unexpected VT!");
32045
32046 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32047 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32048 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32049 // This conversion needs to be expanded.
32050 return SDValue();
32051
32052 SDLoc dl(Op);
32053 if (SrcVT.isVector()) {
32054 // Widen the vector in input in the case of MVT::v2i32.
32055 // Example: from MVT::v2i32 to MVT::v4i32.
32057 SrcVT.getVectorNumElements() * 2);
32058 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32059 DAG.getUNDEF(SrcVT));
32060 } else {
32061 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32062 "Unexpected source type in LowerBITCAST");
32063 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32064 }
32065
// Funnel everything through a 128-bit v2i64/v2f64 value.
32066 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32067 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32068
32069 if (DstVT == MVT::x86mmx)
32070 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32071
32072 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32073 DAG.getVectorIdxConstant(0, dl));
32074}
32075
/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
// NOTE(review): the first line of the function signature (32082) was lost
// in extraction — verify against upstream.
32083 const X86Subtarget &Subtarget,
32084 SelectionDAG &DAG) {
32085 SDLoc DL(V);
32086 MVT ByteVecVT = V.getSimpleValueType();
32087 MVT EltVT = VT.getVectorElementType();
32088 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32089 "Expected value to have byte element type.");
32090 assert(EltVT != MVT::i8 &&
32091 "Horizontal byte sum only makes sense for wider elements!");
32092 unsigned VecSize = VT.getSizeInBits();
32093 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32094
32095 // PSADBW instruction horizontally add all bytes and leave the result in i64
32096 // chunks, thus directly computes the pop count for v2i64 and v4i64.
32097 if (EltVT == MVT::i64) {
32098 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32099 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32100 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32101 return DAG.getBitcast(VT, V);
32102 }
32103
32104 if (EltVT == MVT::i32) {
32105 // We unpack the low half and high half into i32s interleaved with zeros so
32106 // that we can use PSADBW to horizontally sum them. The most useful part of
32107 // this is that it lines up the results of two PSADBW instructions to be
32108 // two v2i64 vectors which concatenated are the 4 population counts. We can
32109 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32110 SDValue Zeros = DAG.getConstant(0, DL, VT);
32111 SDValue V32 = DAG.getBitcast(VT, V);
32112 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32113 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32114
32115 // Do the horizontal sums into two v2i64s.
32116 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32117 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32118 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32119 DAG.getBitcast(ByteVecVT, Low), Zeros);
32120 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32121 DAG.getBitcast(ByteVecVT, High), Zeros);
32122
32123 // Merge them together.
32124 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32125 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32126 DAG.getBitcast(ShortVecVT, Low),
32127 DAG.getBitcast(ShortVecVT, High));
32128
32129 return DAG.getBitcast(VT, V);
32130 }
32131
32132 // The only element type left is i16.
32133 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32134
32135 // To obtain pop count for each i16 element starting from the pop count for
32136 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32137 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32138 // directly supported.
32139 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32140 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32141 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32142 DAG.getBitcast(ByteVecVT, V));
32143 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32144}
32145
// vXi8 CTPOP via an in-register PSHUFB nibble lookup table (see the
// algorithm comment below).
// NOTE(review): the first line of the function signature (32146) and the
// declaration of `LUTVec` (32170) were lost in extraction — verify against
// upstream.
32147 const X86Subtarget &Subtarget,
32148 SelectionDAG &DAG) {
32149 MVT VT = Op.getSimpleValueType();
32150 MVT EltVT = VT.getVectorElementType();
32151 int NumElts = VT.getVectorNumElements();
32152 (void)EltVT;
32153 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32154
32155 // Implement a lookup table in register by using an algorithm based on:
32156 // http://wm.ite.pl/articles/sse-popcount.html
32157 //
32158 // The general idea is that every lower byte nibble in the input vector is an
32159 // index into a in-register pre-computed pop count table. We then split up the
32160 // input vector in two new ones: (1) a vector with only the shifted-right
32161 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32162 // masked out higher ones) for each byte. PSHUFB is used separately with both
32163 // to index the in-register table. Next, both are added and the result is a
32164 // i8 vector where each element contains the pop count for input byte.
32165 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32166 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32167 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32168 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32169
// Broadcast the 16-entry table across every 16-byte lane of the vector.
32171 for (int i = 0; i < NumElts; ++i)
32172 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32173 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32174 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32175
32176 // High nibbles
32177 SDValue FourV = DAG.getConstant(4, DL, VT);
32178 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32179
32180 // Low nibbles
32181 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32182
32183 // The input vector is used as the shuffle mask that index elements into the
32184 // LUT. After counting low and high nibbles, add the vector to obtain the
32185 // final pop count per i8 element.
32186 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32187 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32188 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32189}
32190
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
// Dispatcher for vector CTPOP: prefer native VPOPCNTDQ, split oversized
// vectors, reduce wide elements to vXi8 popcount + byte sums, and fall back
// to the PSHUFB LUT for vXi8.
// NOTE(review): the first line of the function signature (32193) was lost
// in extraction — verify against upstream.
32194 const X86Subtarget &Subtarget,
32195 SelectionDAG &DAG) {
32196 MVT VT = Op.getSimpleValueType();
32197 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32198 "Unknown CTPOP type to handle");
32199 SDValue Op0 = Op.getOperand(0);
32200
32201 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32202 if (Subtarget.hasVPOPCNTDQ()) {
32203 unsigned NumElems = VT.getVectorNumElements();
32204 assert((VT.getVectorElementType() == MVT::i8 ||
32205 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32206 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32207 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32208 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32209 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32210 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32211 }
32212 }
32213
32214 // Decompose 256-bit ops into smaller 128-bit ops.
32215 if (VT.is256BitVector() && !Subtarget.hasInt256())
32216 return splitVectorIntUnary(Op, DAG, DL);
32217
32218 // Decompose 512-bit ops into smaller 256-bit ops.
32219 if (VT.is512BitVector() && !Subtarget.hasBWI())
32220 return splitVectorIntUnary(Op, DAG, DL);
32221
32222 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32223 if (VT.getScalarType() != MVT::i8) {
32224 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32225 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32226 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32227 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32228 }
32229
32230 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32231 if (!Subtarget.hasSSSE3())
32232 return SDValue();
32233
32234 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32235}
32236
// Lower ISD::CTPOP. For scalar integers, known-bits analysis is used to
// shrink the problem: if only a few bits can be set (after shifting out known
// trailing zeros), small arithmetic/LUT tricks are emitted instead of the
// generic popcount expansion. Vectors are forwarded to the custom vector
// lowering.
static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = N.getSimpleValueType();
  SDValue Op = N.getOperand(0);
  SDLoc DL(N);

  if (VT.isScalarInteger()) {
    // Compute the lower/upper bounds of the active bits of the value,
    // allowing us to shift the active bits down if necessary to fit into the
    // special cases below.
    KnownBits Known = DAG.computeKnownBits(Op);
    // Fully-known value: fold the popcount at compile time.
    if (Known.isConstant())
      return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
    unsigned LZ = Known.countMinLeadingZeros();
    unsigned TZ = Known.countMinTrailingZeros();
    assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
    // ActiveBits: highest possibly-set bit + 1.
    // ShiftedActiveBits: width of the possibly-set span once the known
    // trailing zeros are shifted out.
    unsigned ActiveBits = Known.getBitWidth() - LZ;
    unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);

    // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
    if (ShiftedActiveBits <= 2) {
      // Shift the active bits down to the bottom if they don't already start
      // at bit 0.
      if (ActiveBits > 2)
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getShiftAmountConstant(TZ, VT, DL));
      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
                       DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
                                   DAG.getShiftAmountConstant(1, VT, DL)));
      return DAG.getZExtOrTrunc(Op, DL, VT);
    }

    // i3 CTPOP - perform LUT into i32 integer.
    if (ShiftedActiveBits <= 3) {
      if (ActiveBits > 3)
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getShiftAmountConstant(TZ, VT, DL));
      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      // Index the LUT with 2*x (each entry is 2 bits wide).
      Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
                       DAG.getShiftAmountConstant(1, VT, DL));
      // 16-bit LUT: the 2-bit field at position 2*x holds popcount(x),
      // x in [0,7]: 0,1,1,2,1,2,2,3 (packed LSB-first).
      Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
                       DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
      Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
                       DAG.getConstant(0x3, DL, MVT::i32));
      return DAG.getZExtOrTrunc(Op, DL, VT);
    }

    // i4 CTPOP - perform LUT into i64 integer.
    if (ShiftedActiveBits <= 4 &&
        DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
      // 64-bit LUT: nibble x holds popcount(x), x in [0,15].
      SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
      if (ActiveBits > 4)
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getShiftAmountConstant(TZ, VT, DL));
      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      // Index the LUT with 4*x (each entry is a nibble).
      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
                       DAG.getConstant(4, DL, MVT::i32));
      Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
                       DAG.getShiftAmountOperand(MVT::i64, Op));
      Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
                       DAG.getConstant(0x7, DL, MVT::i64));
      return DAG.getZExtOrTrunc(Op, DL, VT);
    }

    // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
    if (ShiftedActiveBits <= 8) {
      SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
      if (ActiveBits > 8)
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getShiftAmountConstant(TZ, VT, DL));
      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      // Classic popcount-by-multiply: the first multiply/shift/mask isolates
      // each input bit into its own nibble; the second multiply horizontally
      // sums the nibbles, leaving the popcount in the top nibble.
      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
                       DAG.getConstant(0x08040201U, DL, MVT::i32));
      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
                       DAG.getShiftAmountConstant(3, MVT::i32, DL));
      Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
                       DAG.getShiftAmountConstant(28, MVT::i32, DL));
      return DAG.getZExtOrTrunc(Op, DL, VT);
    }

    return SDValue(); // fallback to generic expansion.
  }

  assert(VT.isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(N, DL, Subtarget, DAG);
}
32325
32327 MVT VT = Op.getSimpleValueType();
32328 SDValue In = Op.getOperand(0);
32329 SDLoc DL(Op);
32330
32331 // For scalars, its still beneficial to transfer to/from the SIMD unit to
32332 // perform the BITREVERSE.
32333 if (!VT.isVector()) {
32334 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32335 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32336 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32337 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32338 DAG.getVectorIdxConstant(0, DL));
32339 }
32340
32341 int NumElts = VT.getVectorNumElements();
32342 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32343
32344 // Decompose 256-bit ops into smaller 128-bit ops.
32345 if (VT.is256BitVector())
32346 return splitVectorIntUnary(Op, DAG, DL);
32347
32348 assert(VT.is128BitVector() &&
32349 "Only 128-bit vector bitreverse lowering supported.");
32350
32351 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32352 // perform the BSWAP in the shuffle.
32353 // Its best to shuffle using the second operand as this will implicitly allow
32354 // memory folding for multiple vectors.
32355 SmallVector<SDValue, 16> MaskElts;
32356 for (int i = 0; i != NumElts; ++i) {
32357 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32358 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32359 int PermuteByte = SourceByte | (2 << 5);
32360 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32361 }
32362 }
32363
32364 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32365 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32366 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32367 Res, Mask);
32368 return DAG.getBitcast(VT, Res);
32369}
32370
32372 SelectionDAG &DAG) {
32373 MVT VT = Op.getSimpleValueType();
32374
32375 if (Subtarget.hasXOP() && !VT.is512BitVector())
32376 return LowerBITREVERSE_XOP(Op, DAG);
32377
32378 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
32379
32380 SDValue In = Op.getOperand(0);
32381 SDLoc DL(Op);
32382
32383 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32384 if (VT.is512BitVector() && !Subtarget.hasBWI())
32385 return splitVectorIntUnary(Op, DAG, DL);
32386
32387 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32388 if (VT.is256BitVector() && !Subtarget.hasInt256())
32389 return splitVectorIntUnary(Op, DAG, DL);
32390
32391 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32392 if (!VT.isVector()) {
32393 assert(
32394 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32395 "Only tested for i8/i16/i32/i64");
32396 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32397 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32398 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32399 DAG.getBitcast(MVT::v16i8, Res));
32400 Res =
32401 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32402 DAG.getVectorIdxConstant(0, DL));
32403 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32404 }
32405
32406 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32407
32408 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32409 if (VT.getScalarType() != MVT::i8) {
32410 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32411 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32412 Res = DAG.getBitcast(ByteVT, Res);
32413 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32414 return DAG.getBitcast(VT, Res);
32415 }
32416 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32417 "Only byte vector BITREVERSE supported");
32418
32419 unsigned NumElts = VT.getVectorNumElements();
32420
32421 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32422 if (Subtarget.hasGFNI()) {
32424 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32425 DAG.getTargetConstant(0, DL, MVT::i8));
32426 }
32427
32428 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32429 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32430 // 0-15 value (moved to the other nibble).
32431 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32432 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32433 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32434
32435 const int LoLUT[16] = {
32436 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32437 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32438 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32439 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32440 const int HiLUT[16] = {
32441 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32442 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32443 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32444 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32445
32446 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32447 for (unsigned i = 0; i < NumElts; ++i) {
32448 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32449 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32450 }
32451
32452 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32453 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32454 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32455 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32456 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32457}
32458
32459static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32460 SelectionDAG &DAG) {
32461 SDLoc DL(Op);
32462 SDValue X = Op.getOperand(0);
32463 MVT VT = Op.getSimpleValueType();
32464
32465 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32466 if (VT == MVT::i8 ||
32468 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32469 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32470 DAG.getConstant(0, DL, MVT::i8));
32471 // Copy the inverse of the parity flag into a register with setcc.
32472 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32473 // Extend to the original type.
32474 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32475 }
32476
32477 // If we have POPCNT, use the default expansion.
32478 if (Subtarget.hasPOPCNT())
32479 return SDValue();
32480
32481 if (VT == MVT::i64) {
32482 // Xor the high and low 16-bits together using a 32-bit operation.
32483 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32484 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32485 DAG.getConstant(32, DL, MVT::i8)));
32486 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32487 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32488 }
32489
32490 if (VT != MVT::i16) {
32491 // Xor the high and low 16-bits together using a 32-bit operation.
32492 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32493 DAG.getConstant(16, DL, MVT::i8));
32494 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32495 } else {
32496 // If the input is 16-bits, we need to extend to use an i32 shift below.
32497 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32498 }
32499
32500 // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
32501 // This should allow an h-reg to be used to save a shift.
32502 SDValue Hi = DAG.getNode(
32503 ISD::TRUNCATE, DL, MVT::i8,
32504 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32505 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32506 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32507 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32508
32509 // Copy the inverse of the parity flag into a register with setcc.
32510 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32511 // Extend to the original type.
32512 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32513}
32514
32516 const X86Subtarget &Subtarget) {
32517 unsigned NewOpc = 0;
32518 switch (N->getOpcode()) {
32520 NewOpc = X86ISD::LADD;
32521 break;
32523 NewOpc = X86ISD::LSUB;
32524 break;
32526 NewOpc = X86ISD::LOR;
32527 break;
32529 NewOpc = X86ISD::LXOR;
32530 break;
32532 NewOpc = X86ISD::LAND;
32533 break;
32534 default:
32535 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32536 }
32537
32538 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32539
32540 return DAG.getMemIntrinsicNode(
32541 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32542 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32543 /*MemVT=*/N->getSimpleValueType(0), MMO);
32544}
32545
32546/// Lower atomic_load_ops into LOCK-prefixed operations.
32548 const X86Subtarget &Subtarget) {
32549 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32550 SDValue Chain = N->getOperand(0);
32551 SDValue LHS = N->getOperand(1);
32552 SDValue RHS = N->getOperand(2);
32553 unsigned Opc = N->getOpcode();
32554 MVT VT = N->getSimpleValueType(0);
32555 SDLoc DL(N);
32556
32557 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32558 // can only be lowered when the result is unused. They should have already
32559 // been transformed into a cmpxchg loop in AtomicExpand.
32560 if (N->hasAnyUseOfValue(0)) {
32561 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32562 // select LXADD if LOCK_SUB can't be selected.
32563 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32564 // can use LXADD as opposed to cmpxchg.
32565 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32567 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32568 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32569
32571 "Used AtomicRMW ops other than Add should have been expanded!");
32572 return N;
32573 }
32574
32575 // Specialized lowering for the canonical form of an idemptotent atomicrmw.
32576 // The core idea here is that since the memory location isn't actually
32577 // changing, all we need is a lowering for the *ordering* impacts of the
32578 // atomicrmw. As such, we can chose a different operation and memory
32579 // location to minimize impact on other code.
32580 // The above holds unless the node is marked volatile in which
32581 // case it needs to be preserved according to the langref.
32582 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32583 // On X86, the only ordering which actually requires an instruction is
32584 // seq_cst which isn't SingleThread, everything just needs to be preserved
32585 // during codegen and then dropped. Note that we expect (but don't assume),
32586 // that orderings other than seq_cst and acq_rel have been canonicalized to
32587 // a store or load.
32590 // Prefer a locked operation against a stack location to minimize cache
32591 // traffic. This assumes that stack locations are very likely to be
32592 // accessed only by the owning thread.
32593 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32594 assert(!N->hasAnyUseOfValue(0));
32595 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32596 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32597 DAG.getUNDEF(VT), NewChain);
32598 }
32599 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32600 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32601 assert(!N->hasAnyUseOfValue(0));
32602 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32603 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32604 DAG.getUNDEF(VT), NewChain);
32605 }
32606
32607 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32608 // RAUW the chain, but don't worry about the result, as it's unused.
32609 assert(!N->hasAnyUseOfValue(0));
32610 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32611 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32612 DAG.getUNDEF(VT), LockOp.getValue(1));
32613}
32614
32616 const X86Subtarget &Subtarget) {
32617 auto *Node = cast<AtomicSDNode>(Op.getNode());
32618 SDLoc dl(Node);
32619 EVT VT = Node->getMemoryVT();
32620
32621 bool IsSeqCst =
32622 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32623 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32624
32625 // If this store is not sequentially consistent and the type is legal
32626 // we can just keep it.
32627 if (!IsSeqCst && IsTypeLegal)
32628 return Op;
32629
32630 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32632 Attribute::NoImplicitFloat)) {
32633 SDValue Chain;
32634 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32635 // vector store.
32636 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
32637 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32638 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
32639 Node->getMemOperand());
32640 }
32641
32642 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32643 // is enabled.
32644 if (VT == MVT::i64) {
32645 if (Subtarget.hasSSE1()) {
32646 SDValue SclToVec =
32647 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
32648 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32649 SclToVec = DAG.getBitcast(StVT, SclToVec);
32650 SDVTList Tys = DAG.getVTList(MVT::Other);
32651 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
32652 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
32653 MVT::i64, Node->getMemOperand());
32654 } else if (Subtarget.hasX87()) {
32655 // First load this into an 80-bit X87 register using a stack temporary.
32656 // This will put the whole integer into the significand.
32657 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32658 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32659 MachinePointerInfo MPI =
32661 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
32663 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32664 SDValue LdOps[] = {Chain, StackPtr};
32666 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
32667 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
32668 Chain = Value.getValue(1);
32669
32670 // Now use an FIST to do the atomic store.
32671 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
32672 Chain =
32673 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
32674 StoreOps, MVT::i64, Node->getMemOperand());
32675 }
32676 }
32677
32678 if (Chain) {
32679 // If this is a sequentially consistent store, also emit an appropriate
32680 // barrier.
32681 if (IsSeqCst)
32682 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
32683
32684 return Chain;
32685 }
32686 }
32687
32688 // Convert seq_cst store -> xchg
32689 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
32690 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
32691 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
32692 Node->getOperand(0), Node->getOperand(2),
32693 Node->getOperand(1), Node->getMemOperand());
32694 return Swap.getValue(1);
32695}
32696
32698 SDNode *N = Op.getNode();
32699 MVT VT = N->getSimpleValueType(0);
32700 unsigned Opc = Op.getOpcode();
32701
32702 // Let legalize expand this if it isn't a legal type yet.
32703 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32704 return SDValue();
32705
32706 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
32707 SDLoc DL(N);
32708
32709 // Set the carry flag.
32710 SDValue Carry = Op.getOperand(2);
32711 EVT CarryVT = Carry.getValueType();
32712 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
32713 Carry, DAG.getAllOnesConstant(DL, CarryVT));
32714
32715 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
32716 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
32717 Op.getOperand(0), Op.getOperand(1),
32718 Carry.getValue(1));
32719
32720 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
32721 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
32722 Sum.getValue(1), DL, DAG);
32723 if (N->getValueType(1) == MVT::i1)
32724 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
32725
32726 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
32727}
32728
32729static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
32730 SelectionDAG &DAG) {
32731 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
32732
32733 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
32734 // which returns the values as { float, float } (in XMM0) or
32735 // { double, double } (which is returned in XMM0, XMM1).
32736 SDLoc dl(Op);
32737 SDValue Arg = Op.getOperand(0);
32738 EVT ArgVT = Arg.getValueType();
32739 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
32740
32743
32744 Entry.Node = Arg;
32745 Entry.Ty = ArgTy;
32746 Entry.IsSExt = false;
32747 Entry.IsZExt = false;
32748 Args.push_back(Entry);
32749
32750 bool isF64 = ArgVT == MVT::f64;
32751 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
32752 // the small struct {f32, f32} is returned in (eax, edx). For f64,
32753 // the results are returned via SRet in memory.
32754 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32755 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
32756 const char *LibcallName = TLI.getLibcallName(LC);
32757 SDValue Callee =
32758 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
32759
32760 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
32761 : (Type *)FixedVectorType::get(ArgTy, 4);
32762
32764 CLI.setDebugLoc(dl)
32765 .setChain(DAG.getEntryNode())
32766 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
32767
32768 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
32769
32770 if (isF64)
32771 // Returned in xmm0 and xmm1.
32772 return CallResult.first;
32773
32774 // Returned in bits 0:31 and 32:64 xmm0.
32775 SDValue SinVal =
32776 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
32777 DAG.getVectorIdxConstant(0, dl));
32778 SDValue CosVal =
32779 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
32780 DAG.getVectorIdxConstant(1, dl));
32781 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
32782 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
32783}
32784
32785/// Widen a vector input to a vector of NVT. The
32786/// input vector must have the same element type as NVT.
32788 bool FillWithZeroes = false) {
32789 // Check if InOp already has the right width.
32790 MVT InVT = InOp.getSimpleValueType();
32791 if (InVT == NVT)
32792 return InOp;
32793
32794 if (InOp.isUndef())
32795 return DAG.getUNDEF(NVT);
32796
32798 "input and widen element type must match");
32799
32800 unsigned InNumElts = InVT.getVectorNumElements();
32801 unsigned WidenNumElts = NVT.getVectorNumElements();
32802 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
32803 "Unexpected request for vector widening");
32804
32805 SDLoc dl(InOp);
32806 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
32807 SDValue N1 = InOp.getOperand(1);
32808 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
32809 N1.isUndef()) {
32810 InOp = InOp.getOperand(0);
32811 InVT = InOp.getSimpleValueType();
32812 InNumElts = InVT.getVectorNumElements();
32813 }
32814 }
32817 EVT EltVT = InOp.getOperand(0).getValueType();
32818 SDValue FillVal =
32819 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
32820 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
32821 Ops.append(WidenNumElts - InNumElts, FillVal);
32822 return DAG.getBuildVector(NVT, dl, Ops);
32823 }
32824 SDValue FillVal =
32825 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
32826 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
32827 DAG.getVectorIdxConstant(0, dl));
32828}
32829
32831 SelectionDAG &DAG) {
32832 assert(Subtarget.hasAVX512() &&
32833 "MGATHER/MSCATTER are supported on AVX-512 arch only");
32834
32835 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
32836 SDValue Src = N->getValue();
32837 MVT VT = Src.getSimpleValueType();
32838 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
32839 SDLoc dl(Op);
32840
32841 SDValue Scale = N->getScale();
32842 SDValue Index = N->getIndex();
32843 SDValue Mask = N->getMask();
32844 SDValue Chain = N->getChain();
32845 SDValue BasePtr = N->getBasePtr();
32846
32847 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
32848 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32849 // If the index is v2i64 and we have VLX we can use xmm for data and index.
32850 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
32851 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32852 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
32853 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
32854 SDVTList VTs = DAG.getVTList(MVT::Other);
32855 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32856 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32857 N->getMemoryVT(), N->getMemOperand());
32858 }
32859 return SDValue();
32860 }
32861
32862 MVT IndexVT = Index.getSimpleValueType();
32863
32864 // If the index is v2i32, we're being called by type legalization and we
32865 // should just let the default handling take care of it.
32866 if (IndexVT == MVT::v2i32)
32867 return SDValue();
32868
32869 // If we don't have VLX and neither the passthru or index is 512-bits, we
32870 // need to widen until one is.
32871 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
32872 !Index.getSimpleValueType().is512BitVector()) {
32873 // Determine how much we need to widen by to get a 512-bit type.
32874 unsigned Factor = std::min(512/VT.getSizeInBits(),
32875 512/IndexVT.getSizeInBits());
32876 unsigned NumElts = VT.getVectorNumElements() * Factor;
32877
32878 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32879 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32880 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32881
32882 Src = ExtendToType(Src, VT, DAG);
32883 Index = ExtendToType(Index, IndexVT, DAG);
32884 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32885 }
32886
32887 SDVTList VTs = DAG.getVTList(MVT::Other);
32888 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32889 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32890 N->getMemoryVT(), N->getMemOperand());
32891}
32892
32893static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
32894 SelectionDAG &DAG) {
32895
32896 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
32897 MVT VT = Op.getSimpleValueType();
32898 MVT ScalarVT = VT.getScalarType();
32899 SDValue Mask = N->getMask();
32900 MVT MaskVT = Mask.getSimpleValueType();
32901 SDValue PassThru = N->getPassThru();
32902 SDLoc dl(Op);
32903
32904 // Handle AVX masked loads which don't support passthru other than 0.
32905 if (MaskVT.getVectorElementType() != MVT::i1) {
32906 // We also allow undef in the isel pattern.
32907 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
32908 return Op;
32909
32910 SDValue NewLoad = DAG.getMaskedLoad(
32911 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32912 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32913 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32914 N->isExpandingLoad());
32915 // Emit a blend.
32916 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
32917 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
32918 }
32919
32920 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32921 "Expanding masked load is supported on AVX-512 target only!");
32922
32923 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32924 "Expanding masked load is supported for 32 and 64-bit types only!");
32925
32926 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32927 "Cannot lower masked load op.");
32928
32929 assert((ScalarVT.getSizeInBits() >= 32 ||
32930 (Subtarget.hasBWI() &&
32931 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32932 "Unsupported masked load op.");
32933
32934 // This operation is legal for targets with VLX, but without
32935 // VLX the vector should be widened to 512 bit
32936 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
32937 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32938 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
32939
32940 // Mask element has to be i1.
32941 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32942 "Unexpected mask type");
32943
32944 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32945
32946 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32947 SDValue NewLoad = DAG.getMaskedLoad(
32948 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32949 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32950 N->getExtensionType(), N->isExpandingLoad());
32951
32952 SDValue Extract =
32953 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
32954 DAG.getVectorIdxConstant(0, dl));
32955 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
32956 return DAG.getMergeValues(RetOps, dl);
32957}
32958
32959static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
32960 SelectionDAG &DAG) {
32961 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
32962 SDValue DataToStore = N->getValue();
32963 MVT VT = DataToStore.getSimpleValueType();
32964 MVT ScalarVT = VT.getScalarType();
32965 SDValue Mask = N->getMask();
32966 SDLoc dl(Op);
32967
32968 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32969 "Expanding masked load is supported on AVX-512 target only!");
32970
32971 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32972 "Expanding masked load is supported for 32 and 64-bit types only!");
32973
32974 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32975 "Cannot lower masked store op.");
32976
32977 assert((ScalarVT.getSizeInBits() >= 32 ||
32978 (Subtarget.hasBWI() &&
32979 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32980 "Unsupported masked store op.");
32981
32982 // This operation is legal for targets with VLX, but without
32983 // VLX the vector should be widened to 512 bit
32984 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
32985 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32986
32987 // Mask element has to be i1.
32988 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32989 "Unexpected mask type");
32990
32991 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32992
32993 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
32994 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32995 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32996 N->getOffset(), Mask, N->getMemoryVT(),
32997 N->getMemOperand(), N->getAddressingMode(),
32998 N->isTruncatingStore(), N->isCompressingStore());
32999}
33000
33001static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33002 SelectionDAG &DAG) {
33003 assert(Subtarget.hasAVX2() &&
33004 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33005
33006 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33007 SDLoc dl(Op);
33008 MVT VT = Op.getSimpleValueType();
33009 SDValue Index = N->getIndex();
33010 SDValue Mask = N->getMask();
33011 SDValue PassThru = N->getPassThru();
33012 MVT IndexVT = Index.getSimpleValueType();
33013
33014 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33015
33016 // If the index is v2i32, we're being called by type legalization.
33017 if (IndexVT == MVT::v2i32)
33018 return SDValue();
33019
33020 // If we don't have VLX and neither the passthru or index is 512-bits, we
33021 // need to widen until one is.
33022 MVT OrigVT = VT;
33023 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33024 !IndexVT.is512BitVector()) {
33025 // Determine how much we need to widen by to get a 512-bit type.
33026 unsigned Factor = std::min(512/VT.getSizeInBits(),
33027 512/IndexVT.getSizeInBits());
33028
33029 unsigned NumElts = VT.getVectorNumElements() * Factor;
33030
33031 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33032 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33033 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33034
33035 PassThru = ExtendToType(PassThru, VT, DAG);
33036 Index = ExtendToType(Index, IndexVT, DAG);
33037 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33038 }
33039
33040 // Break dependency on the data register.
33041 if (PassThru.isUndef())
33042 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33043
33044 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33045 N->getScale() };
33046 SDValue NewGather = DAG.getMemIntrinsicNode(
33047 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33048 N->getMemOperand());
33049 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33050 DAG.getVectorIdxConstant(0, dl));
33051 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33052}
33053
  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
  MVT DstVT = Op.getSimpleValueType();

  AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
  unsigned SrcAS = N->getSrcAddressSpace();

  // A same-space cast should have been folded away before reaching lowering.
  assert(SrcAS != N->getDestAddressSpace() &&
         "addrspacecast must be between different address spaces");

  // Widening from the unsigned 32-bit pointer space (PTR32_UPTR) must
  // zero-extend; any other widening to i64 sign-extends, and narrowing to
  // i32 is a plain truncate. Any other src/dst combination is unsupported.
  if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
  } else if (DstVT == MVT::i64) {
    Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
  } else if (DstVT == MVT::i32) {
    Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
  } else {
    report_fatal_error("Bad address space in addrspacecast");
  }
  return Op;
}
33076
SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
                                              SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  // Forward the incoming chain, plus the glue operand (always last, if one is
  // present) so ordering dependencies are preserved across the NOOP.
  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  // Emit a machine-level NOOP that produces a chain and glue result.
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
}
33093
// Custom split CVTPS2PH with wide types: split the source vector in half,
// convert each half with its own X86ISD::CVTPS2PH node, and concatenate the
// results back into the original result type.
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue Lo, Hi;
  // Split operand 0 (the f32 source vector) into low/high halves.
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  // Operand 1 is the rounding-control immediate; it applies to both halves.
  SDValue RC = Op.getOperand(1);
  Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
  Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}
33107
                               SelectionDAG &DAG) {
  // Operand 4 of ISD::PREFETCH is the "is data" flag (1 = data prefetch,
  // 0 = instruction prefetch).
  unsigned IsData = Op.getConstantOperandVal(4);

  // We don't support non-data prefetch without PREFETCHI.
  // Just preserve the chain (dropping the prefetch entirely).
  if (!IsData && !Subtarget.hasPREFETCHI())
    return Op.getOperand(0);

  // Otherwise the node is legal as-is.
  return Op;
}
33119
  SDNode *N = Op.getNode();
  SDValue Operand = N->getOperand(0);
  EVT VT = Operand.getValueType();
  SDLoc dl(N);

  // Lower fcanonicalize(x) as a strict multiply by 1.0. The strict form is
  // used so the multiply is not folded away; presumably the FP multiply
  // itself performs the canonicalization — confirm against LangRef.
  SDValue One = DAG.getConstantFP(1.0, dl, VT);

  // TODO: Fix Crash for bf16 when generating strict_fmul as it
  // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
  // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
  // promote this operator's result!
  SDValue Chain = DAG.getEntryNode();
  SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
                                   {Chain, Operand, One});
  return StrictFmul;
}
33137
                                      unsigned OpNo) {
  // Build the decimal textual form of the operand number, e.g. "1".
  const APInt Operand(32, OpNo);
  std::string OpNoStr = llvm::toString(Operand, 10, false);
  std::string Str(" $");

  std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
  std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}

  // Scan the asm strings for one that references operand OpNo, and return
  // the mnemonic (leading alphabetic token) of the statement that uses it.
  auto I = StringRef::npos;
  for (auto &AsmStr : AsmStrs) {
    // Match the OpNo string. We should match exactly to exclude match
    // sub-string, e.g. "$12" contain "$1"
    if (AsmStr.ends_with(OpNoStr1))
      I = AsmStr.size() - OpNoStr1.size();

    // Get the index of operand in AsmStr.
    if (I == StringRef::npos)
      I = AsmStr.find(OpNoStr1 + ",");
    if (I == StringRef::npos)
      I = AsmStr.find(OpNoStr2);

    // This asm string does not mention operand OpNo; try the next one.
    if (I == StringRef::npos)
      continue;

    assert(I > 0 && "Unexpected inline asm string!");
    // Remove the operand string and label (if exist).
    // For example:
    // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
    // ==>
    // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
    // ==>
    // "call dword ptr "
    auto TmpStr = AsmStr.substr(0, I);
    I = TmpStr.rfind(':');
    if (I != StringRef::npos)
      TmpStr = TmpStr.substr(I + 1);
    // Keep only the leading run of alphabetic characters: the mnemonic.
    return TmpStr.take_while(llvm::isAlpha);
  }

  // No asm string referenced this operand.
  return StringRef();
}
33180
    const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
  // In a __asm block, __asm inst foo where inst is CALL or JMP should be
  // changed from indirect TargetLowering::C_Memory to direct
  // TargetLowering::C_Address.
  // We don't need to special case LOOP* and Jcc, which cannot target a memory
  // location.
  // Look up the mnemonic that uses operand OpNo; the comparison is
  // case-insensitive since inline-asm mnemonics may be in any case.
  StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
  return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
}
33191
                                    SDValue Mask) {
  EVT Ty = MVT::i8;
  // Reinterpret the one-element i1 mask as a scalar i1, then widen it to i8
  // so it can participate in an integer SUB.
  auto V = DAG.getBitcast(MVT::i1, Mask);
  auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
  auto Zero = DAG.getConstant(0, DL, Ty);
  // X86ISD::SUB produces (result, EFLAGS); we compute 0 - mask solely for
  // the flags it sets.
  SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
  auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
  // Return only the flags result (value #1); the arithmetic result is unused.
  return SDValue(CmpZero.getNode(), 1);
}
33202
    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
    SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
  // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
  // ->
  // _, flags = SUB 0, mask
  // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
  // bit_cast_to_vector<res>
  EVT VTy = PassThru.getValueType();
  EVT Ty = VTy.getVectorElementType();
  SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
  // An undef pass-through becomes scalar zero; otherwise reinterpret the
  // one-element vector as its scalar element.
  auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
                                           : DAG.getBitcast(Ty, PassThru);
  // Materialize EFLAGS from "SUB 0, mask" and load only when mask != 0.
  auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
  auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
  SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
  // NewLoad is handed back through the reference parameter (so the caller
  // can use its chain); the returned value is recast to the vector type.
  NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
  return DAG.getBitcast(VTy, NewLoad);
}
33222
                                      SDValue Chain,
                                      SDValue Val, SDValue Mask) const {
  // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
  // ->
  // _, flags = SUB 0, mask
  // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
  SDVTList Tys = DAG.getVTList(MVT::Other);
  // Reinterpret the one-element vector value as its scalar element.
  auto ScalarVal = DAG.getBitcast(Ty, Val);
  // Materialize EFLAGS from "SUB 0, mask" and store only when mask != 0.
  auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
  auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
  SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
  return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
}
33239
/// Provide custom lowering hooks for some operations.
/// Dispatches on the node's opcode to the matching Lower* helper. Reaching
/// the default case means an opcode was marked Custom in the constructor but
/// has no handler here — that is a backend bug, hence llvm_unreachable.
  switch (Op.getOpcode()) {
  // clang-format off
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
    return LowerCMP_SWAP(Op, Subtarget, DAG);
  case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
  case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
  case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
  case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
  case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
  case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
  case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
  case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
  case ISD::VSELECT: return LowerVSELECT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
  case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
  case ISD::FSHL:
  case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
  case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::FP_EXTEND:
  case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
  case ISD::FP16_TO_FP:
  case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16:
  case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
  case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
  case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
  case ISD::FADD:
  case ISD::FSUB: return lowerFaddFsub(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FABS:
  case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
  case ISD::LRINT:
  case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
  case ISD::SETCC:
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
  case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
  case ISD::VASTART: return LowerVASTART(Op, DAG);
  case ISD::VAARG: return LowerVAARG(Op, DAG);
  case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
    return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
  case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
  case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
  case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
  case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
  case ISD::MULHS:
  case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
  case ISD::ROTL:
  case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO: return LowerXALUO(Op, DAG);
  case ISD::SMULO:
  case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
  case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
  case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::SADDO_CARRY:
  case ISD::SSUBO_CARRY:
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
  case ISD::ADD:
  case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
  case ISD::UADDSAT:
  case ISD::SADDSAT:
  case ISD::USUBSAT:
  case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
  case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
  case ISD::ABDS:
  case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
  case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
  case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
  case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
  case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
  case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
  case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
  case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
  case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
  case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
  case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
  // clang-format on
  }
}
33402
33403/// Replace a node with an illegal result type with a new node built out of
33404/// custom code.
33407 SelectionDAG &DAG) const {
33408 SDLoc dl(N);
33409 unsigned Opc = N->getOpcode();
33410 switch (Opc) {
33411 default:
33412#ifndef NDEBUG
33413 dbgs() << "ReplaceNodeResults: ";
33414 N->dump(&DAG);
33415#endif
33416 llvm_unreachable("Do not know how to custom type legalize this operation!");
33417 case X86ISD::CVTPH2PS: {
33418 EVT VT = N->getValueType(0);
33419 SDValue Lo, Hi;
33420 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33421 EVT LoVT, HiVT;
33422 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33423 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33424 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33425 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33426 Results.push_back(Res);
33427 return;
33428 }
33430 EVT VT = N->getValueType(0);
33431 SDValue Lo, Hi;
33432 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33433 EVT LoVT, HiVT;
33434 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33435 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33436 {N->getOperand(0), Lo});
33437 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33438 {N->getOperand(0), Hi});
33439 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33440 Lo.getValue(1), Hi.getValue(1));
33441 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33442 Results.push_back(Res);
33443 Results.push_back(Chain);
33444 return;
33445 }
33446 case X86ISD::CVTPS2PH:
33447 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33448 return;
33449 case ISD::CTPOP: {
33450 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33451 // If we have at most 32 active bits, then perform as i32 CTPOP.
33452 // TODO: Perform this in generic legalizer?
33453 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33454 unsigned LZ = Known.countMinLeadingZeros();
33455 unsigned TZ = Known.countMinTrailingZeros();
33456 if ((LZ + TZ) >= 32) {
33457 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33458 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33459 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33460 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33461 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33462 Results.push_back(Op);
33463 return;
33464 }
33465 // Use a v2i64 if possible.
33466 bool NoImplicitFloatOps =
33468 Attribute::NoImplicitFloat);
33469 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33470 SDValue Wide =
33471 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33472 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33473 // Bit count should fit in 32-bits, extract it as that and then zero
33474 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33475 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33476 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33477 DAG.getVectorIdxConstant(0, dl));
33478 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33479 Results.push_back(Wide);
33480 }
33481 return;
33482 }
33483 case ISD::MUL: {
33484 EVT VT = N->getValueType(0);
33486 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33487 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33488 // elements are needed.
33489 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33490 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33491 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33492 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33493 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33494 unsigned NumConcats = 16 / VT.getVectorNumElements();
33495 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33496 ConcatOps[0] = Res;
33497 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33498 Results.push_back(Res);
33499 return;
33500 }
33501 case ISD::SMULO:
33502 case ISD::UMULO: {
33503 EVT VT = N->getValueType(0);
33505 VT == MVT::v2i32 && "Unexpected VT!");
33506 bool IsSigned = Opc == ISD::SMULO;
33507 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33508 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33509 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33510 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33511 // Extract the high 32 bits from each result using PSHUFD.
33512 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33513 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33514 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33515 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33516 DAG.getVectorIdxConstant(0, dl));
33517
33518 // Truncate the low bits of the result. This will become PSHUFD.
33519 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33520
33521 SDValue HiCmp;
33522 if (IsSigned) {
33523 // SMULO overflows if the high bits don't match the sign of the low.
33524 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33525 } else {
33526 // UMULO overflows if the high bits are non-zero.
33527 HiCmp = DAG.getConstant(0, dl, VT);
33528 }
33529 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33530
33531 // Widen the result with by padding with undef.
33532 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33533 DAG.getUNDEF(VT));
33534 Results.push_back(Res);
33535 Results.push_back(Ovf);
33536 return;
33537 }
33538 case X86ISD::VPMADDWD: {
33539 // Legalize types for X86ISD::VPMADDWD by widening.
33540 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33541
33542 EVT VT = N->getValueType(0);
33543 EVT InVT = N->getOperand(0).getValueType();
33544 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33545 "Expected a VT that divides into 128 bits.");
33547 "Unexpected type action!");
33548 unsigned NumConcat = 128 / InVT.getSizeInBits();
33549
33550 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33551 InVT.getVectorElementType(),
33552 NumConcat * InVT.getVectorNumElements());
33553 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33555 NumConcat * VT.getVectorNumElements());
33556
33557 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33558 Ops[0] = N->getOperand(0);
33559 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33560 Ops[0] = N->getOperand(1);
33561 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33562
33563 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33564 Results.push_back(Res);
33565 return;
33566 }
33567 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33568 case X86ISD::FMINC:
33569 case X86ISD::FMIN:
33570 case X86ISD::FMAXC:
33571 case X86ISD::FMAX:
33573 case X86ISD::STRICT_FMAX: {
33574 EVT VT = N->getValueType(0);
33575 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33576 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33577 SDValue UNDEF = DAG.getUNDEF(VT);
33578 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33579 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33580 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33581 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33582 SDValue Res;
33583 if (IsStrict)
33584 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33585 {N->getOperand(0), LHS, RHS});
33586 else
33587 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33588 Results.push_back(Res);
33589 if (IsStrict)
33590 Results.push_back(Res.getValue(1));
33591 return;
33592 }
33593 case ISD::SDIV:
33594 case ISD::UDIV:
33595 case ISD::SREM:
33596 case ISD::UREM: {
33597 EVT VT = N->getValueType(0);
33598 if (VT.isVector()) {
33600 "Unexpected type action!");
33601 // If this RHS is a constant splat vector we can widen this and let
33602 // division/remainder by constant optimize it.
33603 // TODO: Can we do something for non-splat?
33604 APInt SplatVal;
33605 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33606 unsigned NumConcats = 128 / VT.getSizeInBits();
33607 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33608 Ops0[0] = N->getOperand(0);
33609 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33610 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33611 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33612 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33613 Results.push_back(Res);
33614 }
33615 return;
33616 }
33617
33618 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33619 Results.push_back(V);
33620 return;
33621 }
33622 case ISD::TRUNCATE: {
33623 MVT VT = N->getSimpleValueType(0);
33624 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33625 return;
33626
33627 // The generic legalizer will try to widen the input type to the same
33628 // number of elements as the widened result type. But this isn't always
33629 // the best thing so do some custom legalization to avoid some cases.
33630 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33631 SDValue In = N->getOperand(0);
33632 EVT InVT = In.getValueType();
33633 EVT InEltVT = InVT.getVectorElementType();
33634 EVT EltVT = VT.getVectorElementType();
33635 unsigned MinElts = VT.getVectorNumElements();
33636 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33637 unsigned InBits = InVT.getSizeInBits();
33638
33639 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33640 unsigned PackOpcode;
33641 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33642 Subtarget, N->getFlags())) {
33643 if (SDValue Res =
33644 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
33645 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
33646 Results.push_back(Res);
33647 return;
33648 }
33649 }
33650
33651 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
33652 // 128 bit and smaller inputs should avoid truncate all together and
33653 // use a shuffle.
33654 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
33655 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
33656 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
33657 for (unsigned I = 0; I < MinElts; ++I)
33658 TruncMask[I] = Scale * I;
33659 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
33660 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
33661 "Illegal vector type in truncation");
33662 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
33663 Results.push_back(
33664 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
33665 return;
33666 }
33667 }
33668
33669 // With AVX512 there are some cases that can use a target specific
33670 // truncate node to go from 256/512 to less than 128 with zeros in the
33671 // upper elements of the 128 bit result.
33672 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
33673 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
33674 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
33675 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33676 return;
33677 }
33678 // There's one case we can widen to 512 bits and use VTRUNC.
33679 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
33680 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
33681 DAG.getUNDEF(MVT::v4i64));
33682 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33683 return;
33684 }
33685 }
33686 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
33687 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
33688 isTypeLegal(MVT::v4i64)) {
33689 // Input needs to be split and output needs to widened. Let's use two
33690 // VTRUNCs, and shuffle their results together into the wider type.
33691 SDValue Lo, Hi;
33692 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
33693
33694 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
33695 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
33696 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
33697 { 0, 1, 2, 3, 16, 17, 18, 19,
33698 -1, -1, -1, -1, -1, -1, -1, -1 });
33699 Results.push_back(Res);
33700 return;
33701 }
33702
33703 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
33704 // this via type legalization.
33705 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
33706 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
33707 (!Subtarget.hasSSSE3() ||
33708 (!isTypeLegal(InVT) &&
33709 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
33710 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
33711 InEltVT.getSizeInBits() * WidenNumElts);
33712 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
33713 return;
33714 }
33715
33716 return;
33717 }
33718 case ISD::ANY_EXTEND:
33719 // Right now, only MVT::v8i8 has Custom action for an illegal type.
33720 // It's intended to custom handle the input type.
33721 assert(N->getValueType(0) == MVT::v8i8 &&
33722 "Do not know how to legalize this Node");
33723 return;
33724 case ISD::SIGN_EXTEND:
33725 case ISD::ZERO_EXTEND: {
33726 EVT VT = N->getValueType(0);
33727 SDValue In = N->getOperand(0);
33728 EVT InVT = In.getValueType();
33729 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
33730 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
33732 "Unexpected type action!");
33733 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
33734 // Custom split this so we can extend i8/i16->i32 invec. This is better
33735 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
33736 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
33737 // we allow the sra from the extend to i32 to be shared by the split.
33738 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
33739
33740 // Fill a vector with sign bits for each element.
33741 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
33742 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
33743
33744 // Create an unpackl and unpackh to interleave the sign bits then bitcast
33745 // to v2i64.
33746 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33747 {0, 4, 1, 5});
33748 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
33749 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33750 {2, 6, 3, 7});
33751 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
33752
33753 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33754 Results.push_back(Res);
33755 return;
33756 }
33757
33758 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
33759 if (!InVT.is128BitVector()) {
33760 // Not a 128 bit vector, but maybe type legalization will promote
33761 // it to 128 bits.
33762 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
33763 return;
33764 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
33765 if (!InVT.is128BitVector())
33766 return;
33767
33768 // Promote the input to 128 bits. Type legalization will turn this into
33769 // zext_inreg/sext_inreg.
33770 In = DAG.getNode(Opc, dl, InVT, In);
33771 }
33772
33773 // Perform custom splitting instead of the two stage extend we would get
33774 // by default.
33775 EVT LoVT, HiVT;
33776 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
33777 assert(isTypeLegal(LoVT) && "Split VT not legal?");
33778
33779 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
33780
33781 // We need to shift the input over by half the number of elements.
33782 unsigned NumElts = InVT.getVectorNumElements();
33783 unsigned HalfNumElts = NumElts / 2;
33784 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
33785 for (unsigned i = 0; i != HalfNumElts; ++i)
33786 ShufMask[i] = i + HalfNumElts;
33787
33788 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
33789 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
33790
33791 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33792 Results.push_back(Res);
33793 }
33794 return;
33795 }
33797 case ISD::FP_TO_UINT_SAT: {
33798 if (!Subtarget.hasAVX10_2())
33799 return;
33800
33801 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
33802 EVT VT = N->getValueType(0);
33803 SDValue Op = N->getOperand(0);
33804 EVT OpVT = Op.getValueType();
33805 SDValue Res;
33806
33807 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
33808 if (IsSigned)
33809 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
33810 else
33811 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
33812 Results.push_back(Res);
33813 }
33814 return;
33815 }
33816 case ISD::FP_TO_SINT:
33818 case ISD::FP_TO_UINT:
33820 bool IsStrict = N->isStrictFPOpcode();
33821 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
33822 EVT VT = N->getValueType(0);
33823 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33824 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33825 EVT SrcVT = Src.getValueType();
33826
33827 SDValue Res;
33828 if (isSoftF16(SrcVT, Subtarget)) {
33829 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
33830 if (IsStrict) {
33831 Res =
33832 DAG.getNode(Opc, dl, {VT, MVT::Other},
33833 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
33834 {NVT, MVT::Other}, {Chain, Src})});
33835 Chain = Res.getValue(1);
33836 } else {
33837 Res =
33838 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
33839 }
33840 Results.push_back(Res);
33841 if (IsStrict)
33842 Results.push_back(Chain);
33843
33844 return;
33845 }
33846
33847 if (VT.isVector() && Subtarget.hasFP16() &&
33848 SrcVT.getVectorElementType() == MVT::f16) {
33849 EVT EleVT = VT.getVectorElementType();
33850 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
33851
33852 if (SrcVT != MVT::v8f16) {
33853 SDValue Tmp =
33854 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
33855 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
33856 Ops[0] = Src;
33857 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
33858 }
33859
33860 if (IsStrict) {
33862 Res =
33863 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
33864 Chain = Res.getValue(1);
33865 } else {
33866 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33867 Res = DAG.getNode(Opc, dl, ResVT, Src);
33868 }
33869
33870 // TODO: Need to add exception check code for strict FP.
33871 if (EleVT.getSizeInBits() < 16) {
33872 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
33873 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
33874
33875 // Now widen to 128 bits.
33876 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
33877 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
33878 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
33879 ConcatOps[0] = Res;
33880 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33881 }
33882
33883 Results.push_back(Res);
33884 if (IsStrict)
33885 Results.push_back(Chain);
33886
33887 return;
33888 }
33889
33890 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
33892 "Unexpected type action!");
33893
33894 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
33895 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
33896 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
33898 SDValue Res;
33899 SDValue Chain;
33900 if (IsStrict) {
33901 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
33902 {N->getOperand(0), Src});
33903 Chain = Res.getValue(1);
33904 } else
33905 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
33906
33907 // Preserve what we know about the size of the original result. If the
33908 // result is v2i32, we have to manually widen the assert.
33909 if (PromoteVT == MVT::v2i32)
33910 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33911 DAG.getUNDEF(MVT::v2i32));
33912
33913 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
33914 Res.getValueType(), Res,
33916
33917 if (PromoteVT == MVT::v2i32)
33918 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
33919 DAG.getVectorIdxConstant(0, dl));
33920
33921 // Truncate back to the original width.
33922 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33923
33924 // Now widen to 128 bits.
33925 unsigned NumConcats = 128 / VT.getSizeInBits();
33927 VT.getVectorNumElements() * NumConcats);
33928 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33929 ConcatOps[0] = Res;
33930 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33931 Results.push_back(Res);
33932 if (IsStrict)
33933 Results.push_back(Chain);
33934 return;
33935 }
33936
33937
33938 if (VT == MVT::v2i32) {
33939 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
33940 "Strict unsigned conversion requires AVX512");
33941 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33943 "Unexpected type action!");
33944 if (Src.getValueType() == MVT::v2f64) {
33945 if (!IsSigned && !Subtarget.hasAVX512()) {
33946 SDValue Res =
33947 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
33948 Results.push_back(Res);
33949 return;
33950 }
33951
33952 if (IsStrict)
33954 else
33955 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33956
 33957 // If we have VLX we can emit a target specific FP_TO_UINT node.
33958 if (!IsSigned && !Subtarget.hasVLX()) {
33959 // Otherwise we can defer to the generic legalizer which will widen
33960 // the input as well. This will be further widened during op
33961 // legalization to v8i32<-v8f64.
33962 // For strict nodes we'll need to widen ourselves.
33963 // FIXME: Fix the type legalizer to safely widen strict nodes?
33964 if (!IsStrict)
33965 return;
33966 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
33967 DAG.getConstantFP(0.0, dl, MVT::v2f64));
33968 Opc = N->getOpcode();
33969 }
33970 SDValue Res;
33971 SDValue Chain;
33972 if (IsStrict) {
33973 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33974 {N->getOperand(0), Src});
33975 Chain = Res.getValue(1);
33976 } else {
33977 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
33978 }
33979 Results.push_back(Res);
33980 if (IsStrict)
33981 Results.push_back(Chain);
33982 return;
33983 }
33984
33985 // Custom widen strict v2f32->v2i32 by padding with zeros.
33986 // FIXME: Should generic type legalizer do this?
33987 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
33988 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
33989 DAG.getConstantFP(0.0, dl, MVT::v2f32));
33990 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33991 {N->getOperand(0), Src});
33992 Results.push_back(Res);
33993 Results.push_back(Res.getValue(1));
33994 return;
33995 }
33996
33997 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
33998 // so early out here.
33999 return;
34000 }
34001
34002 assert(!VT.isVector() && "Vectors should have been handled above!");
34003
34004 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34005 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34006 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34007 assert(!Subtarget.is64Bit() && "i64 should be legal");
34008 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34009 // If we use a 128-bit result we might need to use a target specific node.
34010 unsigned SrcElts =
34011 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34012 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34013 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34014 if (NumElts != SrcElts) {
34015 if (IsStrict)
34017 else
34018 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34019 }
34020
34021 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34022 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34023 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34024 ZeroIdx);
34025 SDValue Chain;
34026 if (IsStrict) {
34027 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34028 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34029 Chain = Res.getValue(1);
34030 } else
34031 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34032 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34033 Results.push_back(Res);
34034 if (IsStrict)
34035 Results.push_back(Chain);
34036 return;
34037 }
34038
34039 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34040 SDValue Chain;
34041 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34042 Results.push_back(V);
34043 if (IsStrict)
34044 Results.push_back(Chain);
34045 return;
34046 }
34047
34048 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34049 Results.push_back(V);
34050 if (IsStrict)
34051 Results.push_back(Chain);
34052 }
34053 return;
34054 }
34055 case ISD::LRINT:
34056 if (N->getValueType(0) == MVT::v2i32) {
34057 SDValue Src = N->getOperand(0);
34058 if (Src.getValueType() == MVT::v2f64)
34059 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34060 return;
34061 }
34062 [[fallthrough]];
34063 case ISD::LLRINT: {
34064 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34065 Results.push_back(V);
34066 return;
34067 }
34068
34069 case ISD::SINT_TO_FP:
34071 case ISD::UINT_TO_FP:
34073 bool IsStrict = N->isStrictFPOpcode();
34074 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34075 EVT VT = N->getValueType(0);
34076 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34077 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34078 Subtarget.hasVLX()) {
34079 if (Src.getValueType().getVectorElementType() == MVT::i16)
34080 return;
34081
34082 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34083 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34084 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34085 : DAG.getUNDEF(MVT::v2i32));
34086 if (IsStrict) {
34087 unsigned Opc =
34089 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34090 {N->getOperand(0), Src});
34091 Results.push_back(Res);
34092 Results.push_back(Res.getValue(1));
34093 } else {
34094 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34095 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34096 }
34097 return;
34098 }
34099 if (VT != MVT::v2f32)
34100 return;
34101 EVT SrcVT = Src.getValueType();
34102 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34103 if (IsStrict) {
34104 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34106 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34107 {N->getOperand(0), Src});
34108 Results.push_back(Res);
34109 Results.push_back(Res.getValue(1));
34110 } else {
34111 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34112 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34113 }
34114 return;
34115 }
34116 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34117 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34118 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34119 SDValue One = DAG.getConstant(1, dl, SrcVT);
34120 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34121 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34122 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34123 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34124 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34125 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34126 for (int i = 0; i != 2; ++i) {
34127 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34128 SignSrc, DAG.getVectorIdxConstant(i, dl));
34129 if (IsStrict)
34130 SignCvts[i] =
34131 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34132 {N->getOperand(0), Elt});
34133 else
34134 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34135 };
34136 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34137 SDValue Slow, Chain;
34138 if (IsStrict) {
34139 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34140 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34141 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34142 {Chain, SignCvt, SignCvt});
34143 Chain = Slow.getValue(1);
34144 } else {
34145 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34146 }
34147 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34148 IsNeg =
34149 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34150 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34151 Results.push_back(Cvt);
34152 if (IsStrict)
34153 Results.push_back(Chain);
34154 return;
34155 }
34156
34157 if (SrcVT != MVT::v2i32)
34158 return;
34159
34160 if (IsSigned || Subtarget.hasAVX512()) {
34161 if (!IsStrict)
34162 return;
34163
34164 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34165 // FIXME: Should generic type legalizer do this?
34166 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34167 DAG.getConstant(0, dl, MVT::v2i32));
34168 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34169 {N->getOperand(0), Src});
34170 Results.push_back(Res);
34171 Results.push_back(Res.getValue(1));
34172 return;
34173 }
34174
34175 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34176 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34177 SDValue VBias = DAG.getConstantFP(
34178 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34179 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34180 DAG.getBitcast(MVT::v2i64, VBias));
34181 Or = DAG.getBitcast(MVT::v2f64, Or);
34182 if (IsStrict) {
34183 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34184 {N->getOperand(0), Or, VBias});
34186 {MVT::v4f32, MVT::Other},
34187 {Sub.getValue(1), Sub});
34188 Results.push_back(Res);
34189 Results.push_back(Res.getValue(1));
34190 } else {
34191 // TODO: Are there any fast-math-flags to propagate here?
34192 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34193 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34194 }
34195 return;
34196 }
34198 case ISD::FP_ROUND: {
34199 bool IsStrict = N->isStrictFPOpcode();
34200 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34201 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34202 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34203 EVT SrcVT = Src.getValueType();
34204 EVT VT = N->getValueType(0);
34205 SDValue V;
34206 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34207 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34208 : DAG.getUNDEF(MVT::v2f32);
34209 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34210 }
34211 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34212 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34213 if (SrcVT.getVectorElementType() != MVT::f32)
34214 return;
34215
34216 if (IsStrict)
34217 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34218 {Chain, Src, Rnd});
34219 else
34220 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34221
34222 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34223 if (IsStrict)
34224 Results.push_back(V.getValue(1));
34225 return;
34226 }
34227 if (!isTypeLegal(Src.getValueType()))
34228 return;
34229 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34230 if (IsStrict)
34231 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34232 {Chain, Src});
34233 else
34234 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34235 Results.push_back(V);
34236 if (IsStrict)
34237 Results.push_back(V.getValue(1));
34238 return;
34239 }
34240 case ISD::FP_EXTEND:
34241 case ISD::STRICT_FP_EXTEND: {
34242 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34243 // No other ValueType for FP_EXTEND should reach this point.
34244 assert(N->getValueType(0) == MVT::v2f32 &&
34245 "Do not know how to legalize this Node");
34246 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34247 return;
34248 bool IsStrict = N->isStrictFPOpcode();
34249 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34250 if (Src.getValueType().getVectorElementType() != MVT::f16)
34251 return;
34252 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34253 : DAG.getUNDEF(MVT::v2f16);
34254 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34255 if (IsStrict)
34256 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34257 {N->getOperand(0), V});
34258 else
34259 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34260 Results.push_back(V);
34261 if (IsStrict)
34262 Results.push_back(V.getValue(1));
34263 return;
34264 }
34266 unsigned IntNo = N->getConstantOperandVal(1);
34267 switch (IntNo) {
34268 default : llvm_unreachable("Do not know how to custom type "
34269 "legalize this intrinsic operation!");
34270 case Intrinsic::x86_rdtsc:
34271 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34272 Results);
34273 case Intrinsic::x86_rdtscp:
34274 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34275 Results);
34276 case Intrinsic::x86_rdpmc:
34277 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34278 Results);
34279 return;
34280 case Intrinsic::x86_rdpru:
34281 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34282 Results);
34283 return;
34284 case Intrinsic::x86_xgetbv:
34285 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34286 Results);
34287 return;
34288 }
34289 }
34290 case ISD::READCYCLECOUNTER: {
34291 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34292 }
34294 EVT T = N->getValueType(0);
34295 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34296 bool Regs64bit = T == MVT::i128;
34297 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34298 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34299 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34300 SDValue cpInL, cpInH;
34301 std::tie(cpInL, cpInH) =
34302 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34303 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34304 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34305 cpInH =
34306 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34307 cpInH, cpInL.getValue(1));
34308 SDValue swapInL, swapInH;
34309 std::tie(swapInL, swapInH) =
34310 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34311 swapInH =
34312 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34313 swapInH, cpInH.getValue(1));
34314
34315 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34316 // until later. So we keep the RBX input in a vreg and use a custom
34317 // inserter.
34318 // Since RBX will be a reserved register the register allocator will not
34319 // make sure its value will be properly saved and restored around this
34320 // live-range.
34321 SDValue Result;
34322 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34323 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34324 if (Regs64bit) {
34325 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34326 swapInH.getValue(1)};
34327 Result =
34328 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34329 } else {
34330 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34331 swapInH.getValue(1));
34332 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34333 swapInL.getValue(1)};
34334 Result =
34335 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34336 }
34337
34338 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34339 Regs64bit ? X86::RAX : X86::EAX,
34340 HalfT, Result.getValue(1));
34341 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34342 Regs64bit ? X86::RDX : X86::EDX,
34343 HalfT, cpOutL.getValue(2));
34344 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34345
34346 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34347 MVT::i32, cpOutH.getValue(2));
34348 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34349 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34350
34351 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34352 Results.push_back(Success);
34353 Results.push_back(EFLAGS.getValue(1));
34354 return;
34355 }
34356 case ISD::ATOMIC_LOAD: {
34357 assert(
34358 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34359 "Unexpected VT!");
34360 bool NoImplicitFloatOps =
34362 Attribute::NoImplicitFloat);
34363 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34364 auto *Node = cast<AtomicSDNode>(N);
34365
34366 if (N->getValueType(0) == MVT::i128) {
34367 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34368 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34369 Node->getBasePtr(), Node->getMemOperand());
34370 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34371 DAG.getVectorIdxConstant(0, dl));
34372 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34373 DAG.getVectorIdxConstant(1, dl));
34374 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34375 {ResL, ResH}));
34376 Results.push_back(Ld.getValue(1));
34377 return;
34378 }
34379 break;
34380 }
34381 if (Subtarget.hasSSE1()) {
34382 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34383 // Then extract the lower 64-bits.
34384 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34385 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34386 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34387 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34388 MVT::i64, Node->getMemOperand());
34389 if (Subtarget.hasSSE2()) {
34390 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34391 DAG.getVectorIdxConstant(0, dl));
34392 Results.push_back(Res);
34393 Results.push_back(Ld.getValue(1));
34394 return;
34395 }
34396 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34397 // then casts to i64. This avoids a 128-bit stack temporary being
34398 // created by type legalization if we were to cast v4f32->v2i64.
34399 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34400 DAG.getVectorIdxConstant(0, dl));
34401 Res = DAG.getBitcast(MVT::i64, Res);
34402 Results.push_back(Res);
34403 Results.push_back(Ld.getValue(1));
34404 return;
34405 }
34406 if (Subtarget.hasX87()) {
34407 // First load this into an 80-bit X87 register. This will put the whole
34408 // integer into the significand.
34409 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34410 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34412 dl, Tys, Ops, MVT::i64,
34413 Node->getMemOperand());
34414 SDValue Chain = Result.getValue(1);
34415
34416 // Now store the X87 register to a stack temporary and convert to i64.
34417 // This store is not atomic and doesn't need to be.
34418 // FIXME: We don't need a stack temporary if the result of the load
34419 // is already being stored. We could just directly store there.
34420 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34421 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34422 MachinePointerInfo MPI =
34424 SDValue StoreOps[] = { Chain, Result, StackPtr };
34425 Chain = DAG.getMemIntrinsicNode(
34426 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34427 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34428
34429 // Finally load the value back from the stack temporary and return it.
34430 // This load is not atomic and doesn't need to be.
34431 // This load will be further type legalized.
34432 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34433 Results.push_back(Result);
34434 Results.push_back(Result.getValue(1));
34435 return;
34436 }
34437 }
34438 // TODO: Use MOVLPS when SSE1 is available?
34439 // Delegate to generic TypeLegalization. Situations we can really handle
34440 // should have already been dealt with by AtomicExpandPass.cpp.
34441 break;
34442 }
34443 case ISD::ATOMIC_SWAP:
34454 // Delegate to generic TypeLegalization. Situations we can really handle
34455 // should have already been dealt with by AtomicExpandPass.cpp.
34456 break;
34457
34458 case ISD::BITCAST: {
34459 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34460 EVT DstVT = N->getValueType(0);
34461 EVT SrcVT = N->getOperand(0).getValueType();
34462
34463 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34464 // we can split using the k-register rather than memory.
34465 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34466 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34467 SDValue Lo, Hi;
34468 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34469 Lo = DAG.getBitcast(MVT::i32, Lo);
34470 Hi = DAG.getBitcast(MVT::i32, Hi);
34471 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34472 Results.push_back(Res);
34473 return;
34474 }
34475
34476 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34477 // FIXME: Use v4f32 for SSE1?
34478 assert(Subtarget.hasSSE2() && "Requires SSE2");
34479 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34480 "Unexpected type action!");
34481 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34482 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34483 N->getOperand(0));
34484 Res = DAG.getBitcast(WideVT, Res);
34485 Results.push_back(Res);
34486 return;
34487 }
34488
34489 return;
34490 }
34491 case ISD::MGATHER: {
34492 EVT VT = N->getValueType(0);
34493 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34494 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34495 auto *Gather = cast<MaskedGatherSDNode>(N);
34496 SDValue Index = Gather->getIndex();
34497 if (Index.getValueType() != MVT::v2i64)
34498 return;
34500 "Unexpected type action!");
34501 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34502 SDValue Mask = Gather->getMask();
34503 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34504 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34505 Gather->getPassThru(),
34506 DAG.getUNDEF(VT));
34507 if (!Subtarget.hasVLX()) {
34508 // We need to widen the mask, but the instruction will only use 2
34509 // of its elements. So we can use undef.
34510 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34511 DAG.getUNDEF(MVT::v2i1));
34512 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34513 }
34514 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34515 Gather->getBasePtr(), Index, Gather->getScale() };
34516 SDValue Res = DAG.getMemIntrinsicNode(
34517 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34518 Gather->getMemoryVT(), Gather->getMemOperand());
34519 Results.push_back(Res);
34520 Results.push_back(Res.getValue(1));
34521 return;
34522 }
34523 return;
34524 }
34525 case ISD::LOAD: {
34526 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34527 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
34528 // cast since type legalization will try to use an i64 load.
34529 MVT VT = N->getSimpleValueType(0);
34530 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34532 "Unexpected type action!");
34533 if (!ISD::isNON_EXTLoad(N))
34534 return;
34535 auto *Ld = cast<LoadSDNode>(N);
34536 if (Subtarget.hasSSE2()) {
34537 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34538 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34539 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34540 Ld->getMemOperand()->getFlags());
34541 SDValue Chain = Res.getValue(1);
34542 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34543 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34544 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34545 Res = DAG.getBitcast(WideVT, Res);
34546 Results.push_back(Res);
34547 Results.push_back(Chain);
34548 return;
34549 }
34550 assert(Subtarget.hasSSE1() && "Expected SSE");
34551 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34552 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34553 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34554 MVT::i64, Ld->getMemOperand());
34555 Results.push_back(Res);
34556 Results.push_back(Res.getValue(1));
34557 return;
34558 }
34559 case ISD::ADDRSPACECAST: {
34560 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34561 Results.push_back(V);
34562 return;
34563 }
34564 case ISD::BITREVERSE: {
34565 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34566 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34567 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34568 // We'll need to move the scalar in two i32 pieces.
34569 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34570 return;
34571 }
34573 // f16 = extract vXf16 %vec, i64 %idx
34574 assert(N->getSimpleValueType(0) == MVT::f16 &&
34575 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34576 assert(Subtarget.hasFP16() && "Expected FP16");
34577 SDValue VecOp = N->getOperand(0);
34579 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34580 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34581 N->getOperand(1));
34582 Split = DAG.getBitcast(MVT::f16, Split);
34583 Results.push_back(Split);
34584 return;
34585 }
34586 }
34587}
34588
34589const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34590 switch ((X86ISD::NodeType)Opcode) {
34591 case X86ISD::FIRST_NUMBER: break;
34592#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34593 NODE_NAME_CASE(BSF)
34594 NODE_NAME_CASE(BSR)
34595 NODE_NAME_CASE(FSHL)
34596 NODE_NAME_CASE(FSHR)
34597 NODE_NAME_CASE(FAND)
34598 NODE_NAME_CASE(FANDN)
34599 NODE_NAME_CASE(FOR)
34600 NODE_NAME_CASE(FXOR)
34601 NODE_NAME_CASE(FILD)
34602 NODE_NAME_CASE(FIST)
34603 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34604 NODE_NAME_CASE(FLD)
34605 NODE_NAME_CASE(FST)
34606 NODE_NAME_CASE(CALL)
34607 NODE_NAME_CASE(CALL_RVMARKER)
34609 NODE_NAME_CASE(CMP)
34610 NODE_NAME_CASE(FCMP)
34611 NODE_NAME_CASE(STRICT_FCMP)
34612 NODE_NAME_CASE(STRICT_FCMPS)
34614 NODE_NAME_CASE(UCOMI)
34615 NODE_NAME_CASE(COMX)
34616 NODE_NAME_CASE(UCOMX)
34617 NODE_NAME_CASE(CMPM)
34618 NODE_NAME_CASE(CMPMM)
34619 NODE_NAME_CASE(STRICT_CMPM)
34620 NODE_NAME_CASE(CMPMM_SAE)
34621 NODE_NAME_CASE(SETCC)
34622 NODE_NAME_CASE(SETCC_CARRY)
34623 NODE_NAME_CASE(FSETCC)
34624 NODE_NAME_CASE(FSETCCM)
34625 NODE_NAME_CASE(FSETCCM_SAE)
34626 NODE_NAME_CASE(CMOV)
34627 NODE_NAME_CASE(BRCOND)
34628 NODE_NAME_CASE(RET_GLUE)
34629 NODE_NAME_CASE(IRET)
34630 NODE_NAME_CASE(REP_STOS)
34631 NODE_NAME_CASE(REP_MOVS)
34632 NODE_NAME_CASE(GlobalBaseReg)
34634 NODE_NAME_CASE(WrapperRIP)
34635 NODE_NAME_CASE(MOVQ2DQ)
34636 NODE_NAME_CASE(MOVDQ2Q)
34637 NODE_NAME_CASE(MMX_MOVD2W)
34638 NODE_NAME_CASE(MMX_MOVW2D)
34639 NODE_NAME_CASE(PEXTRB)
34640 NODE_NAME_CASE(PEXTRW)
34641 NODE_NAME_CASE(INSERTPS)
34642 NODE_NAME_CASE(PINSRB)
34643 NODE_NAME_CASE(PINSRW)
34644 NODE_NAME_CASE(PSHUFB)
34645 NODE_NAME_CASE(ANDNP)
34646 NODE_NAME_CASE(BLENDI)
34648 NODE_NAME_CASE(HADD)
34649 NODE_NAME_CASE(HSUB)
34650 NODE_NAME_CASE(FHADD)
34651 NODE_NAME_CASE(FHSUB)
34652 NODE_NAME_CASE(CONFLICT)
34653 NODE_NAME_CASE(FMAX)
34654 NODE_NAME_CASE(FMAXS)
34655 NODE_NAME_CASE(FMAX_SAE)
34656 NODE_NAME_CASE(FMAXS_SAE)
34657 NODE_NAME_CASE(STRICT_FMAX)
34658 NODE_NAME_CASE(FMIN)
34659 NODE_NAME_CASE(FMINS)
34660 NODE_NAME_CASE(FMIN_SAE)
34661 NODE_NAME_CASE(FMINS_SAE)
34662 NODE_NAME_CASE(STRICT_FMIN)
34663 NODE_NAME_CASE(FMAXC)
34664 NODE_NAME_CASE(FMINC)
34665 NODE_NAME_CASE(FRSQRT)
34666 NODE_NAME_CASE(FRCP)
34667 NODE_NAME_CASE(EXTRQI)
34668 NODE_NAME_CASE(INSERTQI)
34669 NODE_NAME_CASE(TLSADDR)
34670 NODE_NAME_CASE(TLSBASEADDR)
34671 NODE_NAME_CASE(TLSCALL)
34672 NODE_NAME_CASE(TLSDESC)
34673 NODE_NAME_CASE(EH_SJLJ_SETJMP)
34674 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
34675 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
34676 NODE_NAME_CASE(EH_RETURN)
34677 NODE_NAME_CASE(TC_RETURN)
34678 NODE_NAME_CASE(FNSTCW16m)
34679 NODE_NAME_CASE(FLDCW16m)
34680 NODE_NAME_CASE(FNSTENVm)
34681 NODE_NAME_CASE(FLDENVm)
34682 NODE_NAME_CASE(LCMPXCHG_DAG)
34683 NODE_NAME_CASE(LCMPXCHG8_DAG)
34684 NODE_NAME_CASE(LCMPXCHG16_DAG)
34685 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
34686 NODE_NAME_CASE(LADD)
34687 NODE_NAME_CASE(LSUB)
34688 NODE_NAME_CASE(LOR)
34689 NODE_NAME_CASE(LXOR)
34690 NODE_NAME_CASE(LAND)
34691 NODE_NAME_CASE(LBTS)
34692 NODE_NAME_CASE(LBTC)
34693 NODE_NAME_CASE(LBTR)
34694 NODE_NAME_CASE(LBTS_RM)
34695 NODE_NAME_CASE(LBTC_RM)
34696 NODE_NAME_CASE(LBTR_RM)
34697 NODE_NAME_CASE(AADD)
34698 NODE_NAME_CASE(AOR)
34699 NODE_NAME_CASE(AXOR)
34700 NODE_NAME_CASE(AAND)
34701 NODE_NAME_CASE(VZEXT_MOVL)
34702 NODE_NAME_CASE(VZEXT_LOAD)
34703 NODE_NAME_CASE(VEXTRACT_STORE)
34704 NODE_NAME_CASE(VTRUNC)
34705 NODE_NAME_CASE(VTRUNCS)
34706 NODE_NAME_CASE(VTRUNCUS)
34707 NODE_NAME_CASE(VMTRUNC)
34708 NODE_NAME_CASE(VMTRUNCS)
34709 NODE_NAME_CASE(VMTRUNCUS)
34710 NODE_NAME_CASE(VTRUNCSTORES)
34711 NODE_NAME_CASE(VTRUNCSTOREUS)
34712 NODE_NAME_CASE(VMTRUNCSTORES)
34713 NODE_NAME_CASE(VMTRUNCSTOREUS)
34714 NODE_NAME_CASE(VFPEXT)
34715 NODE_NAME_CASE(STRICT_VFPEXT)
34716 NODE_NAME_CASE(VFPEXT_SAE)
34717 NODE_NAME_CASE(VFPEXTS)
34718 NODE_NAME_CASE(VFPEXTS_SAE)
34719 NODE_NAME_CASE(VFPROUND)
34720 NODE_NAME_CASE(VFPROUND2)
34721 NODE_NAME_CASE(VFPROUND2_RND)
34722 NODE_NAME_CASE(STRICT_VFPROUND)
34723 NODE_NAME_CASE(VMFPROUND)
34724 NODE_NAME_CASE(VFPROUND_RND)
34725 NODE_NAME_CASE(VFPROUNDS)
34726 NODE_NAME_CASE(VFPROUNDS_RND)
34727 NODE_NAME_CASE(VSHLDQ)
34728 NODE_NAME_CASE(VSRLDQ)
34729 NODE_NAME_CASE(VSHL)
34730 NODE_NAME_CASE(VSRL)
34731 NODE_NAME_CASE(VSRA)
34732 NODE_NAME_CASE(VSHLI)
34733 NODE_NAME_CASE(VSRLI)
34734 NODE_NAME_CASE(VSRAI)
34735 NODE_NAME_CASE(VSHLV)
34736 NODE_NAME_CASE(VSRLV)
34737 NODE_NAME_CASE(VSRAV)
34738 NODE_NAME_CASE(VROTLI)
34739 NODE_NAME_CASE(VROTRI)
34740 NODE_NAME_CASE(VPPERM)
34741 NODE_NAME_CASE(CMPP)
34742 NODE_NAME_CASE(STRICT_CMPP)
34743 NODE_NAME_CASE(PCMPEQ)
34744 NODE_NAME_CASE(PCMPGT)
34745 NODE_NAME_CASE(PHMINPOS)
34746 NODE_NAME_CASE(ADD)
34747 NODE_NAME_CASE(SUB)
34748 NODE_NAME_CASE(ADC)
34749 NODE_NAME_CASE(SBB)
34750 NODE_NAME_CASE(SMUL)
34751 NODE_NAME_CASE(UMUL)
34752 NODE_NAME_CASE(OR)
34753 NODE_NAME_CASE(XOR)
34754 NODE_NAME_CASE(AND)
34755 NODE_NAME_CASE(BEXTR)
34757 NODE_NAME_CASE(BZHI)
34758 NODE_NAME_CASE(PDEP)
34759 NODE_NAME_CASE(PEXT)
34760 NODE_NAME_CASE(MUL_IMM)
34761 NODE_NAME_CASE(MOVMSK)
34762 NODE_NAME_CASE(PTEST)
34763 NODE_NAME_CASE(TESTP)
34764 NODE_NAME_CASE(KORTEST)
34765 NODE_NAME_CASE(KTEST)
34766 NODE_NAME_CASE(KADD)
34767 NODE_NAME_CASE(KSHIFTL)
34768 NODE_NAME_CASE(KSHIFTR)
34769 NODE_NAME_CASE(PACKSS)
34770 NODE_NAME_CASE(PACKUS)
34771 NODE_NAME_CASE(PALIGNR)
34772 NODE_NAME_CASE(VALIGN)
34773 NODE_NAME_CASE(VSHLD)
34774 NODE_NAME_CASE(VSHRD)
34775 NODE_NAME_CASE(VSHLDV)
34776 NODE_NAME_CASE(VSHRDV)
34777 NODE_NAME_CASE(PSHUFD)
34778 NODE_NAME_CASE(PSHUFHW)
34779 NODE_NAME_CASE(PSHUFLW)
34780 NODE_NAME_CASE(SHUFP)
34781 NODE_NAME_CASE(SHUF128)
34782 NODE_NAME_CASE(MOVLHPS)
34783 NODE_NAME_CASE(MOVHLPS)
34784 NODE_NAME_CASE(MOVDDUP)
34785 NODE_NAME_CASE(MOVSHDUP)
34786 NODE_NAME_CASE(MOVSLDUP)
34787 NODE_NAME_CASE(MOVSD)
34788 NODE_NAME_CASE(MOVSS)
34789 NODE_NAME_CASE(MOVSH)
34790 NODE_NAME_CASE(UNPCKL)
34791 NODE_NAME_CASE(UNPCKH)
34792 NODE_NAME_CASE(VBROADCAST)
34793 NODE_NAME_CASE(VBROADCAST_LOAD)
34794 NODE_NAME_CASE(VBROADCASTM)
34795 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
34796 NODE_NAME_CASE(VPERMILPV)
34797 NODE_NAME_CASE(VPERMILPI)
34798 NODE_NAME_CASE(VPERM2X128)
34799 NODE_NAME_CASE(VPERMV)
34800 NODE_NAME_CASE(VPERMV3)
34801 NODE_NAME_CASE(VPERMI)
34802 NODE_NAME_CASE(VPTERNLOG)
34803 NODE_NAME_CASE(FP_TO_SINT_SAT)
34804 NODE_NAME_CASE(FP_TO_UINT_SAT)
34805 NODE_NAME_CASE(VFIXUPIMM)
34806 NODE_NAME_CASE(VFIXUPIMM_SAE)
34807 NODE_NAME_CASE(VFIXUPIMMS)
34808 NODE_NAME_CASE(VFIXUPIMMS_SAE)
34809 NODE_NAME_CASE(VRANGE)
34810 NODE_NAME_CASE(VRANGE_SAE)
34811 NODE_NAME_CASE(VRANGES)
34812 NODE_NAME_CASE(VRANGES_SAE)
34813 NODE_NAME_CASE(PMULUDQ)
34814 NODE_NAME_CASE(PMULDQ)
34815 NODE_NAME_CASE(PSADBW)
34816 NODE_NAME_CASE(DBPSADBW)
34817 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
34818 NODE_NAME_CASE(VAARG_64)
34819 NODE_NAME_CASE(VAARG_X32)
34820 NODE_NAME_CASE(DYN_ALLOCA)
34821 NODE_NAME_CASE(MFENCE)
34822 NODE_NAME_CASE(SEG_ALLOCA)
34823 NODE_NAME_CASE(PROBED_ALLOCA)
34826 NODE_NAME_CASE(RDPKRU)
34827 NODE_NAME_CASE(WRPKRU)
34828 NODE_NAME_CASE(VPMADDUBSW)
34829 NODE_NAME_CASE(VPMADDWD)
34830 NODE_NAME_CASE(VPSHA)
34831 NODE_NAME_CASE(VPSHL)
34832 NODE_NAME_CASE(VPCOM)
34833 NODE_NAME_CASE(VPCOMU)
34834 NODE_NAME_CASE(VPERMIL2)
34836 NODE_NAME_CASE(STRICT_FMSUB)
34838 NODE_NAME_CASE(STRICT_FNMADD)
34840 NODE_NAME_CASE(STRICT_FNMSUB)
34841 NODE_NAME_CASE(FMADDSUB)
34842 NODE_NAME_CASE(FMSUBADD)
34843 NODE_NAME_CASE(FMADD_RND)
34844 NODE_NAME_CASE(FNMADD_RND)
34845 NODE_NAME_CASE(FMSUB_RND)
34846 NODE_NAME_CASE(FNMSUB_RND)
34847 NODE_NAME_CASE(FMADDSUB_RND)
34848 NODE_NAME_CASE(FMSUBADD_RND)
34849 NODE_NAME_CASE(VFMADDC)
34850 NODE_NAME_CASE(VFMADDC_RND)
34851 NODE_NAME_CASE(VFCMADDC)
34852 NODE_NAME_CASE(VFCMADDC_RND)
34853 NODE_NAME_CASE(VFMULC)
34854 NODE_NAME_CASE(VFMULC_RND)
34855 NODE_NAME_CASE(VFCMULC)
34856 NODE_NAME_CASE(VFCMULC_RND)
34857 NODE_NAME_CASE(VFMULCSH)
34858 NODE_NAME_CASE(VFMULCSH_RND)
34859 NODE_NAME_CASE(VFCMULCSH)
34860 NODE_NAME_CASE(VFCMULCSH_RND)
34861 NODE_NAME_CASE(VFMADDCSH)
34862 NODE_NAME_CASE(VFMADDCSH_RND)
34863 NODE_NAME_CASE(VFCMADDCSH)
34864 NODE_NAME_CASE(VFCMADDCSH_RND)
34865 NODE_NAME_CASE(VPMADD52H)
34866 NODE_NAME_CASE(VPMADD52L)
34867 NODE_NAME_CASE(VRNDSCALE)
34868 NODE_NAME_CASE(STRICT_VRNDSCALE)
34869 NODE_NAME_CASE(VRNDSCALE_SAE)
34870 NODE_NAME_CASE(VRNDSCALES)
34871 NODE_NAME_CASE(VRNDSCALES_SAE)
34872 NODE_NAME_CASE(VREDUCE)
34873 NODE_NAME_CASE(VREDUCE_SAE)
34874 NODE_NAME_CASE(VREDUCES)
34875 NODE_NAME_CASE(VREDUCES_SAE)
34876 NODE_NAME_CASE(VGETMANT)
34877 NODE_NAME_CASE(VGETMANT_SAE)
34878 NODE_NAME_CASE(VGETMANTS)
34879 NODE_NAME_CASE(VGETMANTS_SAE)
34880 NODE_NAME_CASE(PCMPESTR)
34881 NODE_NAME_CASE(PCMPISTR)
34883 NODE_NAME_CASE(COMPRESS)
34885 NODE_NAME_CASE(SELECTS)
34886 NODE_NAME_CASE(ADDSUB)
34887 NODE_NAME_CASE(RCP14)
34888 NODE_NAME_CASE(RCP14S)
34889 NODE_NAME_CASE(RSQRT14)
34890 NODE_NAME_CASE(RSQRT14S)
34891 NODE_NAME_CASE(FADD_RND)
34892 NODE_NAME_CASE(FADDS)
34893 NODE_NAME_CASE(FADDS_RND)
34894 NODE_NAME_CASE(FSUB_RND)
34895 NODE_NAME_CASE(FSUBS)
34896 NODE_NAME_CASE(FSUBS_RND)
34897 NODE_NAME_CASE(FMUL_RND)
34898 NODE_NAME_CASE(FMULS)
34899 NODE_NAME_CASE(FMULS_RND)
34900 NODE_NAME_CASE(FDIV_RND)
34901 NODE_NAME_CASE(FDIVS)
34902 NODE_NAME_CASE(FDIVS_RND)
34903 NODE_NAME_CASE(FSQRT_RND)
34904 NODE_NAME_CASE(FSQRTS)
34905 NODE_NAME_CASE(FSQRTS_RND)
34906 NODE_NAME_CASE(FGETEXP)
34907 NODE_NAME_CASE(FGETEXP_SAE)
34908 NODE_NAME_CASE(FGETEXPS)
34909 NODE_NAME_CASE(FGETEXPS_SAE)
34910 NODE_NAME_CASE(SCALEF)
34911 NODE_NAME_CASE(SCALEF_RND)
34912 NODE_NAME_CASE(SCALEFS)
34913 NODE_NAME_CASE(SCALEFS_RND)
34914 NODE_NAME_CASE(MULHRS)
34915 NODE_NAME_CASE(SINT_TO_FP_RND)
34916 NODE_NAME_CASE(UINT_TO_FP_RND)
34917 NODE_NAME_CASE(CVTTP2SI)
34918 NODE_NAME_CASE(CVTTP2UI)
34919 NODE_NAME_CASE(STRICT_CVTTP2SI)
34920 NODE_NAME_CASE(STRICT_CVTTP2UI)
34921 NODE_NAME_CASE(MCVTTP2SI)
34922 NODE_NAME_CASE(MCVTTP2UI)
34923 NODE_NAME_CASE(CVTTP2SI_SAE)
34924 NODE_NAME_CASE(CVTTP2UI_SAE)
34925 NODE_NAME_CASE(CVTTS2SI)
34926 NODE_NAME_CASE(CVTTS2UI)
34927 NODE_NAME_CASE(CVTTS2SI_SAE)
34928 NODE_NAME_CASE(CVTTS2UI_SAE)
34929 NODE_NAME_CASE(CVTSI2P)
34930 NODE_NAME_CASE(CVTUI2P)
34931 NODE_NAME_CASE(STRICT_CVTSI2P)
34932 NODE_NAME_CASE(STRICT_CVTUI2P)
34933 NODE_NAME_CASE(MCVTSI2P)
34934 NODE_NAME_CASE(MCVTUI2P)
34935 NODE_NAME_CASE(VFPCLASS)
34936 NODE_NAME_CASE(VFPCLASSS)
34937 NODE_NAME_CASE(MULTISHIFT)
34938 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
34939 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
34940 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
34941 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
34942 NODE_NAME_CASE(CVTPS2PH)
34943 NODE_NAME_CASE(STRICT_CVTPS2PH)
34944 NODE_NAME_CASE(CVTPS2PH_SAE)
34945 NODE_NAME_CASE(MCVTPS2PH)
34946 NODE_NAME_CASE(MCVTPS2PH_SAE)
34947 NODE_NAME_CASE(CVTPH2PS)
34948 NODE_NAME_CASE(STRICT_CVTPH2PS)
34949 NODE_NAME_CASE(CVTPH2PS_SAE)
34950 NODE_NAME_CASE(CVTP2SI)
34951 NODE_NAME_CASE(CVTP2UI)
34952 NODE_NAME_CASE(MCVTP2SI)
34953 NODE_NAME_CASE(MCVTP2UI)
34954 NODE_NAME_CASE(CVTP2SI_RND)
34955 NODE_NAME_CASE(CVTP2UI_RND)
34956 NODE_NAME_CASE(CVTS2SI)
34957 NODE_NAME_CASE(CVTS2UI)
34958 NODE_NAME_CASE(CVTS2SI_RND)
34959 NODE_NAME_CASE(CVTS2UI_RND)
34960 NODE_NAME_CASE(CVTNEPS2BF16)
34961 NODE_NAME_CASE(MCVTNEPS2BF16)
34962 NODE_NAME_CASE(DPBF16PS)
34963 NODE_NAME_CASE(DPFP16PS)
34964 NODE_NAME_CASE(MPSADBW)
34965 NODE_NAME_CASE(LWPINS)
34966 NODE_NAME_CASE(MGATHER)
34967 NODE_NAME_CASE(MSCATTER)
34968 NODE_NAME_CASE(VPDPBUSD)
34969 NODE_NAME_CASE(VPDPBUSDS)
34970 NODE_NAME_CASE(VPDPWSSD)
34971 NODE_NAME_CASE(VPDPWSSDS)
34972 NODE_NAME_CASE(VPSHUFBITQMB)
34973 NODE_NAME_CASE(GF2P8MULB)
34974 NODE_NAME_CASE(GF2P8AFFINEQB)
34975 NODE_NAME_CASE(GF2P8AFFINEINVQB)
34976 NODE_NAME_CASE(NT_CALL)
34977 NODE_NAME_CASE(NT_BRIND)
34978 NODE_NAME_CASE(UMWAIT)
34979 NODE_NAME_CASE(TPAUSE)
34980 NODE_NAME_CASE(ENQCMD)
34981 NODE_NAME_CASE(ENQCMDS)
34982 NODE_NAME_CASE(VP2INTERSECT)
34983 NODE_NAME_CASE(VPDPBSUD)
34984 NODE_NAME_CASE(VPDPBSUDS)
34985 NODE_NAME_CASE(VPDPBUUD)
34986 NODE_NAME_CASE(VPDPBUUDS)
34987 NODE_NAME_CASE(VPDPBSSD)
34988 NODE_NAME_CASE(VPDPBSSDS)
34989 NODE_NAME_CASE(VPDPWSUD)
34990 NODE_NAME_CASE(VPDPWSUDS)
34991 NODE_NAME_CASE(VPDPWUSD)
34992 NODE_NAME_CASE(VPDPWUSDS)
34993 NODE_NAME_CASE(VPDPWUUD)
34994 NODE_NAME_CASE(VPDPWUUDS)
34995 NODE_NAME_CASE(VMINMAX)
34996 NODE_NAME_CASE(VMINMAX_SAE)
34997 NODE_NAME_CASE(VMINMAXS)
34998 NODE_NAME_CASE(VMINMAXS_SAE)
34999 NODE_NAME_CASE(CVTP2IBS)
35000 NODE_NAME_CASE(CVTP2IUBS)
35001 NODE_NAME_CASE(CVTP2IBS_RND)
35002 NODE_NAME_CASE(CVTP2IUBS_RND)
35003 NODE_NAME_CASE(CVTTP2IBS)
35004 NODE_NAME_CASE(CVTTP2IUBS)
35005 NODE_NAME_CASE(CVTTP2IBS_SAE)
35006 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35007 NODE_NAME_CASE(VCVT2PH2BF8)
35008 NODE_NAME_CASE(VCVT2PH2BF8S)
35009 NODE_NAME_CASE(VCVT2PH2HF8)
35010 NODE_NAME_CASE(VCVT2PH2HF8S)
35011 NODE_NAME_CASE(VCVTBIASPH2BF8)
35012 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35013 NODE_NAME_CASE(VCVTBIASPH2HF8)
35014 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35015 NODE_NAME_CASE(VCVTPH2BF8)
35016 NODE_NAME_CASE(VCVTPH2BF8S)
35017 NODE_NAME_CASE(VCVTPH2HF8)
35018 NODE_NAME_CASE(VCVTPH2HF8S)
35019 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35020 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35021 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35022 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35023 NODE_NAME_CASE(VMCVTPH2BF8)
35024 NODE_NAME_CASE(VMCVTPH2BF8S)
35025 NODE_NAME_CASE(VMCVTPH2HF8)
35026 NODE_NAME_CASE(VMCVTPH2HF8S)
35027 NODE_NAME_CASE(VCVTHF82PH)
35028 NODE_NAME_CASE(AESENC128KL)
35029 NODE_NAME_CASE(AESDEC128KL)
35030 NODE_NAME_CASE(AESENC256KL)
35031 NODE_NAME_CASE(AESDEC256KL)
35032 NODE_NAME_CASE(AESENCWIDE128KL)
35033 NODE_NAME_CASE(AESDECWIDE128KL)
35034 NODE_NAME_CASE(AESENCWIDE256KL)
35035 NODE_NAME_CASE(AESDECWIDE256KL)
35036 NODE_NAME_CASE(CMPCCXADD)
35037 NODE_NAME_CASE(TESTUI)
35038 NODE_NAME_CASE(FP80_ADD)
35039 NODE_NAME_CASE(STRICT_FP80_ADD)
35040 NODE_NAME_CASE(CCMP)
35041 NODE_NAME_CASE(CTEST)
35042 NODE_NAME_CASE(CLOAD)
35043 NODE_NAME_CASE(CSTORE)
35044 NODE_NAME_CASE(CVTTS2SIS)
35045 NODE_NAME_CASE(CVTTS2UIS)
35046 NODE_NAME_CASE(CVTTS2SIS_SAE)
35047 NODE_NAME_CASE(CVTTS2UIS_SAE)
35048 NODE_NAME_CASE(CVTTP2SIS)
35049 NODE_NAME_CASE(MCVTTP2SIS)
35050 NODE_NAME_CASE(CVTTP2UIS_SAE)
35051 NODE_NAME_CASE(CVTTP2SIS_SAE)
35052 NODE_NAME_CASE(CVTTP2UIS)
35053 NODE_NAME_CASE(MCVTTP2UIS)
35054 }
35055 return nullptr;
35056#undef NODE_NAME_CASE
35057}
35058
 35059/// Return true if the addressing mode represented by AM is legal for this
 35060/// target, for a load/store of the specified type.
// NOTE(review): the extraction dropped the opening signature line (35061) and
// line 35066, which evidently defined the code-model value 'M' consulted
// below -- confirm against the original source file.
 35062 const AddrMode &AM, Type *Ty,
 35063 unsigned AS,
 35064 Instruction *I) const {
 35065 // X86 supports extremely general addressing modes.
 35067
 35068 // X86 allows a sign-extended 32-bit immediate field as a displacement.
 35069 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
 35070 return false;
 35071
 35072 if (AM.BaseGV) {
 35073 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
 35074
 35075 // If a reference to this global requires an extra load, we can't fold it.
 35076 if (isGlobalStubReference(GVFlags))
 35077 return false;
 35078
 35079 // If BaseGV requires a register for the PIC base, we cannot also have a
 35080 // BaseReg specified.
 35081 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
 35082 return false;
 35083
 35084 // If lower 4G is not available, then we must use rip-relative addressing.
 35085 if ((M != CodeModel::Small || isPositionIndependent()) &&
 35086 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
 35087 return false;
 35088 }
 35089
// Only scales encodable by the x86 SIB byte (1/2/4/8) are accepted directly;
// 3/5/9 are accepted as base+scaled-index (LEA-style) and therefore cannot
// coexist with an explicit base register.
 35090 switch (AM.Scale) {
 35091 case 0:
 35092 case 1:
 35093 case 2:
 35094 case 4:
 35095 case 8:
 35096 // These scales always work.
 35097 break;
 35098 case 3:
 35099 case 5:
 35100 case 9:
 35101 // These scales are formed with basereg+scalereg. Only accept if there is
 35102 // no basereg yet.
 35103 if (AM.HasBaseReg)
 35104 return false;
 35105 break;
 35106 default: // Other stuff never works.
 35107 return false;
 35108 }
 35109
 35110 return true;
 35111}
35112
// Reports whether Opcode is a (possibly non-commutative) binary operation for
// generic DAG-combine purposes. The listed X86ISD opcodes are all two-operand
// nodes; anything else is delegated to the base-class classification.
 35113bool X86TargetLowering::isBinOp(unsigned Opcode) const {
 35114 switch (Opcode) {
 35115 // These are non-commutative binops.
 35116 // TODO: Add more X86ISD opcodes once we have test coverage.
 35117 case X86ISD::ANDNP:
 35118 case X86ISD::PCMPGT:
 35119 case X86ISD::FMAX:
 35120 case X86ISD::FMIN:
 35121 case X86ISD::FANDN:
 35122 case X86ISD::VPSHA:
 35123 case X86ISD::VPSHL:
 35124 case X86ISD::VSHLV:
 35125 case X86ISD::VSRLV:
 35126 case X86ISD::VSRAV:
 35127 return true;
 35128 }
 35129
 35130 return TargetLoweringBase::isBinOp(Opcode);
 35131}
35132
// Reports whether Opcode is a commutative binary operation, i.e. its two
// operands may be freely swapped by DAG combines.
 35133bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
 35134 switch (Opcode) {
 35135 // TODO: Add more X86ISD opcodes once we have test coverage.
 35136 case X86ISD::PCMPEQ:
 35137 case X86ISD::PMULDQ:
 35138 case X86ISD::PMULUDQ:
 35139 case X86ISD::FMAXC:
 35140 case X86ISD::FMINC:
 35141 case X86ISD::FAND:
 35142 case X86ISD::FOR:
 35143 case X86ISD::FXOR:
 35144 return true;
 35145 }
 35146
// NOTE(review): line 35147 -- presumably the delegating call to
// TargetLoweringBase::isCommutativeBinOp(Opcode) -- was dropped by the doc
// extraction; confirm against the original source.
 35148}
35149
// NOTE(review): the signature line (35150) was dropped by the extraction;
// from the body this is the IR-type overload of isTruncateFree(Ty1, Ty2):
// a truncate from a wider integer type to a narrower one is free on x86.
 35151 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
 35152 return false;
 35153 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
 35154 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
 35155 return NumBits1 > NumBits2;
 35156}
35157
// NOTE(review): the signature line (35158) was dropped; from the body and the
// comment about zeroext/signext return parameters this appears to be
// allowTruncateForTailCall(Ty1, Ty2) -- confirm against the original source.
 35159 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
 35160 return false;
 35161
// Only allow the truncate when the wider type is itself legal for this
// subtarget.
 35162 if (!isTypeLegal(EVT::getEVT(Ty1)))
 35163 return false;
 35164
 35165 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
 35166
 35167 // Assuming the caller doesn't have a zeroext or signext return parameter,
 35168 // truncation all the way down to i1 is valid.
 35169 return true;
 35170}
35171
// NOTE(review): signature line (35172) dropped; given its position before the
// add/store immediate predicates below, this is presumably
// isLegalICmpImmediate -- x86 compares take a sign-extended 32-bit immediate.
 35173 return isInt<32>(Imm);
 35174}
35175
// NOTE(review): signature line (35176) dropped; presumably
// isLegalAddImmediate. A sign-extended 32-bit immediate is encodable in add
// (and, via negation, sub).
 35177 // Can also use sub to handle negated immediates.
 35178 return isInt<32>(Imm);
 35179}
35180
// NOTE(review): signature line (35181) dropped; presumably
// isLegalStoreImmediate -- stores accept a sign-extended 32-bit immediate.
 35182 return isInt<32>(Imm);
 35183}
35184
// NOTE(review): signature line (35185) dropped; from the body this is the
// EVT overload of isTruncateFree: truncating a wider scalar integer VT to a
// narrower one is free on x86.
 35186 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
 35187 return false;
 35188 unsigned NumBits1 = VT1.getSizeInBits();
 35189 unsigned NumBits2 = VT2.getSizeInBits();
 35190 return NumBits1 > NumBits2;
 35191}
35192
// NOTE(review): signature line (35193) dropped; from the body this is the
// IR-type overload of isZExtFree(Ty1, Ty2): i32 -> i64 zext is free because
// 32-bit ops implicitly clear the upper half of 64-bit registers.
 35194 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
 35195 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
 35196}
35197
// NOTE(review): signature line (35198) dropped; EVT overload of isZExtFree,
// mirroring the IR-type overload above for MVT::i32 -> MVT::i64.
 35199 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
 35200 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
 35201}
35202
// NOTE(review): signature line (35203) dropped; from the body this is the
// SDValue overload of isZExtFree(Val, VT2): besides the i32->i64 case it also
// treats zero-extension of 8/16/32-bit loads as free, since x86 has
// zero-extending load instructions for those widths.
 35204 EVT VT1 = Val.getValueType();
 35205 if (isZExtFree(VT1, VT2))
 35206 return true;
 35207
 35208 if (Val.getOpcode() != ISD::LOAD)
 35209 return false;
 35210
 35211 if (!VT1.isSimple() || !VT1.isInteger() ||
 35212 !VT2.isSimple() || !VT2.isInteger())
 35213 return false;
 35214
 35215 switch (VT1.getSimpleVT().SimpleTy) {
 35216 default: break;
 35217 case MVT::i8:
 35218 case MVT::i16:
 35219 case MVT::i32:
 35220 // X86 has 8, 16, and 32-bit zero-extending loads.
 35221 return true;
 35222 }
 35223
 35224 return false;
 35225}
35226
// NOTE(review): both the signature line (35227) and the positive-path return
// (35230) were dropped by the extraction. From the shape -- bail out on
// non-64-bit targets, then presumably delegate to the TargetLowering base --
// this may be shouldConvertPhiType; confirm against the original source.
 35228 if (!Subtarget.is64Bit())
 35229 return false;
 35231}
35232
// NOTE(review): signature line (35233) dropped; from the body this is
// isVectorLoadExtDesirable(ExtVal): folding an extend into a vector load is
// desirable except for masked loads and vXi1 sources.
 35234 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
 35235 return false;
 35236
 35237 EVT SrcVT = ExtVal.getOperand(0).getValueType();
 35238
 35239 // There is no extending load for vXi1.
 35240 if (SrcVT.getScalarType() == MVT::i1)
 35241 return false;
 35242
 35243 return true;
 35244}
35245
// NOTE(review): the first signature line (35246) was dropped; from the body
// this is isFMAFasterThanFMulAndFAdd: fused multiply-add is preferred over a
// separate fmul+fadd when the subtarget has any FMA support, for scalar f32
// and f64 (and f16 with AVX512-FP16), including the vector forms via the
// scalar-type reduction below.
 35247 EVT VT) const {
 35248 if (Subtarget.useSoftFloat())
 35249 return false;
 35250
 35251 if (!Subtarget.hasAnyFMA())
 35252 return false;
 35253
// Reduce vector types to their element type so v4f32 etc. share the scalar
// decision.
 35254 VT = VT.getScalarType();
 35255
 35256 if (!VT.isSimple())
 35257 return false;
 35258
 35259 switch (VT.getSimpleVT().SimpleTy) {
 35260 case MVT::f16:
 35261 return Subtarget.hasFP16();
 35262 case MVT::f32:
 35263 case MVT::f64:
 35264 return true;
 35265 default:
 35266 break;
 35267 }
 35268
 35269 return false;
 35270}
35271
// NOTE(review): the first signature line (35272) was dropped; from the body
// this is presumably isNarrowingProfitable: narrowing is profitable except
// i32 -> i16, where the 0x66 operand-size prefix makes i16 ops longer and
// potentially slower.
 35273 EVT DestVT) const {
 35274 // i16 instructions are longer (0x66 prefix) and potentially slower.
 35275 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
 35276}
35277
// NOTE(review): the first signature line (35278) was dropped, so the exact
// hook cannot be identified from this view. The body gates some vector
// transform on AVX512 (with VLX required for sub-512-bit vectors) and
// excludes non-vector and vXi1 types -- confirm the hook name against the
// original source before relying on it.
 35279 EVT VT) const {
 35280 // TODO: This is too general. There are cases where pre-AVX512 codegen would
 35281 // benefit. The transform may also be profitable for scalar code.
 35282 if (!Subtarget.hasAVX512())
 35283 return false;
 35284 if (!Subtarget.hasVLX() && !VT.is512BitVector())
 35285 return false;
 35286 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
 35287 return false;
 35288
 35289 return true;
 35290}
35291
 35292/// Targets can use this to indicate that they only support *some*
 35293/// VECTOR_SHUFFLE operations, those with specific masks.
 35294/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
 35295/// are assumed to be legal.
// NOTE(review): the signature line (35296) was dropped; from the doc comment
// this is isShuffleMaskLegal. Any mask over a legal, non-i1, non-64-bit
// vector type is accepted -- the lowering handles arbitrary masks.
 35297 if (!VT.isSimple())
 35298 return false;
 35299
 35300 // Not for i1 vectors
 35301 if (VT.getSimpleVT().getScalarType() == MVT::i1)
 35302 return false;
 35303
 35304 // Very little shuffling can be done for 64-bit vectors right now.
 35305 if (VT.getSimpleVT().getSizeInBits() == 64)
 35306 return false;
 35307
 35308 // We only care that the types being shuffled are legal. The lowering can
 35309 // handle any possible shuffle mask that results.
 35310 return isTypeLegal(VT.getSimpleVT());
 35311}
35312
// NOTE(review): the first signature line (35313) was dropped; from the body
// (delegation to isShuffleMaskLegal and the and->shuffle comment) this is
// isVectorClearMaskLegal.
 35314 EVT VT) const {
 35315 // Don't convert an 'and' into a shuffle that we don't directly support.
 35316 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
 35317 if (!Subtarget.hasAVX2())
 35318 if (VT == MVT::v32i8 || VT == MVT::v16i16)
 35319 return false;
 35320
 35321 // Just delegate to the generic legality, clear masks aren't special.
 35322 return isShuffleMaskLegal(Mask, VT);
 35323}
35324
// NOTE(review): the signature line (35325) and the final delegating return
// (35331, presumably TargetLowering::areJTsAllowed) were dropped by the
// extraction. From the body this is areJTsAllowed: jump tables are forbidden
// when indirect-thunk (retpoline-style) branches are in use.
 35326 // If the subtarget is using thunks, we need to not generate jump tables.
 35327 if (Subtarget.useIndirectThunkBranches())
 35328 return false;
 35329
 35330 // Otherwise, fallback on the generic logic.
 35332}
35333
// NOTE(review): the first signature line (35334) and part of the trailing
// delegating return (35340) were dropped; from the body this is
// getPreferredSwitchConditionType: widen sub-32-bit switch conditions to i32
// to avoid unnecessary zero-extensions.
 35335 EVT ConditionVT) const {
 35336 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
 35337 // zero-extensions.
 35338 if (ConditionVT.getSizeInBits() < 32)
 35339 return MVT::i32;
 35341 ConditionVT);
 35342}
35343
35344//===----------------------------------------------------------------------===//
35345// X86 Scheduler Hooks
35346//===----------------------------------------------------------------------===//
35347
 35348// Returns true if EFLAG is consumed after this iterator in the rest of the
 35349// basic block or any successors of the basic block.
// NOTE(review): the first signature line (35350) was dropped; this is the
// file-local helper isEFLAGSLiveAfter(Itr, BB), referenced by emitXBegin and
// checkAndUpdateEFLAGSKill below.
 35351 MachineBasicBlock *BB) {
 35352 // Scan forward through BB for a use/def of EFLAGS.
 35353 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
// A read before any def means the flags produced at Itr are still needed.
 35354 if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
 35355 return true;
 35356 // If we found a def, we can stop searching.
 35357 if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
 35358 return false;
 35359 }
 35360
 35361 // If we hit the end of the block, check whether EFLAGS is live into a
 35362 // successor.
 35363 for (MachineBasicBlock *Succ : BB->successors())
 35364 if (Succ->isLiveIn(X86::EFLAGS))
 35365 return true;
 35366
 35367 return false;
 35368}
35369
 35370/// Utility function to emit xbegin specifying the start of an RTM region.
// NOTE(review): the first signature line (35371) was dropped, as were
// interior lines 35376, 35411 and 35413 (from the surrounding code these
// evidently obtained an insertion iterator, transferred successor edges with
// PHI updates, and fetched the MachineRegisterInfo used as 'MRI' below) --
// confirm against the original source.
 35372 const TargetInstrInfo *TII) {
 35373 const MIMetadata MIMD(MI);
 35374
 35375 const BasicBlock *BB = MBB->getBasicBlock();
 35377
 35378 // For the v = xbegin(), we generate
 35379 //
 35380 // thisMBB:
 35381 // xbegin sinkMBB
 35382 //
 35383 // mainMBB:
 35384 // s0 = -1
 35385 //
 35386 // fallBB:
 35387 // eax = # XABORT_DEF
 35388 // s1 = eax
 35389 //
 35390 // sinkMBB:
 35391 // v = phi(s0/mainBB, s1/fallBB)
 35392
 35393 MachineBasicBlock *thisMBB = MBB;
 35394 MachineFunction *MF = MBB->getParent();
 35395 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
 35396 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
 35397 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
 35398 MF->insert(I, mainMBB);
 35399 MF->insert(I, fallMBB);
 35400 MF->insert(I, sinkMBB);
 35401
// If the flags are still consumed downstream, preserve them across the new
// blocks by marking EFLAGS live-in on each.
 35402 if (isEFLAGSLiveAfter(MI, MBB)) {
 35403 mainMBB->addLiveIn(X86::EFLAGS);
 35404 fallMBB->addLiveIn(X86::EFLAGS);
 35405 sinkMBB->addLiveIn(X86::EFLAGS);
 35406 }
 35407
 35408 // Transfer the remainder of BB and its successor edges to sinkMBB.
 35409 sinkMBB->splice(sinkMBB->begin(), MBB,
 35410 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
 35412
 35414 Register DstReg = MI.getOperand(0).getReg();
 35415 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
 35416 Register mainDstReg = MRI.createVirtualRegister(RC);
 35417 Register fallDstReg = MRI.createVirtualRegister(RC);
 35418
 35419 // thisMBB:
 35420 // xbegin fallMBB
 35421 // # fallthrough to mainMBB
 35422 // # abortion to fallMBB
 35423 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
 35424 thisMBB->addSuccessor(mainMBB);
 35425 thisMBB->addSuccessor(fallMBB);
 35426
 35427 // mainMBB:
 35428 // mainDstReg := -1
 35429 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
 35430 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
 35431 mainMBB->addSuccessor(sinkMBB);
 35432
 35433 // fallMBB:
 35434 // ; pseudo instruction to model hardware's definition from XABORT
 35435 // EAX := XABORT_DEF
 35436 // fallDstReg := EAX
 35437 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
 35438 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
 35439 .addReg(X86::EAX);
 35440 fallMBB->addSuccessor(sinkMBB);
 35441
 35442 // sinkMBB:
 35443 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
 35444 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
 35445 .addReg(mainDstReg).addMBB(mainMBB)
 35446 .addReg(fallDstReg).addMBB(fallMBB)
 35447
// The pseudo has been fully expanded; remove it.
 35448 MI.eraseFromParent();
 35449 return sinkMBB;
 35450}
35451
// Custom inserter expanding the VAARG pseudo into the SysV x86-64 va_arg
// sequence: try the register-save area if gp_offset/fp_offset still has room,
// otherwise fall back to the overflow area, merging the two computed argument
// addresses with a PHI.
// NOTE(review): the extraction dropped the return-type line (35452) and
// interior lines 35493, 35495 and 35559 (from surrounding uses these
// evidently defined 'MRI', completed the AddrRegClass initializer, and
// defined the 'MBBIter' insertion iterator) -- confirm against the original
// source.
 35453X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
 35454 MachineBasicBlock *MBB) const {
 35455 // Emit va_arg instruction on X86-64.
 35456
 35457 // Operands to this pseudo-instruction:
 35458 // 0 ) Output : destination address (reg)
 35459 // 1-5) Input : va_list address (addr, i64mem)
 35460 // 6 ) ArgSize : Size (in bytes) of vararg type
 35461 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
 35462 // 8 ) Align : Alignment of type
 35463 // 9 ) EFLAGS (implicit-def)
 35464
 35465 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
 35466 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
 35467
 35468 Register DestReg = MI.getOperand(0).getReg();
 35469 MachineOperand &Base = MI.getOperand(1);
 35470 MachineOperand &Scale = MI.getOperand(2);
 35471 MachineOperand &Index = MI.getOperand(3);
 35472 MachineOperand &Disp = MI.getOperand(4);
 35473 MachineOperand &Segment = MI.getOperand(5);
 35474 unsigned ArgSize = MI.getOperand(6).getImm();
 35475 unsigned ArgMode = MI.getOperand(7).getImm();
 35476 Align Alignment = Align(MI.getOperand(8).getImm());
 35477
 35478 MachineFunction *MF = MBB->getParent();
 35479
 35480 // Memory Reference
 35481 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
 35482
 35483 MachineMemOperand *OldMMO = MI.memoperands().front();
 35484
 35485 // Clone the MMO into two separate MMOs for loading and storing
 35486 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
 35487 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
 35488 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
 35489 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
 35490
 35491 // Machine Information
 35492 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
 35494 const TargetRegisterClass *AddrRegClass =
 35496 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
 35497 const MIMetadata MIMD(MI);
 35498
 35499 // struct va_list {
 35500 // i32 gp_offset
 35501 // i32 fp_offset
 35502 // i64 overflow_area (address)
 35503 // i64 reg_save_area (address)
 35504 // }
 35505 // sizeof(va_list) = 24
 35506 // alignment(va_list) = 8
 35507
// SysV x86-64 ABI: 6 integer argument registers, 8 XMM registers, each slot
// 8 / 16 bytes wide respectively in the register save area.
 35508 unsigned TotalNumIntRegs = 6;
 35509 unsigned TotalNumXMMRegs = 8;
 35510 bool UseGPOffset = (ArgMode == 1);
 35511 bool UseFPOffset = (ArgMode == 2);
 35512 unsigned MaxOffset = TotalNumIntRegs * 8 +
 35513 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
 35514
 35515 /* Align ArgSize to a multiple of 8 */
 35516 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
 35517 bool NeedsAlign = (Alignment > 8);
 35518
 35519 MachineBasicBlock *thisMBB = MBB;
 35520 MachineBasicBlock *overflowMBB;
 35521 MachineBasicBlock *offsetMBB;
 35522 MachineBasicBlock *endMBB;
 35523
 35524 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
 35525 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
 35526 unsigned OffsetReg = 0;
 35527
 35528 if (!UseGPOffset && !UseFPOffset) {
 35529 // If we only pull from the overflow region, we don't create a branch.
 35530 // We don't need to alter control flow.
 35531 OffsetDestReg = 0; // unused
 35532 OverflowDestReg = DestReg;
 35533
 35534 offsetMBB = nullptr;
 35535 overflowMBB = thisMBB;
 35536 endMBB = thisMBB;
 35537 } else {
 35538 // First emit code to check if gp_offset (or fp_offset) is below the bound.
 35539 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
 35540 // If not, pull from overflow_area. (branch to overflowMBB)
 35541 //
 35542 // thisMBB
 35543 // | .
 35544 // | .
 35545 // offsetMBB overflowMBB
 35546 // | .
 35547 // | .
 35548 // endMBB
 35549
 35550 // Registers for the PHI in endMBB
 35551 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
 35552 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
 35553
 35554 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
 35555 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 35556 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 35557 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 35558
 35560
 35561 // Insert the new basic blocks
 35562 MF->insert(MBBIter, offsetMBB);
 35563 MF->insert(MBBIter, overflowMBB);
 35564 MF->insert(MBBIter, endMBB);
 35565
 35566 // Transfer the remainder of MBB and its successor edges to endMBB.
 35567 endMBB->splice(endMBB->begin(), thisMBB,
 35568 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
 35569 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
 35570
 35571 // Make offsetMBB and overflowMBB successors of thisMBB
 35572 thisMBB->addSuccessor(offsetMBB);
 35573 thisMBB->addSuccessor(overflowMBB);
 35574
 35575 // endMBB is a successor of both offsetMBB and overflowMBB
 35576 offsetMBB->addSuccessor(endMBB);
 35577 overflowMBB->addSuccessor(endMBB);
 35578
 35579 // Load the offset value into a register
// gp_offset lives at displacement 0 in the va_list, fp_offset at 4.
 35580 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
 35581 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
 35582 .add(Base)
 35583 .add(Scale)
 35584 .add(Index)
 35585 .addDisp(Disp, UseFPOffset ? 4 : 0)
 35586 .add(Segment)
 35587 .setMemRefs(LoadOnlyMMO);
 35588
 35589 // Check if there is enough room left to pull this argument.
 35590 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
 35591 .addReg(OffsetReg)
 35592 .addImm(MaxOffset + 8 - ArgSizeA8);
 35593
 35594 // Branch to "overflowMBB" if offset >= max
 35595 // Fall through to "offsetMBB" otherwise
 35596 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
 35597 .addMBB(overflowMBB).addImm(X86::COND_AE);
 35598 }
 35599
 35600 // In offsetMBB, emit code to use the reg_save_area.
 35601 if (offsetMBB) {
 35602 assert(OffsetReg != 0);
 35603
 35604 // Read the reg_save_area address.
 35605 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
 35606 BuildMI(
 35607 offsetMBB, MIMD,
 35608 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
 35609 RegSaveReg)
 35610 .add(Base)
 35611 .add(Scale)
 35612 .add(Index)
 35613 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
 35614 .add(Segment)
 35615 .setMemRefs(LoadOnlyMMO);
 35616
 35617 if (Subtarget.isTarget64BitLP64()) {
 35618 // Zero-extend the offset
 35619 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
 35620 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
 35621 .addImm(0)
 35622 .addReg(OffsetReg)
 35623 .addImm(X86::sub_32bit);
 35624
 35625 // Add the offset to the reg_save_area to get the final address.
 35626 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
 35627 .addReg(OffsetReg64)
 35628 .addReg(RegSaveReg);
 35629 } else {
 35630 // Add the offset to the reg_save_area to get the final address.
 35631 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
 35632 .addReg(OffsetReg)
 35633 .addReg(RegSaveReg);
 35634 }
 35635
 35636 // Compute the offset for the next argument
// A GP arg consumes one 8-byte slot; an FP arg one 16-byte XMM slot.
 35637 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
 35638 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
 35639 .addReg(OffsetReg)
 35640 .addImm(UseFPOffset ? 16 : 8);
 35641
 35642 // Store it back into the va_list.
 35643 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
 35644 .add(Base)
 35645 .add(Scale)
 35646 .add(Index)
 35647 .addDisp(Disp, UseFPOffset ? 4 : 0)
 35648 .add(Segment)
 35649 .addReg(NextOffsetReg)
 35650 .setMemRefs(StoreOnlyMMO);
 35651
 35652 // Jump to endMBB
 35653 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
 35654 .addMBB(endMBB);
 35655 }
 35656
 35657 //
 35658 // Emit code to use overflow area
 35659 //
 35660
 35661 // Load the overflow_area address into a register.
// overflow_area lives at displacement 8 in the va_list.
 35662 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
 35663 BuildMI(overflowMBB, MIMD,
 35664 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
 35665 OverflowAddrReg)
 35666 .add(Base)
 35667 .add(Scale)
 35668 .add(Index)
 35669 .addDisp(Disp, 8)
 35670 .add(Segment)
 35671 .setMemRefs(LoadOnlyMMO);
 35672
 35673 // If we need to align it, do so. Otherwise, just copy the address
 35674 // to OverflowDestReg.
 35675 if (NeedsAlign) {
 35676 // Align the overflow address
 35677 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
 35678
 35679 // aligned_addr = (addr + (align-1)) & ~(align-1)
 35680 BuildMI(
 35681 overflowMBB, MIMD,
 35682 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
 35683 TmpReg)
 35684 .addReg(OverflowAddrReg)
 35685 .addImm(Alignment.value() - 1);
 35686
 35687 BuildMI(
 35688 overflowMBB, MIMD,
 35689 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
 35690 OverflowDestReg)
 35691 .addReg(TmpReg)
 35692 .addImm(~(uint64_t)(Alignment.value() - 1));
 35693 } else {
 35694 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
 35695 .addReg(OverflowAddrReg);
 35696 }
 35697
 35698 // Compute the next overflow address after this argument.
 35699 // (the overflow address should be kept 8-byte aligned)
 35700 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
 35701 BuildMI(
 35702 overflowMBB, MIMD,
 35703 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
 35704 NextAddrReg)
 35705 .addReg(OverflowDestReg)
 35706 .addImm(ArgSizeA8);
 35707
 35708 // Store the new overflow address.
 35709 BuildMI(overflowMBB, MIMD,
 35710 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
 35711 .add(Base)
 35712 .add(Scale)
 35713 .add(Index)
 35714 .addDisp(Disp, 8)
 35715 .add(Segment)
 35716 .addReg(NextAddrReg)
 35717 .setMemRefs(StoreOnlyMMO);
 35718
 35719 // If we branched, emit the PHI to the front of endMBB.
 35720 if (offsetMBB) {
 35721 BuildMI(*endMBB, endMBB->begin(), MIMD,
 35722 TII->get(X86::PHI), DestReg)
 35723 .addReg(OffsetDestReg).addMBB(offsetMBB)
 35724 .addReg(OverflowDestReg).addMBB(overflowMBB);
 35725 }
 35726
 35727 // Erase the pseudo instruction
 35728 MI.eraseFromParent();
 35729
 35730 return endMBB;
 35731}
35732
 35733// The EFLAGS operand of SelectItr might be missing a kill marker
 35734// because there were multiple uses of EFLAGS, and ISel didn't know
 35735// which to mark. Figure out whether SelectItr should have had a
 35736// kill marker, and set it if it should. Returns the correct kill
 35737// marker value.
// NOTE(review): the signature lines (35738-35739) were dropped by the
// extraction; from the comment and body this is the file-local helper
// checkAndUpdateEFLAGSKill(SelectItr, BB, TRI).
 35740 const TargetRegisterInfo* TRI) {
// If the flags are still consumed later, no kill marker may be added.
 35741 if (isEFLAGSLiveAfter(SelectItr, BB))
 35742 return false;
 35743
 35744 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
 35745 // out. SelectMI should have a kill flag on EFLAGS.
 35746 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
 35747 return true;
 35748}
35749
 35750// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
 35751// together with other CMOV pseudo-opcodes into a single basic-block with
 35752// conditional jump around it.
// NOTE(review): the signature line (35753) was dropped; from the comment and
// the case list this is the file-local predicate isCMOVPseudo(MI), matching
// every X86::CMOV_* pseudo (FP, GPR, x87, vector and mask-register forms).
 35754 switch (MI.getOpcode()) {
 35755 case X86::CMOV_FR16:
 35756 case X86::CMOV_FR16X:
 35757 case X86::CMOV_FR32:
 35758 case X86::CMOV_FR32X:
 35759 case X86::CMOV_FR64:
 35760 case X86::CMOV_FR64X:
 35761 case X86::CMOV_GR8:
 35762 case X86::CMOV_GR16:
 35763 case X86::CMOV_GR32:
 35764 case X86::CMOV_RFP32:
 35765 case X86::CMOV_RFP64:
 35766 case X86::CMOV_RFP80:
 35767 case X86::CMOV_VR64:
 35768 case X86::CMOV_VR128:
 35769 case X86::CMOV_VR128X:
 35770 case X86::CMOV_VR256:
 35771 case X86::CMOV_VR256X:
 35772 case X86::CMOV_VR512:
 35773 case X86::CMOV_VK1:
 35774 case X86::CMOV_VK2:
 35775 case X86::CMOV_VK4:
 35776 case X86::CMOV_VK8:
 35777 case X86::CMOV_VK16:
 35778 case X86::CMOV_VK32:
 35779 case X86::CMOV_VK64:
 35780 return true;
 35781
 35782 default:
 35783 return false;
 35784 }
 35785}
35786
35787// Helper function, which inserts PHI functions into SinkMBB:
35788// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
35789// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
35790// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
35791// the last PHI function inserted.
35794 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
35795 MachineBasicBlock *SinkMBB) {
35796 MachineFunction *MF = TrueMBB->getParent();
35798 const MIMetadata MIMD(*MIItBegin);
35799
  // CMOV pseudo operand layout, as used throughout this loop:
  //   (0) dest vreg, (1) value paired with FalseMBB in the PHI,
  //   (2) value paired with TrueMBB, (3) X86 condition-code immediate.
  // CC is taken from the first CMOV; later CMOVs may carry the opposite CC.
35800 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
35802
  // All PHIs are inserted at the top of SinkMBB, before any spliced code.
35803 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
35804
35805 // As we are creating the PHIs, we have to be careful if there is more than
35806 // one. Later CMOVs may reference the results of earlier CMOVs, but later
35807 // PHIs have to reference the individual true/false inputs from earlier PHIs.
35808 // That also means that PHI construction must work forward from earlier to
35809 // later, and that the code must maintain a mapping from earlier PHI's
35810 // destination registers, and the registers that went into the PHI.
35813
35814 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
35815 Register DestReg = MIIt->getOperand(0).getReg();
35816 Register Op1Reg = MIIt->getOperand(1).getReg();
35817 Register Op2Reg = MIIt->getOperand(2).getReg();
35818
35819 // If this CMOV we are generating is the opposite condition from
35820 // the jump we generated, then we have to swap the operands for the
35821 // PHI that is going to be generated.
35822 if (MIIt->getOperand(3).getImm() == OppCC)
35823 std::swap(Op1Reg, Op2Reg);
35824
  // Rewrite inputs that were defined by an earlier CMOV in this run: on the
  // FalseMBB path the earlier PHI's dest is really its first input, and on
  // the TrueMBB path it is really its second input.
35825 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
35826 Op1Reg = It->second.first;
35827
35828 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
35829 Op2Reg = It->second.second;
35830
35831 MIB =
35832 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
35833 .addReg(Op1Reg)
35834 .addMBB(FalseMBB)
35835 .addReg(Op2Reg)
35836 .addMBB(TrueMBB)
35837
35838 // Add this PHI to the rewrite table.
35839 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
35840 }
35841
  // Returns the builder for the last PHI emitted (callers may append to it).
35842 return MIB;
35843}
35844
35845// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
35847X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
35848 MachineInstr &SecondCascadedCMOV,
35849 MachineBasicBlock *ThisMBB) const {
  // Lowers the pattern (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
  // into two conditional branches into a shared sink block with one 3-way
  // PHI, rather than two stacked diamonds with two PHIs (see diagram below).
35850 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35851 const MIMetadata MIMD(FirstCMOV);
35852
35853 // We lower cascaded CMOVs such as
35854 //
35855 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
35856 //
35857 // to two successive branches.
35858 //
35859 // Without this, we would add a PHI between the two jumps, which ends up
35860 // creating a few copies all around. For instance, for
35861 //
35862 // (sitofp (zext (fcmp une)))
35863 //
35864 // we would generate:
35865 //
35866 // ucomiss %xmm1, %xmm0
35867 // movss <1.0f>, %xmm0
35868 // movaps %xmm0, %xmm1
35869 // jne .LBB5_2
35870 // xorps %xmm1, %xmm1
35871 // .LBB5_2:
35872 // jp .LBB5_4
35873 // movaps %xmm1, %xmm0
35874 // .LBB5_4:
35875 // retq
35876 //
35877 // because this custom-inserter would have generated:
35878 //
35879 // A
35880 // | \
35881 // | B
35882 // | /
35883 // C
35884 // | \
35885 // | D
35886 // | /
35887 // E
35888 //
35889 // A: X = ...; Y = ...
35890 // B: empty
35891 // C: Z = PHI [X, A], [Y, B]
35892 // D: empty
35893 // E: PHI [X, C], [Z, D]
35894 //
35895 // If we lower both CMOVs in a single step, we can instead generate:
35896 //
35897 // A
35898 // | \
35899 // | C
35900 // | /|
35901 // |/ |
35902 // | |
35903 // | D
35904 // | /
35905 // E
35906 //
35907 // A: X = ...; Y = ...
35908 // D: empty
35909 // E: PHI [X, A], [X, C], [Y, D]
35910 //
35911 // Which, in our sitofp/fcmp example, gives us something like:
35912 //
35913 // ucomiss %xmm1, %xmm0
35914 // movss <1.0f>, %xmm0
35915 // jne .LBB5_4
35916 // jp .LBB5_4
35917 // xorps %xmm0, %xmm0
35918 // .LBB5_4:
35919 // retq
35920 //
35921
35922 // We lower cascaded CMOV into two successive branches to the same block.
35923 // EFLAGS is used by both, so mark it as live in the second.
35924 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35925 MachineFunction *F = ThisMBB->getParent();
35926 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35927 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35928 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35929
  // Keep the new blocks in layout order directly after ThisMBB so the
  // fallthrough edges created below are real fallthroughs.
35930 MachineFunction::iterator It = ++ThisMBB->getIterator();
35931 F->insert(It, FirstInsertedMBB);
35932 F->insert(It, SecondInsertedMBB);
35933 F->insert(It, SinkMBB);
35934
35935 // For a cascaded CMOV, we lower it to two successive branches to
35936 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
35937 // the FirstInsertedMBB.
35938 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35939
35940 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35941 // live into the sink and copy blocks.
35942 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35943 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
35944 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
35945 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35946 SinkMBB->addLiveIn(X86::EFLAGS);
35947 }
35948
35949 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35950 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35951 std::next(MachineBasicBlock::iterator(FirstCMOV)),
35952 ThisMBB->end());
35953 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35954
35955 // Fallthrough block for ThisMBB.
35956 ThisMBB->addSuccessor(FirstInsertedMBB);
35957 // The true block target of the first branch is always SinkMBB.
35958 ThisMBB->addSuccessor(SinkMBB);
35959 // Fallthrough block for FirstInsertedMBB.
35960 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35961 // The true block for the branch of FirstInsertedMBB.
35962 FirstInsertedMBB->addSuccessor(SinkMBB);
35963 // This is fallthrough.
35964 SecondInsertedMBB->addSuccessor(SinkMBB);
35965
35966 // Create the conditional branch instructions.
  // Operand 3 of a CMOV pseudo holds the X86 condition-code immediate.
35967 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
35968 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35969
35970 X86::CondCode SecondCC =
35971 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
35972 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
35973 .addMBB(SinkMBB)
35974 .addImm(SecondCC);
35975
35976 // SinkMBB:
35977 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
  // The PHI defines the result of the *second* CMOV; its first two inputs come
  // from the first CMOV's false (operand 1) and true (operand 2) values.
35978 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
35979 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
35980 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
35982 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
35983 .addReg(Op1Reg)
35984 .addMBB(SecondInsertedMBB)
35985 .addReg(Op2Reg)
35986 .addMBB(ThisMBB);
35987
35988 // The second SecondInsertedMBB provides the same incoming value as the
35989 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
35990 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
35991
35992 // Now remove the CMOVs.
35993 FirstCMOV.eraseFromParent();
35994 SecondCascadedCMOV.eraseFromParent();
35995
35996 return SinkMBB;
35997}
35998
36000X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36001 MachineBasicBlock *ThisMBB) const {
  // Custom inserter for the pseudo-CMOV instructions: builds the
  // branch-and-PHI diamond described below, batching runs of compatible
  // CMOVs into one diamond and delegating cascaded CMOVs to
  // EmitLoweredCascadedSelect.
36002 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36003 const MIMetadata MIMD(MI);
36004
36005 // To "insert" a SELECT_CC instruction, we actually have to insert the
36006 // diamond control-flow pattern. The incoming instruction knows the
36007 // destination vreg to set, the condition code register to branch on, the
36008 // true/false values to select between and a branch opcode to use.
36009
36010 // ThisMBB:
36011 // ...
36012 // TrueVal = ...
36013 // cmpTY ccX, r1, r2
36014 // bCC copy1MBB
36015 // fallthrough --> FalseMBB
36016
36017 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36018 // as described above, by inserting a BB, and then making a PHI at the join
36019 // point to select the true and false operands of the CMOV in the PHI.
36020 //
36021 // The code also handles two different cases of multiple CMOV opcodes
36022 // in a row.
36023 //
36024 // Case 1:
36025 // In this case, there are multiple CMOVs in a row, all which are based on
36026 // the same condition setting (or the exact opposite condition setting).
36027 // In this case we can lower all the CMOVs using a single inserted BB, and
36028 // then make a number of PHIs at the join point to model the CMOVs. The only
36029 // trickiness here, is that in a case like:
36030 //
36031 // t2 = CMOV cond1 t1, f1
36032 // t3 = CMOV cond1 t2, f2
36033 //
36034 // when rewriting this into PHIs, we have to perform some renaming on the
36035 // temps since you cannot have a PHI operand refer to a PHI result earlier
36036 // in the same block. The "simple" but wrong lowering would be:
36037 //
36038 // t2 = PHI t1(BB1), f1(BB2)
36039 // t3 = PHI t2(BB1), f2(BB2)
36040 //
36041 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36042 // renaming is to note that on the path through BB1, t2 is really just a
36043 // copy of t1, and do that renaming, properly generating:
36044 //
36045 // t2 = PHI t1(BB1), f1(BB2)
36046 // t3 = PHI t1(BB1), f2(BB2)
36047 //
36048 // Case 2:
36049 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36050 // function - EmitLoweredCascadedSelect.
36051
36052 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36054 MachineInstr *LastCMOV = &MI;
36056
36057 // Check for case 1, where there are multiple CMOVs with the same condition
36058 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36059 // number of jumps the most.
36060
36061 if (isCMOVPseudo(MI)) {
36062 // See if we have a string of CMOVS with the same condition. Skip over
36063 // intervening debug insts.
36064 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36065 (NextMIIt->getOperand(3).getImm() == CC ||
36066 NextMIIt->getOperand(3).getImm() == OppCC)) {
36067 LastCMOV = &*NextMIIt;
36068 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36069 }
36070 }
36071
36072 // This checks for case 2, but only do this if we didn't already find
36073 // case 1, as indicated by LastCMOV == MI.
  // Case 2 requires the next CMOV to consume this CMOV's result as its
  // "false" input (operand 1), share the same true value (operand 2), and
  // kill the intermediate value so no one else observes it.
36074 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36075 NextMIIt->getOpcode() == MI.getOpcode() &&
36076 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36077 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36078 NextMIIt->getOperand(1).isKill()) {
36079 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36080 }
36081
36082 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36083 MachineFunction *F = ThisMBB->getParent();
36084 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36085 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36086
36087 MachineFunction::iterator It = ++ThisMBB->getIterator();
36088 F->insert(It, FalseMBB);
36089 F->insert(It, SinkMBB);
36090
36091 // Set the call frame size on entry to the new basic blocks.
36092 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36093 FalseMBB->setCallFrameSize(CallFrameSize);
36094 SinkMBB->setCallFrameSize(CallFrameSize);
36095
36096 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36097 // live into the sink and copy blocks.
36098 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36099 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36100 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36101 FalseMBB->addLiveIn(X86::EFLAGS);
36102 SinkMBB->addLiveIn(X86::EFLAGS);
36103 }
36104
36105 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36107 MachineBasicBlock::iterator(LastCMOV));
36108 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36109 if (MI.isDebugInstr())
36110 SinkMBB->push_back(MI.removeFromParent());
36111
36112 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36113 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36114 std::next(MachineBasicBlock::iterator(LastCMOV)),
36115 ThisMBB->end());
36116 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36117
36118 // Fallthrough block for ThisMBB.
36119 ThisMBB->addSuccessor(FalseMBB);
36120 // The true block target of the first (or only) branch is always a SinkMBB.
36121 ThisMBB->addSuccessor(SinkMBB);
36122 // Fallthrough block for FalseMBB.
36123 FalseMBB->addSuccessor(SinkMBB);
36124
36125 // Create the conditional branch instruction.
36126 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36127
36128 // SinkMBB:
36129 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36130 // ...
36133 std::next(MachineBasicBlock::iterator(LastCMOV));
36134 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36135
36136 // Now remove the CMOV(s).
36137 ThisMBB->erase(MIItBegin, MIItEnd);
36138
36139 return SinkMBB;
36140}
36141
36142static unsigned getSUBriOpcode(bool IsLP64) {
36143 if (IsLP64)
36144 return X86::SUB64ri32;
36145 else
36146 return X86::SUB32ri;
36147}
36148
36150X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36151 MachineBasicBlock *MBB) const {
  // Expands the probed-alloca pseudo into a loop that touches (probes) the
  // stack one page at a time while moving RSP/ESP down to the final value.
  // Operand 0: result vreg (new stack pointer); operand 1: size vreg.
36152 MachineFunction *MF = MBB->getParent();
36153 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36154 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36155 const MIMetadata MIMD(MI);
36156 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36157
36158 const unsigned ProbeSize = getStackProbeSize(*MF);
36159
  // testMBB: loop header comparing SP against the target; blockMBB: loop body
  // that probes and bumps SP one page; tailMBB: continuation after the loop.
36161 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36162 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36163 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36164
36166 MF->insert(MBBIter, testMBB);
36167 MF->insert(MBBIter, blockMBB);
36168 MF->insert(MBBIter, tailMBB);
36169
36170 Register sizeVReg = MI.getOperand(1).getReg();
36171
36172 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36173
  // FinalStackPtr = current SP - requested size; computed up front in ThisMBB.
36174 Register TmpStackPtr = MRI.createVirtualRegister(
36175 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36176 Register FinalStackPtr = MRI.createVirtualRegister(
36177 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36178
36179 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36180 .addReg(physSPReg);
36181 {
36182 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36183 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36184 .addReg(TmpStackPtr)
36185 .addReg(sizeVReg);
36186 }
36187
36188 // test rsp size
36189
36190 BuildMI(testMBB, MIMD,
36191 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36192 .addReg(FinalStackPtr)
36193 .addReg(physSPReg);
36194
  // Exit the loop once SP has reached (or passed) the final value; otherwise
  // fall through into the probing body.
36195 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36196 .addMBB(tailMBB)
36198 testMBB->addSuccessor(blockMBB);
36199 testMBB->addSuccessor(tailMBB);
36200
36201 // Touch the block then extend it. This is done on the opposite side of
36202 // static probe where we allocate then touch, to avoid the need of probing the
36203 // tail of the static alloca. Possible scenarios are:
36204 //
36205 // + ---- <- ------------ <- ------------- <- ------------ +
36206 // | |
36207 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36208 // | |
36209 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36210 //
36211 // The property we want to enforce is to never have more than [page alloc] between two probes.
36212
  // The probe itself: XOR a zero immediate into [SP] — touches the page
  // without changing memory contents.
36213 const unsigned XORMIOpc =
36214 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36215 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36216 .addImm(0);
36217
36218 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36219 physSPReg)
36220 .addReg(physSPReg)
36221 .addImm(ProbeSize);
36222
36223 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36224 blockMBB->addSuccessor(testMBB);
36225
36226 // Replace original instruction by the expected stack ptr
36227 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36228 MI.getOperand(0).getReg())
36229 .addReg(FinalStackPtr);
36230
36231 tailMBB->splice(tailMBB->end(), MBB,
36232 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36234 MBB->addSuccessor(testMBB);
36235
36236 // Delete the original pseudo instruction.
36237 MI.eraseFromParent();
36238
36239 // And we're done.
36240 return tailMBB;
36241}
36242
36244X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36245 MachineBasicBlock *BB) const {
  // Segmented-stacks (-fsplit-stack) dynamic alloca: if the current stacklet
  // has room, bump SP; otherwise call __morestack_allocate_stack_space to get
  // heap memory. Operand 0: result address vreg; operand 1: size vreg.
36246 MachineFunction *MF = BB->getParent();
36247 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36248 const MIMetadata MIMD(MI);
36249 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36250
36251 assert(MF->shouldSplitStack());
36252
36253 const bool Is64Bit = Subtarget.is64Bit();
36254 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36255
  // The stack limit lives in thread-local storage; segment register and
  // offset depend on the target's pointer model.
36256 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36257 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36258
36259 // BB:
36260 // ... [Till the alloca]
36261 // If stacklet is not large enough, jump to mallocMBB
36262 //
36263 // bumpMBB:
36264 // Allocate by subtracting from RSP
36265 // Jump to continueMBB
36266 //
36267 // mallocMBB:
36268 // Allocate by call to runtime
36269 //
36270 // continueMBB:
36271 // ...
36272 // [rest of original BB]
36273 //
36274
36275 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36276 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36277 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36278
36280 const TargetRegisterClass *AddrRegClass =
36282
36283 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36284 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36285 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36286 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36287 sizeVReg = MI.getOperand(1).getReg(),
36288 physSPReg =
36289 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36290
36291 MachineFunction::iterator MBBIter = ++BB->getIterator();
36292
36293 MF->insert(MBBIter, bumpMBB);
36294 MF->insert(MBBIter, mallocMBB);
36295 MF->insert(MBBIter, continueMBB);
36296
36297 continueMBB->splice(continueMBB->begin(), BB,
36298 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36299 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36300
36301 // Add code to the main basic block to check if the stack limit has been hit,
36302 // and if so, jump to mallocMBB otherwise to bumpMBB.
  // SPLimitVReg = SP - size, i.e. the would-be new stack pointer; compare it
  // against the TLS stack limit.
36303 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36304 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36305 .addReg(tmpSPVReg).addReg(sizeVReg);
36306 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36307 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36308 .addReg(SPLimitVReg);
36309 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36310
36311 // bumpMBB simply decreases the stack pointer, since we know the current
36312 // stacklet has enough space.
36313 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36314 .addReg(SPLimitVReg);
36315 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36316 .addReg(SPLimitVReg);
36317 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36318
36319 // Calls into a routine in libgcc to allocate more space from the heap.
36320 const uint32_t *RegMask =
36322 if (IsLP64) {
36323 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36324 .addReg(sizeVReg);
36325 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36326 .addExternalSymbol("__morestack_allocate_stack_space")
36327 .addRegMask(RegMask)
36328 .addReg(X86::RDI, RegState::Implicit)
36329 .addReg(X86::RAX, RegState::ImplicitDefine);
36330 } else if (Is64Bit) {
  // x32 (ILP32 on 64-bit): pass the size in EDI, result comes back in EAX.
36331 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36332 .addReg(sizeVReg);
36333 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36334 .addExternalSymbol("__morestack_allocate_stack_space")
36335 .addRegMask(RegMask)
36336 .addReg(X86::EDI, RegState::Implicit)
36337 .addReg(X86::EAX, RegState::ImplicitDefine);
36338 } else {
  // 32-bit: argument goes on the stack; keep it 16-byte aligned around the
  // push (SUB 12 before, ADD 16 after the call).
36339 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36340 .addImm(12);
36341 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36342 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36343 .addExternalSymbol("__morestack_allocate_stack_space")
36344 .addRegMask(RegMask)
36345 .addReg(X86::EAX, RegState::ImplicitDefine);
36346 }
36347
36348 if (!Is64Bit)
36349 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36350 .addImm(16);
36351
36352 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36353 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36354 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36355
36356 // Set up the CFG correctly.
36357 BB->addSuccessor(bumpMBB);
36358 BB->addSuccessor(mallocMBB);
36359 mallocMBB->addSuccessor(continueMBB);
36360 bumpMBB->addSuccessor(continueMBB);
36361
36362 // Take care of the PHI nodes.
36363 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36364 MI.getOperand(0).getReg())
36365 .addReg(mallocPtrVReg)
36366 .addMBB(mallocMBB)
36367 .addReg(bumpSPPtrVReg)
36368 .addMBB(bumpMBB);
36369
36370 // Delete the original pseudo instruction.
36371 MI.eraseFromParent();
36372
36373 // And we're done.
36374 return continueMBB;
36375}
36376
36378X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36379 MachineBasicBlock *BB) const {
  // For 32-bit C++ EH, reroute the CATCHRET through a fresh EH-pad block so
  // PEI inserts the stack-pointer restore code there; 64-bit needs nothing.
36380 MachineFunction *MF = BB->getParent();
36381 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36382 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36383 const MIMetadata MIMD(MI);
36384
36387 "SEH does not use catchret!");
36388
36389 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36390 if (!Subtarget.is32Bit())
36391 return BB;
36392
36393 // C++ EH creates a new target block to hold the restore code, and wires up
36394 // the new block to the return destination with a normal JMP_4.
36395 MachineBasicBlock *RestoreMBB =
36397 assert(BB->succ_size() == 1);
36398 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36399 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36400 BB->addSuccessor(RestoreMBB);
  // Retarget the CATCHRET at the restore block instead of the original
  // destination; RestoreMBB then jumps on to TargetMBB.
36401 MI.getOperand(0).setMBB(RestoreMBB);
36402
36403 // Marking this as an EH pad but not a funclet entry block causes PEI to
36404 // restore stack pointers in the block.
36405 RestoreMBB->setIsEHPad(true);
36406
36407 auto RestoreMBBI = RestoreMBB->begin();
36408 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36409 return BB;
36410}
36411
36413X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36414 MachineBasicBlock *BB) const {
36415 // This is pretty easy. We're taking the value that we received from
36416 // our load from the relocation, sticking it in either RDI (x86-64)
36417 // or EAX and doing an indirect call. The return value will then
36418 // be in the normal return register.
36419 MachineFunction *F = BB->getParent();
36420 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36421 const MIMetadata MIMD(MI);
36422
36423 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36424 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36425
36426 // Get a register mask for the lowered call.
36427 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36428 // proper register mask.
36429 const uint32_t *RegMask =
36430 Subtarget.is64Bit() ?
36433 if (Subtarget.is64Bit()) {
  // 64-bit: RIP-relative load of the TLVP descriptor address into RDI, then
  // an indirect call through [RDI]; the result lands in RAX.
36435 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36436 .addReg(X86::RIP)
36437 .addImm(0)
36438 .addReg(0)
36439 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36440 MI.getOperand(3).getTargetFlags())
36441 .addReg(0);
36442 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36443 addDirectMem(MIB, X86::RDI);
36444 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36445 } else if (!isPositionIndependent()) {
  // 32-bit non-PIC: absolute-address load into EAX, call through [EAX].
36447 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36448 .addReg(0)
36449 .addImm(0)
36450 .addReg(0)
36451 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36452 MI.getOperand(3).getTargetFlags())
36453 .addReg(0);
36454 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36455 addDirectMem(MIB, X86::EAX);
36456 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36457 } else {
  // 32-bit PIC: address the global relative to the PIC base register.
36459 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36460 .addReg(TII->getGlobalBaseReg(F))
36461 .addImm(0)
36462 .addReg(0)
36463 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36464 MI.getOperand(3).getTargetFlags())
36465 .addReg(0);
36466 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36467 addDirectMem(MIB, X86::EAX);
36468 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36469 }
36470
36471 MI.eraseFromParent(); // The pseudo instruction is gone now.
36472 return BB;
36473}
36474
36475static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36476 switch (RPOpc) {
36477 case X86::INDIRECT_THUNK_CALL32:
36478 return X86::CALLpcrel32;
36479 case X86::INDIRECT_THUNK_CALL64:
36480 return X86::CALL64pcrel32;
36481 case X86::INDIRECT_THUNK_TCRETURN32:
36482 return X86::TCRETURNdi;
36483 case X86::INDIRECT_THUNK_TCRETURN64:
36484 return X86::TCRETURNdi64;
36485 }
36486 llvm_unreachable("not indirect thunk opcode");
36487}
36488
36489static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36490 unsigned Reg) {
36491 if (Subtarget.useRetpolineExternalThunk()) {
36492 // When using an external thunk for retpolines, we pick names that match the
36493 // names GCC happens to use as well. This helps simplify the implementation
36494 // of the thunks for kernels where they have no easy ability to create
36495 // aliases and are doing non-trivial configuration of the thunk's body. For
36496 // example, the Linux kernel will do boot-time hot patching of the thunk
36497 // bodies and cannot easily export aliases of these to loaded modules.
36498 //
36499 // Note that at any point in the future, we may need to change the semantics
36500 // of how we implement retpolines and at that time will likely change the
36501 // name of the called thunk. Essentially, there is no hard guarantee that
36502 // LLVM will generate calls to specific thunks, we merely make a best-effort
36503 // attempt to help out kernels and other systems where duplicating the
36504 // thunks is costly.
36505 switch (Reg) {
36506 case X86::EAX:
36507 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36508 return "__x86_indirect_thunk_eax";
36509 case X86::ECX:
36510 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36511 return "__x86_indirect_thunk_ecx";
36512 case X86::EDX:
36513 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36514 return "__x86_indirect_thunk_edx";
36515 case X86::EDI:
36516 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36517 return "__x86_indirect_thunk_edi";
36518 case X86::R11:
36519 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36520 return "__x86_indirect_thunk_r11";
36521 }
36522 llvm_unreachable("unexpected reg for external indirect thunk");
36523 }
36524
36525 if (Subtarget.useRetpolineIndirectCalls() ||
36526 Subtarget.useRetpolineIndirectBranches()) {
36527 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36528 switch (Reg) {
36529 case X86::EAX:
36530 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36531 return "__llvm_retpoline_eax";
36532 case X86::ECX:
36533 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36534 return "__llvm_retpoline_ecx";
36535 case X86::EDX:
36536 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36537 return "__llvm_retpoline_edx";
36538 case X86::EDI:
36539 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36540 return "__llvm_retpoline_edi";
36541 case X86::R11:
36542 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36543 return "__llvm_retpoline_r11";
36544 }
36545 llvm_unreachable("unexpected reg for retpoline");
36546 }
36547
36548 if (Subtarget.useLVIControlFlowIntegrity()) {
36549 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36550 return "__llvm_lvi_thunk_r11";
36551 }
36552 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36553}
36554
36556X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36557 MachineBasicBlock *BB) const {
36558 // Copy the virtual register into the R11 physical register and
36559 // call the retpoline thunk.
36560 const MIMetadata MIMD(MI);
36561 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36562 Register CalleeVReg = MI.getOperand(0).getReg();
36563 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36564
36565 // Find an available scratch register to hold the callee. On 64-bit, we can
36566 // just use R11, but we scan for uses anyway to ensure we don't generate
36567 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36568 // already a register use operand to the call to hold the callee. If none
36569 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36570 // register and ESI is the base pointer to realigned stack frames with VLAs.
36571 SmallVector<unsigned, 3> AvailableRegs;
36572 if (Subtarget.is64Bit())
36573 AvailableRegs.push_back(X86::R11);
36574 else
36575 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36576
36577 // Zero out any registers that are already used.
  // (0 is a sentinel meaning "taken"; taken entries are skipped below.)
36578 for (const auto &MO : MI.operands()) {
36579 if (MO.isReg() && MO.isUse())
36580 llvm::replace(AvailableRegs, static_cast<unsigned>(MO.getReg()), 0U);
36581 }
36582
36583 // Choose the first remaining non-zero available register.
36584 unsigned AvailableReg = 0;
36585 for (unsigned MaybeReg : AvailableRegs) {
36586 if (MaybeReg) {
36587 AvailableReg = MaybeReg;
36588 break;
36589 }
36590 }
36591 if (!AvailableReg)
36592 report_fatal_error("calling convention incompatible with retpoline, no "
36593 "available registers");
36594
36595 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36596
  // Materialize the callee in the scratch register, then rewrite the pseudo
  // in place into a direct call/tail-call to the thunk symbol, with the
  // scratch register as an implicit (killed) argument.
36597 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36598 .addReg(CalleeVReg);
36599 MI.getOperand(0).ChangeToES(Symbol);
36600 MI.setDesc(TII->get(Opc));
36602 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36603 return BB;
36604}
36605
36606/// SetJmp implies future control flow change upon calling the corresponding
36607/// LongJmp.
36608/// Instead of using the 'return' instruction, the long jump fixes the stack and
36609/// performs an indirect branch. To do so it uses the registers that were stored
36610/// in the jump buffer (when calling SetJmp).
36611/// In case the shadow stack is enabled we need to fix it as well, because some
36612/// return addresses will be skipped.
36613/// The function will save the SSP for future fixing in the function
36614/// emitLongJmpShadowStackFix.
36615/// \sa emitLongJmpShadowStackFix
36616/// \param [in] MI The temporary Machine Instruction for the builtin.
36617/// \param [in] MBB The Machine Basic Block that will be modified.
36618void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36619                                                 MachineBasicBlock *MBB) const {
36620  const MIMetadata MIMD(MI);
36621  MachineFunction *MF = MBB->getParent();
36622  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36625
36626  // Memory Reference.
36627  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36628
36629  // Initialize a register with zero.
36630  MVT PVT = getPointerTy(MF->getDataLayout());
36631  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36632  Register ZReg = MRI.createVirtualRegister(PtrRC);
36633  unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36634  // xor ZReg, ZReg — both sources marked Undef since we only want the zero.
36635  BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
36636      .addDef(ZReg)
36637      .addReg(ZReg, RegState::Undef)
36638      .addReg(ZReg, RegState::Undef);
36639
36640  // Read the current SSP Register value to the zeroed register.
36641  Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36642  unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36643  BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36644
36645  // Write the SSP register value to offset 3 in input memory buffer.
36646  // Buffer slot 3 (3 * pointer size) holds the saved shadow-stack pointer;
36647  // emitLongJmpShadowStackFix reloads it from the same offset.
36648  unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36649  MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
36650  const int64_t SSPOffset = 3 * PVT.getStoreSize();
36651  // Operand 0 of MI is the setjmp result; the 5 address operands start at 1.
36652  const unsigned MemOpndSlot = 1;
36653  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36654    if (i == X86::AddrDisp)
36655      MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
36656    else
36657      MIB.add(MI.getOperand(MemOpndSlot + i));
36658  }
36659  MIB.addReg(SSPCopyReg);
36660  MIB.setMemRefs(MMOs);
36661}
36658
/// Lower an EH_SjLj_SetJmp pseudo. Splits \p MBB around \p MI into
/// thisMBB/mainMBB/sinkMBB and appends restoreMBB, storing the restore
/// address into the jump buffer so a later longjmp can branch to it.
/// Returns the sink block holding the merged result PHI (0 on the direct
/// path, 1 when re-entered via longjmp).
36660X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
36661                                    MachineBasicBlock *MBB) const {
36662  const MIMetadata MIMD(MI);
36663  MachineFunction *MF = MBB->getParent();
36664  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36665  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36667
36668  const BasicBlock *BB = MBB->getBasicBlock();
36670
36671  // Memory Reference
36672  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36673
36674  unsigned DstReg;
36675  unsigned MemOpndSlot = 0;
36676
36677  unsigned CurOp = 0;
36678
36679  // Operand 0 is the i32 result; the buffer address operands follow it.
36680  DstReg = MI.getOperand(CurOp++).getReg();
36681  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36682  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
36683  (void)TRI;
36684  Register mainDstReg = MRI.createVirtualRegister(RC);
36685  Register restoreDstReg = MRI.createVirtualRegister(RC);
36686
36687  MemOpndSlot = CurOp;
36688
36689  MVT PVT = getPointerTy(MF->getDataLayout());
36690  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36691         "Invalid Pointer Size!");
36692
36693  // For v = setjmp(buf), we generate
36694  //
36695  // thisMBB:
36696  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
36697  //  SjLjSetup restoreMBB
36698  //
36699  // mainMBB:
36700  //  v_main = 0
36701  //
36702  // sinkMBB:
36703  //  v = phi(main, restore)
36704  //
36705  // restoreMBB:
36706  //  if base pointer being used, load it from frame
36707  //  v_restore = 1
36708
36709  MachineBasicBlock *thisMBB = MBB;
36710  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36711  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36712  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
36713  MF->insert(I, mainMBB);
36714  MF->insert(I, sinkMBB);
36715  // restoreMBB goes at the end of the function; its address is taken, so
36716  // mark it so the block is not merged away.
36717  MF->push_back(restoreMBB);
36718  restoreMBB->setMachineBlockAddressTaken();
36719
36720
36721  // Transfer the remainder of BB and its successor edges to sinkMBB.
36722  sinkMBB->splice(sinkMBB->begin(), MBB,
36723                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36724
36725  // thisMBB:
36726  unsigned PtrStoreOpc = 0;
36727  unsigned LabelReg = 0;
36728  // Slot 1 of the buffer holds the resume (label) address.
36729  const int64_t LabelOffset = 1 * PVT.getStoreSize();
36730  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36731
36732  // Prepare IP either in reg or imm.
36733  if (!UseImmLabel) {
36734    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36735    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36736    LabelReg = MRI.createVirtualRegister(PtrRC);
36737    if (Subtarget.is64Bit()) {
36738      // RIP-relative LEA of restoreMBB's address.
36739      MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
36740                .addReg(X86::RIP)
36741                .addImm(0)
36742                .addReg(0)
36743                .addMBB(restoreMBB)
36744                .addReg(0);
36745    } else {
36746      // 32-bit: address restoreMBB relative to the PIC base register.
36747      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
36748      MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
36749                .addReg(XII->getGlobalBaseReg(MF))
36750                .addImm(0)
36751                .addReg(0)
36752                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
36753                .addReg(0);
36754    }
36755  } else
36756    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36757  // Store IP
36758  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
36759  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36760    if (i == X86::AddrDisp)
36761      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
36762    else
36763      MIB.add(MI.getOperand(MemOpndSlot + i));
36764  }
36765  if (!UseImmLabel)
36766    MIB.addReg(LabelReg);
36767  else
36768    MIB.addMBB(restoreMBB);
36769  MIB.setMemRefs(MMOs);
36770
36771  // With CET return protection enabled, also save the shadow-stack pointer
36772  // into the buffer so longjmp can repair the shadow stack.
36773  if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36774    emitSetJmpShadowStackFix(MI, thisMBB);
36775  }
36776
36777  // Setup
36778  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
36779          .addMBB(restoreMBB);
36780
36781  // No registers survive a longjmp re-entry; clobber everything.
36782  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36783  MIB.addRegMask(RegInfo->getNoPreservedMask());
36784  thisMBB->addSuccessor(mainMBB);
36785  thisMBB->addSuccessor(restoreMBB);
36786
36787  // mainMBB:
36788  //  EAX = 0
36789  BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
36790  mainMBB->addSuccessor(sinkMBB);
36791
36792  // sinkMBB:
36793  BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
36794      .addReg(mainDstReg)
36795      .addMBB(mainMBB)
36796      .addReg(restoreDstReg)
36797      .addMBB(restoreMBB);
36798
36799  // restoreMBB:
36800  if (RegInfo->hasBasePointer(*MF)) {
36801    // The base pointer was clobbered by the longjmp path; reload it from the
36802    // slot the frame lowering reserved for it.
36803    const bool Uses64BitFramePtr =
36804        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36805    X86FI->setRestoreBasePointer(MF);
36806    Register FramePtr = RegInfo->getFrameRegister(*MF);
36807    Register BasePtr = RegInfo->getBaseRegister();
36808    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
36809    addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
36810                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
36811  }
36812  BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
36813  BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36814  restoreMBB->addSuccessor(sinkMBB);
36815
36816  MI.eraseFromParent();
36817  return sinkMBB;
36818}
36813
36814/// Fix the shadow stack using the previously saved SSP pointer.
36815/// \sa emitSetJmpShadowStackFix
36816/// \param [in] MI The temporary Machine Instruction for the builtin.
36817/// \param [in] MBB The Machine Basic Block that will be modified.
36818/// \return The sink MBB that will perform the future indirect branch.
36820X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
36821                                             MachineBasicBlock *MBB) const {
36822  const MIMetadata MIMD(MI);
36823  MachineFunction *MF = MBB->getParent();
36824  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36826
36827  // Memory Reference
36828  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36829
36830  MVT PVT = getPointerTy(MF->getDataLayout());
36831  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36832
36833  // checkSspMBB:
36834  //  xor vreg1, vreg1
36835  //  rdssp vreg1
36836  //  test vreg1, vreg1
36837  //  je sinkMBB   # Jump if Shadow Stack is not supported
36838  // fallMBB:
36839  //  mov buf+24/12(%rip), vreg2
36840  //  sub vreg1, vreg2
36841  //  jbe sinkMBB  # No need to fix the Shadow Stack
36842  // fixShadowMBB:
36843  //  shr 3/2, vreg2
36844  //  incssp vreg2 # fix the SSP according to the lower 8 bits
36845  //  shr 8, vreg2
36846  //  je sinkMBB
36847  // fixShadowLoopPrepareMBB:
36848  //  shl vreg2
36849  //  mov 128, vreg3
36850  // fixShadowLoopMBB:
36851  //  incssp vreg3
36852  //  dec vreg2
36853  //  jne fixShadowLoopMBB # Iterate until you finish fixing
36854  //                       # the Shadow Stack
36855  // sinkMBB:
36856
36857  const BasicBlock *BB = MBB->getBasicBlock();
36858
36859  MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
36860  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36861  MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
36862  MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
36863  MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
36864  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36865  MF->insert(I, checkSspMBB);
36866  MF->insert(I, fallMBB);
36867  MF->insert(I, fixShadowMBB);
36868  MF->insert(I, fixShadowLoopPrepareMBB);
36869  MF->insert(I, fixShadowLoopMBB);
36870  MF->insert(I, sinkMBB);
36871
36872  // Transfer the remainder of BB and its successor edges to sinkMBB.
36873  // Note: MI itself moves to sinkMBB so emitEHSjLjLongJmp can keep
36874  // expanding it there.
36875  sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
36876                  MBB->end());
36877
36878  MBB->addSuccessor(checkSspMBB);
36879
36880  // Initialize a register with zero.
36881  Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
36882  BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
36883
36884  if (PVT == MVT::i64) {
36885    // Widen the 32-bit zero to 64 bits for RDSSPQ.
36886    Register TmpZReg = MRI.createVirtualRegister(PtrRC);
36887    BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36888        .addImm(0)
36889        .addReg(ZReg)
36890        .addImm(X86::sub_32bit);
36891    ZReg = TmpZReg;
36892  }
36893
36894  // Read the current SSP Register value to the zeroed register.
36895  // RDSSP leaves its operand unchanged when shadow stacks are disabled,
36896  // so a zero result means "not supported" below.
36897  Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36898  unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36899  BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36900
36901  // Check whether the result of the SSP register is zero and jump directly
36902  // to the sink.
36903  unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
36904  BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
36905      .addReg(SSPCopyReg)
36906      .addReg(SSPCopyReg);
36907  BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
36908      .addMBB(sinkMBB)
36909  checkSspMBB->addSuccessor(sinkMBB);
36910  checkSspMBB->addSuccessor(fallMBB);
36911
36912  // Reload the previously saved SSP register value.
36913  // Slot 3 of the jump buffer — written by emitSetJmpShadowStackFix.
36914  Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
36915  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36916  const int64_t SPPOffset = 3 * PVT.getStoreSize();
36917      BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
36918  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36919    const MachineOperand &MO = MI.getOperand(i);
36920    if (i == X86::AddrDisp)
36921      MIB.addDisp(MO, SPPOffset);
36922    else if (MO.isReg()) // Don't add the whole operand, we don't want to
36923                         // preserve kill flags.
36924      MIB.addReg(MO.getReg());
36925    else
36926      MIB.add(MO);
36927  }
36928  MIB.setMemRefs(MMOs);
36929
36930  // Subtract the current SSP from the previous SSP.
36931  Register SspSubReg = MRI.createVirtualRegister(PtrRC);
36932  unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
36933  BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
36934      .addReg(PrevSSPReg)
36935      .addReg(SSPCopyReg);
36936
36937  // Jump to sink in case PrevSSPReg <= SSPCopyReg.
36938  BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
36939      .addMBB(sinkMBB)
36940  fallMBB->addSuccessor(sinkMBB);
36941  fallMBB->addSuccessor(fixShadowMBB);
36942
36943  // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
36944  unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
36945  unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
36946  Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
36947  BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
36948      .addReg(SspSubReg)
36949      .addImm(Offset);
36950
36951  // Increase SSP when looking only on the lower 8 bits of the delta.
36952  // INCSSP consumes at most the low 8 bits of its operand.
36953  unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
36954  BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36955
36956  // Reset the lower 8 bits.
36957  Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
36958  BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
36959      .addReg(SspFirstShrReg)
36960      .addImm(8);
36961
36962  // Jump if the result of the shift is zero.
36963  BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
36964      .addMBB(sinkMBB)
36965  fixShadowMBB->addSuccessor(sinkMBB);
36966  fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36967
36968  // Do a single shift left.
36969  // shl 1 + incssp(128) per iteration together account for the remaining
36970  // delta in units of 256 shadow-stack slots.
36971  unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
36972  Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
36973  BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
36974      .addReg(SspSecondShrReg)
36975      .addImm(1);
36976
36977  // Save the value 128 to a register (will be used next with incssp).
36978  Register Value128InReg = MRI.createVirtualRegister(PtrRC);
36979  unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
36980  BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
36981      .addImm(128);
36982  fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36983
36984  // Since incssp only looks at the lower 8 bits, we might need to do several
36985  // iterations of incssp until we finish fixing the shadow stack.
36986  Register DecReg = MRI.createVirtualRegister(PtrRC);
36987  Register CounterReg = MRI.createVirtualRegister(PtrRC);
36988  BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
36989      .addReg(SspAfterShlReg)
36990      .addMBB(fixShadowLoopPrepareMBB)
36991      .addReg(DecReg)
36992      .addMBB(fixShadowLoopMBB);
36993
36994  // Every iteration we increase the SSP by 128.
36995  BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
36996
36997  // Every iteration we decrement the counter by 1.
36998  unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
36999  BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37000
37001  // Jump if the counter is not zero yet.
37002  BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37003      .addMBB(fixShadowLoopMBB)
37004  fixShadowLoopMBB->addSuccessor(sinkMBB);
37005  fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37006
37007  return sinkMBB;
37008}
37007
/// Lower an EH_SjLj_LongJmp pseudo: restore FP, the saved resume address and
/// SP from the jump buffer (slots 0, 1 and 2 respectively), optionally fix
/// the shadow stack first, then jump indirectly to the resume address.
/// Returns the block containing the expansion (may differ from \p MBB when
/// the shadow-stack fix split the block).
37009X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37010                                     MachineBasicBlock *MBB) const {
37011  const MIMetadata MIMD(MI);
37012  MachineFunction *MF = MBB->getParent();
37013  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37015
37016  // Memory Reference
37017  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37018
37019  MVT PVT = getPointerTy(MF->getDataLayout());
37020  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37021         "Invalid Pointer Size!");
37022
37023  const TargetRegisterClass *RC =
37024      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37025  Register Tmp = MRI.createVirtualRegister(RC);
37026  // Since FP is only updated here but NOT referenced, it's treated as GPR.
37027  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37028  Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37029  Register SP = RegInfo->getStackRegister();
37030
37031
37032  // Buffer layout used here: slot 0 = FP, slot 1 = resume label, slot 2 = SP.
37033  const int64_t LabelOffset = 1 * PVT.getStoreSize();
37034  const int64_t SPOffset = 2 * PVT.getStoreSize();
37035
37036  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37037  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37038
37039  MachineBasicBlock *thisMBB = MBB;
37040
37041  // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37042  if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37043    thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37044  }
37045
37046  // Reload FP
37047  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37048  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37049    const MachineOperand &MO = MI.getOperand(i);
37050    if (MO.isReg()) // Don't add the whole operand, we don't want to
37051                    // preserve kill flags.
37052      MIB.addReg(MO.getReg());
37053    else
37054      MIB.add(MO);
37055  }
37056  MIB.setMemRefs(MMOs);
37057
37058
37059  // Reload IP
37060  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37061  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37062    const MachineOperand &MO = MI.getOperand(i);
37063    if (i == X86::AddrDisp)
37064      MIB.addDisp(MO, LabelOffset);
37065    else if (MO.isReg()) // Don't add the whole operand, we don't want to
37066                         // preserve kill flags.
37066      MIB.addReg(MO.getReg());
37068    else
37069      MIB.add(MO);
37070  }
37071  MIB.setMemRefs(MMOs);
37072
37073  // Reload SP
37074  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37075  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37076    if (i == X86::AddrDisp)
37077      MIB.addDisp(MI.getOperand(i), SPOffset);
37078    else
37079      MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37080                                 // the last instruction of the expansion.
37081  }
37082  MIB.setMemRefs(MMOs);
37083
37084
37085  // Jump
37086  BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37087
37088  MI.eraseFromParent();
37089  return thisMBB;
37090}
37091
/// Store the address of \p DispatchBB into the SjLj function-context object
/// at frame index \p FI so the runtime can dispatch back into this function.
/// The address is stored either as an immediate (small, static code model) or
/// via an LEA into a virtual register.
37092void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37094                                               MachineBasicBlock *DispatchBB,
37095                                               int FI) const {
37096  const MIMetadata MIMD(MI);
37097  MachineFunction *MF = MBB->getParent();
37099  const X86InstrInfo *TII = Subtarget.getInstrInfo();
37100
37101  MVT PVT = getPointerTy(MF->getDataLayout());
37102  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37103
37104  unsigned Op = 0;
37105  unsigned VR = 0;
37106
37107  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37109
37110  if (UseImmLabel) {
37111    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37112  } else {
37113    const TargetRegisterClass *TRC =
37114        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37115    VR = MRI->createVirtualRegister(TRC);
37116    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37117
37118    if (Subtarget.is64Bit())
37119      BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37120          .addReg(X86::RIP)
37121          .addImm(1)
37122          .addReg(0)
37123          .addMBB(DispatchBB)
37124          .addReg(0);
37125    else
37126      BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37127          .addReg(0) /* TII->getGlobalBaseReg(MF) */
37128          .addImm(1)
37129          .addReg(0)
37130          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37131          .addReg(0);
37132  }
37133
37134  // Store into the context object; 56/36 is the offset of the dispatch
37135  // address field within the 64-/32-bit function context layout.
37136  MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37137  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37138  if (UseImmLabel)
37139    MIB.addMBB(DispatchBB);
37140  else
37141    MIB.addReg(VR);
37142}
37141
/// Build the SjLj exception dispatch table for the function: create a
/// dispatch block that indexes a jump table of landing pads by the call-site
/// number stored in the function context, rewire every invoke block's EH
/// successor edge to the dispatch block, and register the dispatch address
/// via SetupEntryBlockForSjLj. Returns \p BB unchanged.
37143X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37144                                         MachineBasicBlock *BB) const {
37145  const MIMetadata MIMD(MI);
37146  MachineFunction *MF = BB->getParent();
37148  const X86InstrInfo *TII = Subtarget.getInstrInfo();
37149  int FI = MF->getFrameInfo().getFunctionContextIndex();
37150
37151  // Get a mapping of the call site numbers to all of the landing pads they're
37152  // associated with.
37154  unsigned MaxCSNum = 0;
37155  for (auto &MBB : *MF) {
37156    if (!MBB.isEHPad())
37157      continue;
37158
37159    // The first non-debug instruction of an EH pad is its EH_LABEL; its
37160    // symbol keys the call-site lookup below.
37160    MCSymbol *Sym = nullptr;
37161    for (const auto &MI : MBB) {
37162      if (MI.isDebugInstr())
37163        continue;
37164
37165      assert(MI.isEHLabel() && "expected EH_LABEL");
37166      Sym = MI.getOperand(0).getMCSymbol();
37167      break;
37168    }
37169
37170    if (!MF->hasCallSiteLandingPad(Sym))
37171      continue;
37172
37173    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37174      CallSiteNumToLPad[CSI].push_back(&MBB);
37175      MaxCSNum = std::max(MaxCSNum, CSI);
37176    }
37177  }
37178
37179  // Get an ordered list of the machine basic blocks for the jump table.
37180  std::vector<MachineBasicBlock *> LPadList;
37182  LPadList.reserve(CallSiteNumToLPad.size());
37183
37184  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37185    for (auto &LP : CallSiteNumToLPad[CSI]) {
37186      LPadList.push_back(LP);
37187      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37188    }
37189  }
37190
37191  assert(!LPadList.empty() &&
37192         "No landing pad destinations for the dispatch jump table!");
37193
37194  // Create the MBBs for the dispatch code.
37195
37196  // Shove the dispatch's address into the return slot in the function context.
37197  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37198  DispatchBB->setIsEHPad(true);
37199
37200  // TrapBB catches out-of-range call-site indices.
37201  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37202  BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37203  DispatchBB->addSuccessor(TrapBB);
37204
37205  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37206  DispatchBB->addSuccessor(DispContBB);
37207
37208  // Insert MBBs.
37209  MF->push_back(DispatchBB);
37210  MF->push_back(DispContBB);
37211  MF->push_back(TrapBB);
37212
37213  // Insert code into the entry block that creates and registers the function
37214  // context.
37215  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37216
37217  // Create the jump table and associated information
37218  unsigned JTE = getJumpTableEncoding();
37219  MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37220  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37221
37222  const X86RegisterInfo &RI = TII->getRegisterInfo();
37223  // Add a register mask with no preserved registers. This results in all
37224  // registers being marked as clobbered.
37225  if (RI.hasBasePointer(*MF)) {
37226    const bool FPIs64Bit =
37227        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37228    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37229    MFI->setRestoreBasePointer(MF);
37230
37231    Register FP = RI.getFrameRegister(*MF);
37232    Register BP = RI.getBaseRegister();
37233    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37234    addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37235  } else {
37236    BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37237  }
37238
37239  // IReg is used as an index in a memory operand and therefore can't be SP
37240  Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37241  // The call-site index lives at offset 8/4 in the function context.
37242  addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37243                    Subtarget.is64Bit() ? 8 : 4);
37244  BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37245      .addReg(IReg)
37246      .addImm(LPadList.size());
37247  BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37248      .addMBB(TrapBB)
37249
37250  if (Subtarget.is64Bit()) {
37251    Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37252    Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37253
37254    // leaq .LJTI0_0(%rip), BReg
37255    BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37256        .addReg(X86::RIP)
37257        .addImm(1)
37258        .addReg(0)
37259        .addJumpTableIndex(MJTI)
37260        .addReg(0);
37261    // movzx IReg64, IReg
37262    BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37263        .addImm(0)
37264        .addReg(IReg)
37265        .addImm(X86::sub_32bit);
37266
37267    switch (JTE) {
37268      // jmpq *(BReg,IReg64,8)
37269      BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37270          .addReg(BReg)
37271          .addImm(8)
37272          .addReg(IReg64)
37273          .addImm(0)
37274          .addReg(0);
37275      break;
37276      // Label-difference entries: load the 32-bit offset, sign-extend and
37277      // add the table base before the indirect jump.
37278      Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37279      Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37280      Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37281
37282      // movl (BReg,IReg64,4), OReg
37283      BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37284          .addReg(BReg)
37285          .addImm(4)
37286          .addReg(IReg64)
37287          .addImm(0)
37288          .addReg(0);
37289      // movsx OReg64, OReg
37290      BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37291          .addReg(OReg);
37292      // addq BReg, OReg64, TReg
37293      BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37294          .addReg(OReg64)
37295          .addReg(BReg);
37296      // jmpq *TReg
37297      BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37298      break;
37299    }
37300    default:
37301      llvm_unreachable("Unexpected jump table encoding");
37302    }
37303  } else {
37304    // jmpl *.LJTI0_0(,IReg,4)
37305    BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37306        .addReg(0)
37307        .addImm(4)
37308        .addReg(IReg)
37309        .addJumpTableIndex(MJTI)
37310        .addReg(0);
37311  }
37312
37313  // Add the jump table entries as successors to the MBB.
37314  // SeenMBBs deduplicates pads reachable from multiple call sites.
37315  for (auto &LP : LPadList)
37316    if (SeenMBBs.insert(LP).second)
37317      DispContBB->addSuccessor(LP);
37318
37319  // N.B. the order the invoke BBs are processed in doesn't matter here.
37320  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37321  for (MachineBasicBlock *MBB : InvokeBBs) {
37322    // Remove the landing pad successor from the invoke block and replace it
37323    // with the new dispatch block.
37324    // Keep a copy of Successors since it's modified inside the loop.
37325                                                  MBB->succ_rend());
37326    // FIXME: Avoid quadratic complexity.
37327    for (auto *MBBS : Successors) {
37328      if (MBBS->isEHPad()) {
37329        MBB->removeSuccessor(MBBS);
37330        MBBLPads.push_back(MBBS);
37331      }
37332    }
37333
37334    MBB->addSuccessor(DispatchBB);
37335
37336    // Find the invoke call and mark all of the callee-saved registers as
37337    // 'implicit defined' so that they're spilled. This prevents code from
37338    // moving instructions to before the EH block, where they will never be
37339    // executed.
37340    for (auto &II : reverse(*MBB)) {
37341      if (!II.isCall())
37342        continue;
37343
37344      for (auto &MOp : II.operands())
37345        if (MOp.isReg())
37346          DefRegs[MOp.getReg()] = true;
37347
37348      MachineInstrBuilder MIB(*MF, &II);
37349      for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37350        unsigned Reg = SavedRegs[RegIdx];
37351        if (!DefRegs[Reg])
37352      }
37353
37354      // Only the last call in the block is the invoke; stop after it.
37355      break;
37356    }
37357  }
37358
37359  // Mark all former landing pads as non-landing pads. The dispatch is the only
37360  // landing pad now.
37361  for (auto &LP : MBBLPads)
37362    LP->setIsEHPad(false);
37363
37364  // The instruction is gone now.
37365  MI.eraseFromParent();
37366  return BB;
37367}
37372
/// Bracket a patchable (XRay) event-call pseudo with CALLSEQ_START /
/// CALLSEQ_END so frame lowering treats it like a real call and keeps the
/// stack properly aligned at the patch point. Returns \p BB unchanged.
37374X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37375                                          MachineBasicBlock *BB) const {
37376  // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37377  // calls may require proper stack alignment.
37378  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37379  const MIMetadata MIMD(MI);
37380  MachineFunction &MF = *BB->getParent();
37381
37382  // Emit CALLSEQ_START right before the instruction.
37383  // Zero immediates: no stack adjustment is actually requested.
37384  MF.getFrameInfo().setAdjustsStack(true);
37385  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37386  MachineInstrBuilder CallseqStart =
37387      BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37388  BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37389
37390  // Emit CALLSEQ_END right after the instruction.
37391  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37392  MachineInstrBuilder CallseqEnd =
37393      BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37394  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37395
37396  return BB;
37397}
37397
37400 MachineBasicBlock *BB) const {
37401 MachineFunction *MF = BB->getParent();
37402 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37403 const MIMetadata MIMD(MI);
37404
37405 auto TMMImmToTMMReg = [](unsigned Imm) {
37406 assert (Imm < 8 && "Illegal tmm index");
37407 return X86::TMM0 + Imm;
37408 };
37409 auto TMMImmToTMMPair = [](unsigned Imm) {
37410 assert(Imm < 8 && "Illegal tmm pair index.");
37411 return X86::TMM0_TMM1 + Imm / 2;
37412 };
37413 switch (MI.getOpcode()) {
37414 default:
37415 llvm_unreachable("Unexpected instr type to insert");
37416 case X86::INDIRECT_THUNK_CALL32:
37417 case X86::INDIRECT_THUNK_CALL64:
37418 case X86::INDIRECT_THUNK_TCRETURN32:
37419 case X86::INDIRECT_THUNK_TCRETURN64:
37420 return EmitLoweredIndirectThunk(MI, BB);
37421 case X86::CATCHRET:
37422 return EmitLoweredCatchRet(MI, BB);
37423 case X86::SEG_ALLOCA_32:
37424 case X86::SEG_ALLOCA_64:
37425 return EmitLoweredSegAlloca(MI, BB);
37426 case X86::PROBED_ALLOCA_32:
37427 case X86::PROBED_ALLOCA_64:
37428 return EmitLoweredProbedAlloca(MI, BB);
37429 case X86::TLSCall_32:
37430 case X86::TLSCall_64:
37431 return EmitLoweredTLSCall(MI, BB);
37432 case X86::CMOV_FR16:
37433 case X86::CMOV_FR16X:
37434 case X86::CMOV_FR32:
37435 case X86::CMOV_FR32X:
37436 case X86::CMOV_FR64:
37437 case X86::CMOV_FR64X:
37438 case X86::CMOV_GR8:
37439 case X86::CMOV_GR16:
37440 case X86::CMOV_GR32:
37441 case X86::CMOV_RFP32:
37442 case X86::CMOV_RFP64:
37443 case X86::CMOV_RFP80:
37444 case X86::CMOV_VR64:
37445 case X86::CMOV_VR128:
37446 case X86::CMOV_VR128X:
37447 case X86::CMOV_VR256:
37448 case X86::CMOV_VR256X:
37449 case X86::CMOV_VR512:
37450 case X86::CMOV_VK1:
37451 case X86::CMOV_VK2:
37452 case X86::CMOV_VK4:
37453 case X86::CMOV_VK8:
37454 case X86::CMOV_VK16:
37455 case X86::CMOV_VK32:
37456 case X86::CMOV_VK64:
37457 return EmitLoweredSelect(MI, BB);
37458
37459 case X86::FP80_ADDr:
37460 case X86::FP80_ADDm32: {
37461 // Change the floating point control register to use double extended
37462 // precision when performing the addition.
37463 int OrigCWFrameIdx =
37464 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37465 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37466 OrigCWFrameIdx);
37467
37468 // Load the old value of the control word...
37469 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37470 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37471 OrigCWFrameIdx);
37472
37473 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37474 // precision.
37475 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37476 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37477 .addReg(OldCW, RegState::Kill)
37478 .addImm(0x300);
37479
37480 // Extract to 16 bits.
37481 Register NewCW16 =
37482 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37483 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37484 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37485
37486 // Prepare memory for FLDCW.
37487 int NewCWFrameIdx =
37488 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37489 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37490 NewCWFrameIdx)
37491 .addReg(NewCW16, RegState::Kill);
37492
37493 // Reload the modified control word now...
37494 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37495 NewCWFrameIdx);
37496
37497 // Do the addition.
37498 if (MI.getOpcode() == X86::FP80_ADDr) {
37499 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37500 .add(MI.getOperand(0))
37501 .add(MI.getOperand(1))
37502 .add(MI.getOperand(2));
37503 } else {
37504 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37505 .add(MI.getOperand(0))
37506 .add(MI.getOperand(1))
37507 .add(MI.getOperand(2))
37508 .add(MI.getOperand(3))
37509 .add(MI.getOperand(4))
37510 .add(MI.getOperand(5))
37511 .add(MI.getOperand(6));
37512 }
37513
37514 // Reload the original control word now.
37515 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37516 OrigCWFrameIdx);
37517
37518 MI.eraseFromParent(); // The pseudo instruction is gone now.
37519 return BB;
37520 }
37521
37522 case X86::FP32_TO_INT16_IN_MEM:
37523 case X86::FP32_TO_INT32_IN_MEM:
37524 case X86::FP32_TO_INT64_IN_MEM:
37525 case X86::FP64_TO_INT16_IN_MEM:
37526 case X86::FP64_TO_INT32_IN_MEM:
37527 case X86::FP64_TO_INT64_IN_MEM:
37528 case X86::FP80_TO_INT16_IN_MEM:
37529 case X86::FP80_TO_INT32_IN_MEM:
37530 case X86::FP80_TO_INT64_IN_MEM: {
37531 // Change the floating point control register to use "round towards zero"
37532 // mode when truncating to an integer value.
37533 int OrigCWFrameIdx =
37534 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37535 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37536 OrigCWFrameIdx);
37537
37538 // Load the old value of the control word...
37539 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37540 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37541 OrigCWFrameIdx);
37542
37543 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37544 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37545 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37546 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37547
37548 // Extract to 16 bits.
37549 Register NewCW16 =
37550 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37551 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37552 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37553
37554 // Prepare memory for FLDCW.
37555 int NewCWFrameIdx =
37556 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37557 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37558 NewCWFrameIdx)
37559 .addReg(NewCW16, RegState::Kill);
37560
37561 // Reload the modified control word now...
37562 addFrameReference(BuildMI(*BB, MI, MIMD,
37563 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37564
37565 // Get the X86 opcode to use.
37566 unsigned Opc;
37567 switch (MI.getOpcode()) {
37568 // clang-format off
37569 default: llvm_unreachable("illegal opcode!");
37570 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37571 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37572 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37573 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37574 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37575 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37576 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37577 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37578 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37579 // clang-format on
37580 }
37581
37583 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37584 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37585
37586 // Reload the original control word now.
37587 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37588 OrigCWFrameIdx);
37589
37590 MI.eraseFromParent(); // The pseudo instruction is gone now.
37591 return BB;
37592 }
37593
37594 // xbegin
37595 case X86::XBEGIN:
37596 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37597
37598 case X86::VAARG_64:
37599 case X86::VAARG_X32:
37600 return EmitVAARGWithCustomInserter(MI, BB);
37601
37602 case X86::EH_SjLj_SetJmp32:
37603 case X86::EH_SjLj_SetJmp64:
37604 return emitEHSjLjSetJmp(MI, BB);
37605
37606 case X86::EH_SjLj_LongJmp32:
37607 case X86::EH_SjLj_LongJmp64:
37608 return emitEHSjLjLongJmp(MI, BB);
37609
37610 case X86::Int_eh_sjlj_setup_dispatch:
37611 return EmitSjLjDispatchBlock(MI, BB);
37612
37613 case TargetOpcode::STATEPOINT:
37614 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37615 // this point in the process. We diverge later.
37616 return emitPatchPoint(MI, BB);
37617
37618 case TargetOpcode::STACKMAP:
37619 case TargetOpcode::PATCHPOINT:
37620 return emitPatchPoint(MI, BB);
37621
37622 case TargetOpcode::PATCHABLE_EVENT_CALL:
37623 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37624 return emitPatchableEventCall(MI, BB);
37625
37626 case X86::LCMPXCHG8B: {
37627 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37628 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
37629 // requires a memory operand. If it happens that current architecture is
37630 // i686 and for current function we need a base pointer
37631 // - which is ESI for i686 - register allocator would not be able to
37632 // allocate registers for an address in form of X(%reg, %reg, Y)
37633 // - there never would be enough unreserved registers during regalloc
37634 // (without the need for base ptr the only option would be X(%edi, %esi, Y).
37635 // We are giving a hand to register allocator by precomputing the address in
37636 // a new vreg using LEA.
37637
37638 // If it is not i686 or there is no base pointer - nothing to do here.
37639 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37640 return BB;
37641
37642 // Even though this code does not necessarily needs the base pointer to
37643 // be ESI, we check for that. The reason: if this assert fails, there are
37644 // some changes happened in the compiler base pointer handling, which most
37645 // probably have to be addressed somehow here.
37646 assert(TRI->getBaseRegister() == X86::ESI &&
37647 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
37648 "base pointer in mind");
37649
37651 MVT SPTy = getPointerTy(MF->getDataLayout());
37652 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
37653 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
37654
37656 // Regalloc does not need any help when the memory operand of CMPXCHG8B
37657 // does not use index register.
37658 if (AM.IndexReg == X86::NoRegister)
37659 return BB;
37660
37661 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
37662 // four operand definitions that are E[ABCD] registers. We skip them and
37663 // then insert the LEA.
37664 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
37665 while (RMBBI != BB->rend() &&
37666 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
37667 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
37668 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
37669 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
37670 ++RMBBI;
37671 }
37674 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
37675
37676 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
37677
37678 return BB;
37679 }
37680 case X86::LCMPXCHG16B_NO_RBX: {
37681 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37682 Register BasePtr = TRI->getBaseRegister();
37683 if (TRI->hasBasePointer(*MF) &&
37684 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
37685 if (!BB->isLiveIn(BasePtr))
37686 BB->addLiveIn(BasePtr);
37687 // Save RBX into a virtual register.
37688 Register SaveRBX =
37689 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37690 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37691 .addReg(X86::RBX);
37692 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37694 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
37695 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37696 MIB.add(MI.getOperand(Idx));
37697 MIB.add(MI.getOperand(X86::AddrNumOperands));
37698 MIB.addReg(SaveRBX);
37699 } else {
37700 // Simple case, just copy the virtual register to RBX.
37701 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
37702 .add(MI.getOperand(X86::AddrNumOperands));
37704 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
37705 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37706 MIB.add(MI.getOperand(Idx));
37707 }
37708 MI.eraseFromParent();
37709 return BB;
37710 }
37711 case X86::MWAITX: {
37712 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37713 Register BasePtr = TRI->getBaseRegister();
37714 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
37715 // If no need to save the base pointer, we generate MWAITXrrr,
37716 // else we generate pseudo MWAITX_SAVE_RBX.
37717 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
37718 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37719 .addReg(MI.getOperand(0).getReg());
37720 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37721 .addReg(MI.getOperand(1).getReg());
37722 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
37723 .addReg(MI.getOperand(2).getReg());
37724 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
37725 MI.eraseFromParent();
37726 } else {
37727 if (!BB->isLiveIn(BasePtr)) {
37728 BB->addLiveIn(BasePtr);
37729 }
37730 // Parameters can be copied into ECX and EAX but not EBX yet.
37731 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37732 .addReg(MI.getOperand(0).getReg());
37733 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37734 .addReg(MI.getOperand(1).getReg());
37735 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
37736 // Save RBX into a virtual register.
37737 Register SaveRBX =
37738 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37739 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37740 .addReg(X86::RBX);
37741 // Generate mwaitx pseudo.
37742 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37743 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
37744 .addDef(Dst) // Destination tied in with SaveRBX.
37745 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
37746 .addUse(SaveRBX); // Save of base pointer.
37747 MI.eraseFromParent();
37748 }
37749 return BB;
37750 }
37751 case TargetOpcode::PREALLOCATED_SETUP: {
37752 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
37753 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37754 MFI->setHasPreallocatedCall(true);
37755 int64_t PreallocatedId = MI.getOperand(0).getImm();
37756 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
37757 assert(StackAdjustment != 0 && "0 stack adjustment");
37758 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
37759 << StackAdjustment << "\n");
37760 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
37761 .addReg(X86::ESP)
37762 .addImm(StackAdjustment);
37763 MI.eraseFromParent();
37764 return BB;
37765 }
37766 case TargetOpcode::PREALLOCATED_ARG: {
37767 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
37768 int64_t PreallocatedId = MI.getOperand(1).getImm();
37769 int64_t ArgIdx = MI.getOperand(2).getImm();
37770 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37771 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
37772 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
37773 << ", arg offset " << ArgOffset << "\n");
37774 // stack pointer + offset
37775 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
37776 MI.getOperand(0).getReg()),
37777 X86::ESP, false, ArgOffset);
37778 MI.eraseFromParent();
37779 return BB;
37780 }
37781 case X86::PTDPBSSD:
37782 case X86::PTDPBSUD:
37783 case X86::PTDPBUSD:
37784 case X86::PTDPBUUD:
37785 case X86::PTDPBF16PS:
37786 case X86::PTDPFP16PS:
37787 case X86::PTCMMIMFP16PS:
37788 case X86::PTCMMRLFP16PS:
37789 case X86::PTDPBF8PS:
37790 case X86::PTDPBHF8PS:
37791 case X86::PTDPHBF8PS:
37792 case X86::PTDPHF8PS:
37793 case X86::PTTDPBF16PS:
37794 case X86::PTTDPFP16PS:
37795 case X86::PTTCMMIMFP16PS:
37796 case X86::PTTCMMRLFP16PS:
37797 case X86::PTCONJTCMMIMFP16PS:
37798 case X86::PTMMULTF32PS:
37799 case X86::PTTMMULTF32PS: {
37800 unsigned Opc;
37801 switch (MI.getOpcode()) {
37802 default: llvm_unreachable("illegal opcode!");
37803 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
37804 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
37805 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
37806 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
37807 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
37808 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
37809 case X86::PTCMMIMFP16PS:
37810 Opc = X86::TCMMIMFP16PS;
37811 break;
37812 case X86::PTCMMRLFP16PS:
37813 Opc = X86::TCMMRLFP16PS;
37814 break;
37815 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
37816 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
37817 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
37818 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
37819 case X86::PTTDPBF16PS:
37820 Opc = X86::TTDPBF16PS;
37821 break;
37822 case X86::PTTDPFP16PS:
37823 Opc = X86::TTDPFP16PS;
37824 break;
37825 case X86::PTTCMMIMFP16PS:
37826 Opc = X86::TTCMMIMFP16PS;
37827 break;
37828 case X86::PTTCMMRLFP16PS:
37829 Opc = X86::TTCMMRLFP16PS;
37830 break;
37831 case X86::PTCONJTCMMIMFP16PS:
37832 Opc = X86::TCONJTCMMIMFP16PS;
37833 break;
37834 case X86::PTMMULTF32PS:
37835 Opc = X86::TMMULTF32PS;
37836 break;
37837 case X86::PTTMMULTF32PS:
37838 Opc = X86::TTMMULTF32PS;
37839 break;
37840 }
37841
37842 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37843 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37844 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
37845 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37846 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
37847
37848 MI.eraseFromParent(); // The pseudo is gone now.
37849 return BB;
37850 }
37851 case X86::PTILEZERO: {
37852 unsigned Imm = MI.getOperand(0).getImm();
37853 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
37854 MI.eraseFromParent(); // The pseudo is gone now.
37855 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37857 return BB;
37858 }
37859 case X86::PTILEZEROV: {
37860 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37862 return BB;
37863 }
37864 case X86::PTILELOADDRS:
37865 case X86::PTILELOADDRST1:
37866 case X86::PTILELOADD:
37867 case X86::PTILELOADDT1:
37868 case X86::PTILESTORED: {
37869 unsigned Opc;
37870 switch (MI.getOpcode()) {
37871 default: llvm_unreachable("illegal opcode!");
37872#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
37873 case X86::PTILELOADD:
37874 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
37875 break;
37876 case X86::PTILELOADDT1:
37877 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
37878 break;
37879 case X86::PTILESTORED:
37880 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
37881 break;
37882 case X86::PTILELOADDRS:
37883 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
37884 break;
37885 case X86::PTILELOADDRST1:
37886 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
37887 break;
37888 }
37889#undef GET_EGPR_IF_ENABLED
37890
37891 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37892 unsigned CurOp = 0;
37893 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
37894 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37896
37897 MIB.add(MI.getOperand(CurOp++)); // base
37898 MIB.add(MI.getOperand(CurOp++)); // scale
37899 MIB.add(MI.getOperand(CurOp++)); // index -- stride
37900 MIB.add(MI.getOperand(CurOp++)); // displacement
37901 MIB.add(MI.getOperand(CurOp++)); // segment
37902
37903 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
37904 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37906
37907 MI.eraseFromParent(); // The pseudo is gone now.
37908 return BB;
37909 }
37910 case X86::PT2RPNTLVWZ0:
37911 case X86::PT2RPNTLVWZ0T1:
37912 case X86::PT2RPNTLVWZ1:
37913 case X86::PT2RPNTLVWZ1T1:
37914 case X86::PT2RPNTLVWZ0RS:
37915 case X86::PT2RPNTLVWZ0RST1:
37916 case X86::PT2RPNTLVWZ1RS:
37917 case X86::PT2RPNTLVWZ1RST1: {
37918 const DebugLoc &DL = MI.getDebugLoc();
37919 unsigned Opc;
37920#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
37921 switch (MI.getOpcode()) {
37922 default:
37923 llvm_unreachable("Unexpected instruction!");
37924 case X86::PT2RPNTLVWZ0:
37925 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
37926 break;
37927 case X86::PT2RPNTLVWZ0T1:
37928 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
37929 break;
37930 case X86::PT2RPNTLVWZ1:
37931 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
37932 break;
37933 case X86::PT2RPNTLVWZ1T1:
37934 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
37935 break;
37936 case X86::PT2RPNTLVWZ0RS:
37937 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
37938 break;
37939 case X86::PT2RPNTLVWZ0RST1:
37940 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
37941 break;
37942 case X86::PT2RPNTLVWZ1RS:
37943 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
37944 break;
37945 case X86::PT2RPNTLVWZ1RST1:
37946 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
37947 break;
37948 }
37949#undef GET_EGPR_IF_ENABLED
37950 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37951 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
37952
37953 MIB.add(MI.getOperand(1)); // base
37954 MIB.add(MI.getOperand(2)); // scale
37955 MIB.add(MI.getOperand(3)); // index
37956 MIB.add(MI.getOperand(4)); // displacement
37957 MIB.add(MI.getOperand(5)); // segment
37958 MI.eraseFromParent(); // The pseudo is gone now.
37959 return BB;
37960 }
37961 case X86::PTTRANSPOSED:
37962 case X86::PTCONJTFP16: {
37963 const DebugLoc &DL = MI.getDebugLoc();
37964 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
37965 : X86::TCONJTFP16;
37966
37967 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37968 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37969 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37970
37971 MI.eraseFromParent(); // The pseudo is gone now.
37972 return BB;
37973 }
37974 case X86::PTCVTROWPS2BF16Hrri:
37975 case X86::PTCVTROWPS2BF16Lrri:
37976 case X86::PTCVTROWPS2PHHrri:
37977 case X86::PTCVTROWPS2PHLrri:
37978 case X86::PTCVTROWD2PSrri:
37979 case X86::PTILEMOVROWrri: {
37980 const DebugLoc &DL = MI.getDebugLoc();
37981 unsigned Opc;
37982 switch (MI.getOpcode()) {
37983 default:
37984 llvm_unreachable("Unexpected instruction!");
37985 case X86::PTCVTROWD2PSrri:
37986 Opc = X86::TCVTROWD2PSrri;
37987 break;
37988 case X86::PTCVTROWPS2BF16Hrri:
37989 Opc = X86::TCVTROWPS2BF16Hrri;
37990 break;
37991 case X86::PTCVTROWPS2PHHrri:
37992 Opc = X86::TCVTROWPS2PHHrri;
37993 break;
37994 case X86::PTCVTROWPS2BF16Lrri:
37995 Opc = X86::TCVTROWPS2BF16Lrri;
37996 break;
37997 case X86::PTCVTROWPS2PHLrri:
37998 Opc = X86::TCVTROWPS2PHLrri;
37999 break;
38000 case X86::PTILEMOVROWrri:
38001 Opc = X86::TILEMOVROWrri;
38002 break;
38003 }
38004 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38005 MIB.add(MI.getOperand(0));
38006 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38007 MIB.addImm(MI.getOperand(2).getImm());
38008
38009 MI.eraseFromParent(); // The pseudo is gone now.
38010 return BB;
38011 }
38012 case X86::PTCVTROWPS2BF16Hrre:
38013 case X86::PTCVTROWPS2BF16Lrre:
38014 case X86::PTCVTROWPS2PHHrre:
38015 case X86::PTCVTROWPS2PHLrre:
38016 case X86::PTCVTROWD2PSrre:
38017 case X86::PTILEMOVROWrre: {
38018 const DebugLoc &DL = MI.getDebugLoc();
38019 unsigned Opc;
38020 switch (MI.getOpcode()) {
38021 default:
38022 llvm_unreachable("Unexpected instruction!");
38023 case X86::PTCVTROWD2PSrre:
38024 Opc = X86::TCVTROWD2PSrre;
38025 break;
38026 case X86::PTCVTROWPS2BF16Hrre:
38027 Opc = X86::TCVTROWPS2BF16Hrre;
38028 break;
38029 case X86::PTCVTROWPS2BF16Lrre:
38030 Opc = X86::TCVTROWPS2BF16Lrre;
38031 break;
38032 case X86::PTCVTROWPS2PHHrre:
38033 Opc = X86::TCVTROWPS2PHHrre;
38034 break;
38035 case X86::PTCVTROWPS2PHLrre:
38036 Opc = X86::TCVTROWPS2PHLrre;
38037 break;
38038 case X86::PTILEMOVROWrre:
38039 Opc = X86::TILEMOVROWrre;
38040 break;
38041 }
38042 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38043 MIB.add(MI.getOperand(0));
38044 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38045 MIB.add(MI.getOperand(2));
38046
38047 MI.eraseFromParent(); // The pseudo is gone now.
38048 return BB;
38049 }
38050 }
38051}
38052
38053//===----------------------------------------------------------------------===//
38054// X86 Optimization Hooks
38055//===----------------------------------------------------------------------===//
38056
// Try to shrink (or re-extend) the constant RHS operand of Op given which
// bits/elements are actually demanded by users.  Returning true *without*
// combining tells the generic DemandedBits code to keep the existing
// constant; TLO.CombineTo is used when a better constant is substituted.
// NOTE(review): this scraped listing is missing the line that carries the
// function name (upstream this is X86TargetLowering::targetShrinkDemandedConstant)
// and two wrapped continuation lines in the vector sign-extend combine below
// (original lines 38091 and 38093) -- confirm against the original file.
38057bool
38059 const APInt &DemandedBits,
38060 const APInt &DemandedElts,
38061 TargetLoweringOpt &TLO) const {
38062 EVT VT = Op.getValueType();
38063 unsigned Opcode = Op.getOpcode();
38064 unsigned EltSize = VT.getScalarSizeInBits();
38065
38066 if (VT.isVector()) {
38067 // If the constant is only all signbits in the active bits, then we should
38068 // extend it to the entire constant to allow it act as a boolean constant
38069 // vector.
38070 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38071 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38072 return false;
// Only demanded, defined lanes matter; a single lane that is all sign bits
// in the active range (but not already fully sign-extended) justifies the
// re-extension below.
38073 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38074 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38075 continue;
38076 const APInt &Val = V.getConstantOperandAPInt(i);
38077 if (Val.getBitWidth() > Val.getNumSignBits() &&
38078 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38079 return true;
38080 }
38081 return false;
38082 };
38083 // For vectors - if we have a constant, then try to sign extend.
38084 // TODO: Handle AND cases.
// EltSize > 1 excludes i1 element types; the combine only fires for legal
// types and for OR/XOR/X86ISD::ANDNP nodes whose RHS needs the extension.
38085 unsigned ActiveBits = DemandedBits.getActiveBits();
38086 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38087 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38088 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38089 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
// NOTE(review): the continuation of this getVectorVT call and the node
// construction for NewC (a SIGN_EXTEND_INREG per upstream) were dropped by
// the scrape -- verify against the original file.
38090 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38092 SDValue NewC =
38094 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38095 SDValue NewOp =
38096 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38097 return TLO.CombineTo(Op, NewOp);
38098 }
38099 return false;
38100 }
38101
38102 // Only optimize Ands to prevent shrinking a constant that could be
38103 // matched by movzx.
38104 if (Opcode != ISD::AND)
38105 return false;
38106
38107 // Make sure the RHS really is a constant.
38108 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38109 if (!C)
38110 return false;
38111
38112 const APInt &Mask = C->getAPIntValue();
38113
38114 // Clear all non-demanded bits initially.
38115 APInt ShrunkMask = Mask & DemandedBits;
38116
38117 // Find the width of the shrunk mask.
38118 unsigned Width = ShrunkMask.getActiveBits();
38119
38120 // If the mask is all 0s there's nothing to do here.
38121 if (Width == 0)
38122 return false;
38123
38124 // Find the next power of 2 width, rounding up to a byte.
// Rounding to >= 8 bits keeps masks that movzx-style zero extensions can
// match (8/16/32/64-bit widths).
38125 Width = llvm::bit_ceil(std::max(Width, 8U));
38126 // Truncate the width to size to handle illegal types.
38127 Width = std::min(Width, EltSize);
38128
38129 // Calculate a possible zero extend mask for this constant.
38130 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38131
38132 // If we aren't changing the mask, just return true to keep it and prevent
38133 // the caller from optimizing.
38134 if (ZeroExtendMask == Mask)
38135 return true;
38136
38137 // Make sure the new mask can be represented by a combination of mask bits
38138 // and non-demanded bits.
38139 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38140 return false;
38141
38142 // Replace the constant with the zero extend mask.
38143 SDLoc DL(Op);
38144 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38145 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38146 return TLO.CombineTo(Op, NewOp);
38147}
38148
// Compute known bits for a PSADBW-style node: the unsigned absolute
// differences (abdu) of the demanded i8 source elements are zero-extended to
// 16 bits and summed pairwise three times (covering 8 byte lanes), then the
// result is widened to the 64-bit output lane.  The doubling adds cannot
// wrap, hence NSW/NUW.
// NOTE(review): the scraped listing dropped the signature line naming this
// helper and declaring the LHS/RHS operands -- confirm against the original
// file.
38150 KnownBits &Known,
38151 const APInt &DemandedElts,
38152 const SelectionDAG &DAG, unsigned Depth) {
38153 KnownBits Known2;
38154 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
// Map the demanded output lanes onto the (narrower) byte source lanes.
38155 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38156 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38157 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38158 Known = KnownBits::abdu(Known, Known2).zext(16);
38159 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38160 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38161 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38162 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38163 Known = Known.zext(64);
38164}
38165
// Compute known bits for a PMADDWD-style node: each i32 output lane is
// LHS[2i]*RHS[2i] + LHS[2i+1]*RHS[2i+1], with the i16 inputs sign-extended
// to 32 bits.  The final add may wrap, so neither NSW nor NUW is claimed.
// NOTE(review): the line carrying this helper's name/signature is missing
// from the scraped listing -- confirm against the original file.
38167 KnownBits &Known,
38168 const APInt &DemandedElts,
38169 const SelectionDAG &DAG,
38170 unsigned Depth) {
38171 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38172
38173 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
// Split the scaled demand mask into even (lo) and odd (hi) source lanes so
// each half of every product pair is queried independently.
38174 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38175 APInt DemandedLoElts =
38176 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38177 APInt DemandedHiElts =
38178 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38179 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38180 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38181 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38182 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38183 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38184 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38185 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38186}
38187
// Compute known bits for a PMADDUBSW-style node: unsigned i8 elements of one
// operand are multiplied by signed i8 elements of the other (zext * sext to
// 16 bits), and the two adjacent products are combined with a signed
// saturating add.
// NOTE(review): the signature line naming this helper is missing from the
// scraped listing -- confirm against the original file.
38189 KnownBits &Known,
38190 const APInt &DemandedElts,
38191 const SelectionDAG &DAG,
38192 unsigned Depth) {
38193 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38194
38195 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38196 // pairs.
// As in the PMADDWD helper: split the scaled demand mask into even (lo) and
// odd (hi) source lanes for the two halves of each product pair.
38197 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38198 APInt DemandedLoElts =
38199 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38200 APInt DemandedHiElts =
38201 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38202 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38203 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38204 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38205 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
// LHS is treated as unsigned (zext), RHS as signed (sext).
38206 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38207 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38208 Known = KnownBits::sadd_sat(Lo, Hi);
38209}
38210
// Compute known bits for a horizontal binary operation by applying
// KnownBitsFunc to the known bits of each pair of adjacent source elements.
// When only one source operand contributes to the demanded lanes, the other
// operand is skipped entirely; otherwise the two per-operand results are
// intersected (only facts true for both are kept).
// NOTE(review): the first line of the signature (return type and helper
// name) is missing from this scraped listing -- confirm against the
// original file.
38212 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38213 const SelectionDAG &DAG,
38214 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38215 KnownBitsFunc) {
38216 APInt DemandedEltsLHS, DemandedEltsRHS;
// Split the demanded output lanes into the lanes sourced from operand 0
// vs. operand 1 of the horizontal op.
38217 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38218 DemandedElts, DemandedEltsLHS,
38219 DemandedEltsRHS);
38220
// Combine each demanded element with its pair neighbour (the << 1 shifted
// demand mask selects the adjacent lane of each pair).
38221 const auto ComputeForSingleOpFunc =
38222 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38223 return KnownBitsFunc(
38224 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38225 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38226 };
38227
38228 if (DemandedEltsRHS.isZero())
38229 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38230 if (DemandedEltsLHS.isZero())
38231 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38232
38233 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38234 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38235}
38236
38238 KnownBits &Known,
38239 const APInt &DemandedElts,
38240 const SelectionDAG &DAG,
38241 unsigned Depth) const {
38242 unsigned BitWidth = Known.getBitWidth();
38243 unsigned NumElts = DemandedElts.getBitWidth();
38244 unsigned Opc = Op.getOpcode();
38245 EVT VT = Op.getValueType();
38246 assert((Opc >= ISD::BUILTIN_OP_END ||
38247 Opc == ISD::INTRINSIC_WO_CHAIN ||
38248 Opc == ISD::INTRINSIC_W_CHAIN ||
38249 Opc == ISD::INTRINSIC_VOID) &&
38250 "Should use MaskedValueIsZero if you don't know whether Op"
38251 " is a target node!");
38252
38253 Known.resetAll();
38254 switch (Opc) {
38255 default: break;
38256 case X86ISD::MUL_IMM: {
38257 KnownBits Known2;
38258 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38259 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38260 Known = KnownBits::mul(Known, Known2);
38261 break;
38262 }
38263 case X86ISD::BSF: {
38265
38266 KnownBits Known2;
38267 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38268 if (Known2.isNonZero()) {
38269 // If we have a known 1, its position is our upper bound.
38270 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38271 unsigned LowBits = llvm::bit_width(PossibleTZ);
38272 Known.Zero.setBitsFrom(LowBits);
38273 } else if (!Op.getOperand(0).isUndef()) {
38274 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38275 Known = Known.intersectWith(Known2);
38276 }
38277 break;
38278 }
38279 case X86ISD::BSR: {
38280 // TODO: Bound with input known bits?
38282
38283 if (!Op.getOperand(0).isUndef() &&
38284 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38285 KnownBits Known2;
38286 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38287 Known = Known.intersectWith(Known2);
38288 }
38289 break;
38290 }
38291 case X86ISD::SETCC:
38292 Known.Zero.setBitsFrom(1);
38293 break;
38294 case X86ISD::MOVMSK: {
38295 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38296 Known.Zero.setBitsFrom(NumLoBits);
38297 break;
38298 }
38299 case X86ISD::PEXTRB:
38300 case X86ISD::PEXTRW: {
38301 SDValue Src = Op.getOperand(0);
38302 EVT SrcVT = Src.getValueType();
38303 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38304 Op.getConstantOperandVal(1));
38305 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38306 Known = Known.anyextOrTrunc(BitWidth);
38307 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38308 break;
38309 }
38310 case X86ISD::VSRAI:
38311 case X86ISD::VSHLI:
38312 case X86ISD::VSRLI: {
38313 unsigned ShAmt = Op.getConstantOperandVal(1);
38314 if (ShAmt >= VT.getScalarSizeInBits()) {
38315 // Out of range logical bit shifts are guaranteed to be zero.
38316 // Out of range arithmetic bit shifts splat the sign bit.
38317 if (Opc != X86ISD::VSRAI) {
38318 Known.setAllZero();
38319 break;
38320 }
38321
38322 ShAmt = VT.getScalarSizeInBits() - 1;
38323 }
38324
38325 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38326 if (Opc == X86ISD::VSHLI) {
38327 Known.Zero <<= ShAmt;
38328 Known.One <<= ShAmt;
38329 // Low bits are known zero.
38330 Known.Zero.setLowBits(ShAmt);
38331 } else if (Opc == X86ISD::VSRLI) {
38332 Known.Zero.lshrInPlace(ShAmt);
38333 Known.One.lshrInPlace(ShAmt);
38334 // High bits are known zero.
38335 Known.Zero.setHighBits(ShAmt);
38336 } else {
38337 Known.Zero.ashrInPlace(ShAmt);
38338 Known.One.ashrInPlace(ShAmt);
38339 }
38340 break;
38341 }
38342 case X86ISD::PACKUS: {
38343 // PACKUS is just a truncation if the upper half is zero.
38344 APInt DemandedLHS, DemandedRHS;
38345 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38346
38347 Known.One = APInt::getAllOnes(BitWidth * 2);
38348 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38349
38350 KnownBits Known2;
38351 if (!!DemandedLHS) {
38352 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38353 Known = Known.intersectWith(Known2);
38354 }
38355 if (!!DemandedRHS) {
38356 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38357 Known = Known.intersectWith(Known2);
38358 }
38359
38360 if (Known.countMinLeadingZeros() < BitWidth)
38361 Known.resetAll();
38362 Known = Known.trunc(BitWidth);
38363 break;
38364 }
38365 case X86ISD::PSHUFB: {
38366 SDValue Src = Op.getOperand(0);
38367 SDValue Idx = Op.getOperand(1);
38368
38369 // If the index vector is never negative (MSB is zero), then all elements
38370 // come from the source vector. This is useful for cases where
38371 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38372 // below will handle the more common constant shuffle mask case.
38373 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38374 if (KnownIdx.isNonNegative())
38375 Known = DAG.computeKnownBits(Src, Depth + 1);
38376 break;
38377 }
38378 case X86ISD::VBROADCAST: {
38379 SDValue Src = Op.getOperand(0);
38380 if (!Src.getSimpleValueType().isVector()) {
38381 Known = DAG.computeKnownBits(Src, Depth + 1);
38382 return;
38383 }
38384 break;
38385 }
38386 case X86ISD::AND: {
38387 if (Op.getResNo() == 0) {
38388 KnownBits Known2;
38389 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38390 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38391 Known &= Known2;
38392 }
38393 break;
38394 }
38395 case X86ISD::ANDNP: {
38396 KnownBits Known2;
38397 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38398 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38399
38400 // ANDNP = (~X & Y);
38401 Known.One &= Known2.Zero;
38402 Known.Zero |= Known2.One;
38403 break;
38404 }
38405 case X86ISD::FOR: {
38406 KnownBits Known2;
38407 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38408 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38409
38410 Known |= Known2;
38411 break;
38412 }
38413 case X86ISD::PSADBW: {
38414 SDValue LHS = Op.getOperand(0);
38415 SDValue RHS = Op.getOperand(1);
38416 assert(VT.getScalarType() == MVT::i64 &&
38417 LHS.getValueType() == RHS.getValueType() &&
38418 LHS.getValueType().getScalarType() == MVT::i8 &&
38419 "Unexpected PSADBW types");
38420 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38421 break;
38422 }
38423 case X86ISD::PCMPGT:
38424 case X86ISD::PCMPEQ: {
38425 KnownBits KnownLhs =
38426 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38427 KnownBits KnownRhs =
38428 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38429 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38430 ? KnownBits::eq(KnownLhs, KnownRhs)
38431 : KnownBits::sgt(KnownLhs, KnownRhs);
38432 if (Res) {
38433 if (*Res)
38434 Known.setAllOnes();
38435 else
38436 Known.setAllZero();
38437 }
38438 break;
38439 }
38440 case X86ISD::VPMADDWD: {
38441 SDValue LHS = Op.getOperand(0);
38442 SDValue RHS = Op.getOperand(1);
38443 assert(VT.getVectorElementType() == MVT::i32 &&
38444 LHS.getValueType() == RHS.getValueType() &&
38445 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38446 "Unexpected PMADDWD types");
38447 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38448 break;
38449 }
38450 case X86ISD::VPMADDUBSW: {
38451 SDValue LHS = Op.getOperand(0);
38452 SDValue RHS = Op.getOperand(1);
38453 assert(VT.getVectorElementType() == MVT::i16 &&
38454 LHS.getValueType() == RHS.getValueType() &&
38455 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38456 "Unexpected PMADDUBSW types");
38457 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38458 break;
38459 }
38460 case X86ISD::PMULUDQ: {
38461 KnownBits Known2;
38462 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38463 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38464
38465 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38466 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38467 Known = KnownBits::mul(Known, Known2);
38468 break;
38469 }
38470 case X86ISD::CMOV: {
38471 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38472 // If we don't know any bits, early out.
38473 if (Known.isUnknown())
38474 break;
38475 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38476
38477 // Only known if known in both the LHS and RHS.
38478 Known = Known.intersectWith(Known2);
38479 break;
38480 }
38481 case X86ISD::BEXTR:
38482 case X86ISD::BEXTRI: {
38483 SDValue Op0 = Op.getOperand(0);
38484 SDValue Op1 = Op.getOperand(1);
38485
38486 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38487 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38488 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38489
38490 // If the length is 0, the result is 0.
38491 if (Length == 0) {
38492 Known.setAllZero();
38493 break;
38494 }
38495
38496 if ((Shift + Length) <= BitWidth) {
38497 Known = DAG.computeKnownBits(Op0, Depth + 1);
38498 Known = Known.extractBits(Length, Shift);
38499 Known = Known.zextOrTrunc(BitWidth);
38500 }
38501 }
38502 break;
38503 }
38504 case X86ISD::PDEP: {
38505 KnownBits Known2;
38506 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38507 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38508 // Zeros are retained from the mask operand. But not ones.
38509 Known.One.clearAllBits();
38510 // The result will have at least as many trailing zeros as the non-mask
38511 // operand since bits can only map to the same or higher bit position.
38512 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38513 break;
38514 }
38515 case X86ISD::PEXT: {
38516 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38517 // The result has as many leading zeros as the number of zeroes in the mask.
38518 unsigned Count = Known.Zero.popcount();
38519 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38520 Known.One.clearAllBits();
38521 break;
38522 }
38523 case X86ISD::VTRUNC:
38524 case X86ISD::VTRUNCS:
38525 case X86ISD::VTRUNCUS:
38526 case X86ISD::CVTSI2P:
38527 case X86ISD::CVTUI2P:
38528 case X86ISD::CVTP2SI:
38529 case X86ISD::CVTP2UI:
38530 case X86ISD::MCVTP2SI:
38531 case X86ISD::MCVTP2UI:
38532 case X86ISD::CVTTP2SI:
38533 case X86ISD::CVTTP2UI:
38534 case X86ISD::MCVTTP2SI:
38535 case X86ISD::MCVTTP2UI:
38536 case X86ISD::MCVTSI2P:
38537 case X86ISD::MCVTUI2P:
38538 case X86ISD::VFPROUND:
38539 case X86ISD::VMFPROUND:
38540 case X86ISD::CVTPS2PH:
38541 case X86ISD::MCVTPS2PH:
38542 case X86ISD::MCVTTP2SIS:
38543 case X86ISD::MCVTTP2UIS: {
38544 // Truncations/Conversions - upper elements are known zero.
38545 EVT SrcVT = Op.getOperand(0).getValueType();
38546 if (SrcVT.isVector()) {
38547 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38548 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38549 Known.setAllZero();
38550 }
38551 break;
38552 }
38559 // Strict Conversions - upper elements are known zero.
38560 EVT SrcVT = Op.getOperand(1).getValueType();
38561 if (SrcVT.isVector()) {
38562 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38563 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38564 Known.setAllZero();
38565 }
38566 break;
38567 }
38568 case X86ISD::MOVQ2DQ: {
38569 // Move from MMX to XMM. Upper half of XMM should be 0.
38570 if (DemandedElts.countr_zero() >= (NumElts / 2))
38571 Known.setAllZero();
38572 break;
38573 }
38575 APInt UndefElts;
38576 SmallVector<APInt, 16> EltBits;
38577 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38578 /*AllowWholeUndefs*/ false,
38579 /*AllowPartialUndefs*/ false)) {
38580 Known.Zero.setAllBits();
38581 Known.One.setAllBits();
38582 for (unsigned I = 0; I != NumElts; ++I) {
38583 if (!DemandedElts[I])
38584 continue;
38585 if (UndefElts[I]) {
38586 Known.resetAll();
38587 break;
38588 }
38589 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38590 Known = Known.intersectWith(Known2);
38591 }
38592 return;
38593 }
38594 break;
38595 }
38596 case X86ISD::HADD:
38597 case X86ISD::HSUB: {
38599 Op, DemandedElts, Depth, DAG,
38600 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38602 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38603 KnownLHS, KnownRHS);
38604 });
38605 break;
38606 }
38608 switch (Op->getConstantOperandVal(0)) {
38609 case Intrinsic::x86_sse2_pmadd_wd:
38610 case Intrinsic::x86_avx2_pmadd_wd:
38611 case Intrinsic::x86_avx512_pmaddw_d_512: {
38612 SDValue LHS = Op.getOperand(1);
38613 SDValue RHS = Op.getOperand(2);
38614 assert(VT.getScalarType() == MVT::i32 &&
38615 LHS.getValueType() == RHS.getValueType() &&
38616 LHS.getValueType().getScalarType() == MVT::i16 &&
38617 "Unexpected PMADDWD types");
38618 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38619 break;
38620 }
38621 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38622 case Intrinsic::x86_avx2_pmadd_ub_sw:
38623 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38624 SDValue LHS = Op.getOperand(1);
38625 SDValue RHS = Op.getOperand(2);
38626 assert(VT.getScalarType() == MVT::i16 &&
38627 LHS.getValueType() == RHS.getValueType() &&
38628 LHS.getValueType().getScalarType() == MVT::i8 &&
38629 "Unexpected PMADDUBSW types");
38630 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38631 break;
38632 }
38633 case Intrinsic::x86_sse2_psad_bw:
38634 case Intrinsic::x86_avx2_psad_bw:
38635 case Intrinsic::x86_avx512_psad_bw_512: {
38636 SDValue LHS = Op.getOperand(1);
38637 SDValue RHS = Op.getOperand(2);
38638 assert(VT.getScalarType() == MVT::i64 &&
38639 LHS.getValueType() == RHS.getValueType() &&
38640 LHS.getValueType().getScalarType() == MVT::i8 &&
38641 "Unexpected PSADBW types");
38642 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38643 break;
38644 }
38645 }
38646 break;
38647 }
38648 }
38649
38650 // Handle target shuffles.
38651 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38652 if (isTargetShuffle(Opc)) {
38655 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
38656 unsigned NumOps = Ops.size();
38657 unsigned NumElts = VT.getVectorNumElements();
38658 if (Mask.size() == NumElts) {
38659 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38660 Known.Zero.setAllBits(); Known.One.setAllBits();
38661 for (unsigned i = 0; i != NumElts; ++i) {
38662 if (!DemandedElts[i])
38663 continue;
38664 int M = Mask[i];
38665 if (M == SM_SentinelUndef) {
38666 // For UNDEF elements, we don't know anything about the common state
38667 // of the shuffle result.
38668 Known.resetAll();
38669 break;
38670 }
38671 if (M == SM_SentinelZero) {
38672 Known.One.clearAllBits();
38673 continue;
38674 }
38675 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38676 "Shuffle index out of range");
38677
38678 unsigned OpIdx = (unsigned)M / NumElts;
38679 unsigned EltIdx = (unsigned)M % NumElts;
38680 if (Ops[OpIdx].getValueType() != VT) {
38681 // TODO - handle target shuffle ops with different value types.
38682 Known.resetAll();
38683 break;
38684 }
38685 DemandedOps[OpIdx].setBit(EltIdx);
38686 }
38687 // Known bits are the values that are shared by every demanded element.
38688 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38689 if (!DemandedOps[i])
38690 continue;
38691 KnownBits Known2 =
38692 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38693 Known = Known.intersectWith(Known2);
38694 }
38695 }
38696 }
38697 }
38698}
38699
// Computes a conservative lower bound on the number of sign bits of the
// result of an X86-specific DAG node, for opcodes the generic
// SelectionDAG::ComputeNumSignBits cannot reason about.
// NOTE(review): this is a numbered doxygen listing; the defining line
// (original line 38700, carrying the function name) is absent from the dump —
// presumably X86TargetLowering::ComputeNumSignBitsForTargetNode, TODO confirm.
// Several other hyperlinked lines (e.g. some case labels) are also absent;
// code below is therefore kept byte-identical.
38701 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38702 unsigned Depth) const {
38703 EVT VT = Op.getValueType();
38704 unsigned VTBits = VT.getScalarSizeInBits();
38705 unsigned Opcode = Op.getOpcode();
38706 switch (Opcode) {
// NOTE(review): the case label for this arm (original line 38707) is missing
// from the dump; the comment below suggests it is X86ISD::SETCC_CARRY.
38708 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38709 return VTBits;
38710
38711 case X86ISD::VTRUNC: {
// Truncation keeps (sign bits of source) - (bits dropped) sign bits,
// clamped to at least 1.
38712 SDValue Src = Op.getOperand(0);
38713 MVT SrcVT = Src.getSimpleValueType();
38714 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38715 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38716 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38717 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38718 if (Tmp > (NumSrcBits - VTBits))
38719 return Tmp - (NumSrcBits - VTBits);
38720 return 1;
38721 }
38722
38723 case X86ISD::PACKSS: {
38724 // PACKSS is just a truncation if the sign bits extend to the packed size.
38725 APInt DemandedLHS, DemandedRHS;
38726 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38727 DemandedRHS);
38728
38729 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
38730 // patterns often used to compact vXi64 allsignbit patterns.
38731 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
// NOTE(review): lines 38732 and 38736-38737 are absent from this dump;
// they presumably peek through bitcasts to define BC, BC0 and BC1 —
// TODO confirm against upstream X86ISelLowering.cpp.
38733 if (BC.getOpcode() == X86ISD::PACKSS &&
38734 BC.getScalarValueSizeInBits() == 16 &&
38735 V.getScalarValueSizeInBits() == 32) {
38738 if (BC0.getScalarValueSizeInBits() == 64 &&
38739 BC1.getScalarValueSizeInBits() == 64 &&
38740 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
38741 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
38742 return 32;
38743 }
38744 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
38745 };
38746
38747 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38748 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38749 if (!!DemandedLHS)
38750 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
38751 if (!!DemandedRHS)
38752 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
// The pack result is only as good as the weaker of the two inputs.
38753 unsigned Tmp = std::min(Tmp0, Tmp1);
38754 if (Tmp > (SrcBits - VTBits))
38755 return Tmp - (SrcBits - VTBits);
38756 return 1;
38757 }
38758
38759 case X86ISD::VBROADCAST: {
// A scalar-sourced broadcast replicates the scalar, so its sign-bit count
// carries over to every lane.
38760 SDValue Src = Op.getOperand(0);
38761 if (!Src.getSimpleValueType().isVector())
38762 return DAG.ComputeNumSignBits(Src, Depth + 1);
38763 break;
38764 }
38765
38766 case X86ISD::VSHLI: {
38767 SDValue Src = Op.getOperand(0);
38768 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38769 if (ShiftVal.uge(VTBits))
38770 return VTBits; // Shifted all bits out --> zero.
38771 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38772 if (ShiftVal.uge(Tmp))
38773 return 1; // Shifted all sign bits out --> unknown.
38774 return Tmp - ShiftVal.getZExtValue();
38775 }
38776
38777 case X86ISD::VSRAI: {
38778 SDValue Src = Op.getOperand(0);
38779 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38780 if (ShiftVal.uge(VTBits - 1))
38781 return VTBits; // Sign splat.
// Arithmetic right shift adds ShiftVal copies of the sign bit.
38782 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38783 ShiftVal += Tmp;
38784 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38785 }
38786
38787 case X86ISD::FSETCC:
38788 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38789 if (VT == MVT::f32 || VT == MVT::f64 ||
38790 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38791 return VTBits;
38792 break;
38793
38794 case X86ISD::PCMPGT:
38795 case X86ISD::PCMPEQ:
38796 case X86ISD::CMPP:
38797 case X86ISD::VPCOM:
38798 case X86ISD::VPCOMU:
38799 // Vector compares return zero/all-bits result values.
38800 return VTBits;
38801
38802 case X86ISD::ANDNP: {
38803 unsigned Tmp0 =
38804 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38805 if (Tmp0 == 1) return 1; // Early out.
38806 unsigned Tmp1 =
38807 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38808 return std::min(Tmp0, Tmp1);
38809 }
38810
38811 case X86ISD::CMOV: {
// Result may be either operand, so only the common sign bits are known.
38812 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38813 if (Tmp0 == 1) return 1; // Early out.
38814 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38815 return std::min(Tmp0, Tmp1);
38816 }
38817 }
38818
38819 // Handle target shuffles.
38820 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38821 if (isTargetShuffle(Opcode)) {
// NOTE(review): lines 38822-38823 are absent from this dump; presumably
// they declare the Ops/Mask SmallVectors consumed below — TODO confirm.
38824 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
38825 unsigned NumOps = Ops.size();
38826 unsigned NumElts = VT.getVectorNumElements();
38827 if (Mask.size() == NumElts) {
// Accumulate, per shuffle input, the set of elements actually used.
38828 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38829 for (unsigned i = 0; i != NumElts; ++i) {
38830 if (!DemandedElts[i])
38831 continue;
38832 int M = Mask[i];
38833 if (M == SM_SentinelUndef) {
38834 // For UNDEF elements, we don't know anything about the common state
38835 // of the shuffle result.
38836 return 1;
38837 } else if (M == SM_SentinelZero) {
38838 // Zero = all sign bits.
38839 continue;
38840 }
38841 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38842 "Shuffle index out of range");
38843
38844 unsigned OpIdx = (unsigned)M / NumElts;
38845 unsigned EltIdx = (unsigned)M % NumElts;
38846 if (Ops[OpIdx].getValueType() != VT) {
38847 // TODO - handle target shuffle ops with different value types.
38848 return 1;
38849 }
38850 DemandedOps[OpIdx].setBit(EltIdx);
38851 }
// The answer is the minimum over all demanded inputs.
38852 unsigned Tmp0 = VTBits;
38853 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38854 if (!DemandedOps[i])
38855 continue;
38856 unsigned Tmp1 =
38857 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38858 Tmp0 = std::min(Tmp0, Tmp1);
38859 }
38860 return Tmp0;
38861 }
38862 }
38863 }
38864
38865 // Fallback case.
38866 return 1;
38867}
38868
// Strips an X86 address wrapper: if N is an X86ISD::Wrapper or
// X86ISD::WrapperRIP node, returns the wrapped operand, otherwise returns N
// unchanged.
// NOTE(review): the defining line (original line 38869, carrying the helper's
// name and signature) is absent from this doxygen dump; in upstream LLVM this
// helper is `static SDValue unwrapAddress(SDValue N)` — TODO confirm.
38870 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38871 return N->getOperand(0);
38872 return N;
38873 }
38874
38875// Helper to look for a normal load that can be narrowed into a vzload with the
38876// specified VT and memory VT. Returns SDValue() on failure.
// NOTE(review): the first signature line (original line 38877) is absent from
// this doxygen dump; in upstream LLVM the signature is
// `static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,` —
// TODO confirm.
38878 SelectionDAG &DAG) {
38879 // Can't if the load is volatile or atomic.
38880 if (!LN->isSimple())
38881 return SDValue();
38882
// Build a VZEXT_LOAD memory-intrinsic node reusing the original load's
// chain, base pointer, alignment and memory-operand flags.
38883 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38884 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38885 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38886 LN->getPointerInfo(), LN->getOriginalAlign(),
38887 LN->getMemOperand()->getFlags());
38888}
38889
38890// Attempt to match a combined shuffle mask against supported unary shuffle
38891// instructions.
38892// TODO: Investigate sharing more of this with shuffle lowering.
// On success, sets Shuffle to the matched opcode and SrcVT/DstVT to the types
// the instruction should operate on, and returns true; returns false if no
// unary shuffle matches.
// NOTE(review): this is a numbered doxygen listing with a few interior lines
// absent (e.g. 38905, 39025, 39032); code is kept byte-identical.
38893static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38894 bool AllowFloatDomain, bool AllowIntDomain,
38895 SDValue V1, const SelectionDAG &DAG,
38896 const X86Subtarget &Subtarget, unsigned &Shuffle,
38897 MVT &SrcVT, MVT &DstVT) {
38898 unsigned NumMaskElts = Mask.size();
38899 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38900
38901 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38902 if (Mask[0] == 0 &&
38903 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
// NOTE(review): line 38905 (the middle of this condition) is absent from the
// dump — presumably the alternative arm of the ||; TODO confirm upstream.
38904 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38906 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38907 Shuffle = X86ISD::VZEXT_MOVL;
38908 if (MaskEltSize == 16)
38909 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38910 else
38911 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38912 return true;
38913 }
38914 }
38915
38916 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
38917 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38918 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38919 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38920 unsigned MaxScale = 64 / MaskEltSize;
// UseSign: the source's sign bits already fill each mask element, so a
// sign-extension match is possible.
38921 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
38922 DAG.ComputeNumSignBits(V1) == MaskEltSize;
38923 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38924 bool MatchAny = true;
38925 bool MatchZero = true;
38926 bool MatchSign = UseSign;
38927 unsigned NumDstElts = NumMaskElts / Scale;
// Each destination element i must read source element i, followed by
// Scale-1 elements that are undef (aext), zero (zext) or repeats (sext).
38928 for (unsigned i = 0;
38929 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
38930 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38931 MatchAny = MatchSign = MatchZero = false;
38932 break;
38933 }
38934 unsigned Pos = (i * Scale) + 1;
38935 unsigned Len = Scale - 1;
38936 MatchAny &= isUndefInRange(Mask, Pos, Len);
38937 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
38938 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
38939 }
38940 if (MatchAny || MatchSign || MatchZero) {
38941 assert((MatchSign || MatchZero) &&
38942 "Failed to match sext/zext but matched aext?");
38943 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38944 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
38945 : MVT::getIntegerVT(MaskEltSize);
38946 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38947
38948 Shuffle = unsigned(
38949 MatchAny ? ISD::ANY_EXTEND
38950 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
38951 if (SrcVT.getVectorNumElements() != NumDstElts)
38952 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38953
38954 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38955 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38956 return true;
38957 }
38958 }
38959 }
38960
38961 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
38962 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38963 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38964 isUndefOrEqual(Mask[0], 0) &&
38965 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38966 Shuffle = X86ISD::VZEXT_MOVL;
38967 if (MaskEltSize == 16)
38968 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38969 else
38970 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38971 return true;
38972 }
38973
38974 // Check if we have SSE3 which will let us use MOVDDUP etc. The
38975 // instructions are no slower than UNPCKLPD but has the option to
38976 // fold the input operand into even an unaligned memory load.
38977 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
38978 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
38979 Shuffle = X86ISD::MOVDDUP;
38980 SrcVT = DstVT = MVT::v2f64;
38981 return true;
38982 }
38983 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38984 Shuffle = X86ISD::MOVSLDUP;
38985 SrcVT = DstVT = MVT::v4f32;
38986 return true;
38987 }
38988 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
38989 Shuffle = X86ISD::MOVSHDUP;
38990 SrcVT = DstVT = MVT::v4f32;
38991 return true;
38992 }
38993 }
38994
38995 if (MaskVT.is256BitVector() && AllowFloatDomain) {
38996 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38997 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38998 Shuffle = X86ISD::MOVDDUP;
38999 SrcVT = DstVT = MVT::v4f64;
39000 return true;
39001 }
39002 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39003 V1)) {
39004 Shuffle = X86ISD::MOVSLDUP;
39005 SrcVT = DstVT = MVT::v8f32;
39006 return true;
39007 }
39008 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39009 V1)) {
39010 Shuffle = X86ISD::MOVSHDUP;
39011 SrcVT = DstVT = MVT::v8f32;
39012 return true;
39013 }
39014 }
39015
39016 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39017 assert(Subtarget.hasAVX512() &&
39018 "AVX512 required for 512-bit vector shuffles");
39019 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39020 V1)) {
39021 Shuffle = X86ISD::MOVDDUP;
39022 SrcVT = DstVT = MVT::v8f64;
39023 return true;
39024 }
// NOTE(review): lines 39025 and 39032 (the `if (isTargetShuffleEquivalent(`
// openers of the next two checks) are absent from this dump — TODO confirm.
39026 MaskVT, Mask,
39027 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39028 Shuffle = X86ISD::MOVSLDUP;
39029 SrcVT = DstVT = MVT::v16f32;
39030 return true;
39031 }
39033 MaskVT, Mask,
39034 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39035 Shuffle = X86ISD::MOVSHDUP;
39036 SrcVT = DstVT = MVT::v16f32;
39037 return true;
39038 }
39039 }
39040
39041 return false;
39042}
39043
39044// Attempt to match a combined shuffle mask against supported unary immediate
39045// permute instructions.
39046// TODO: Investigate sharing more of this with shuffle lowering.
// On success, sets Shuffle to the matched opcode, ShuffleVT to the type to
// operate on and PermuteImm to the instruction's immediate, returning true.
// NOTE(review): the first signature line (original line 39047, carrying the
// function name — presumably matchUnaryPermuteShuffle and its MVT/ArrayRef
// leading parameters) is absent from this doxygen dump; code is kept
// byte-identical.
39048 const APInt &Zeroable,
39049 bool AllowFloatDomain, bool AllowIntDomain,
39050 const SelectionDAG &DAG,
39051 const X86Subtarget &Subtarget,
39052 unsigned &Shuffle, MVT &ShuffleVT,
39053 unsigned &PermuteImm) {
39054 unsigned NumMaskElts = Mask.size();
39055 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39056 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39057 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39058 bool ContainsZeros = isAnyZero(Mask);
39059
39060 // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
39061 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39062 // Check for lane crossing permutes.
39063 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39064 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39065 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39066 Shuffle = X86ISD::VPERMI;
39067 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39068 PermuteImm = getV4X86ShuffleImm(Mask);
39069 return true;
39070 }
39071 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39072 SmallVector<int, 4> RepeatedMask;
39073 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39074 Shuffle = X86ISD::VPERMI;
39075 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39076 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39077 return true;
39078 }
39079 }
39080 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39081 // VPERMILPD can permute with a non-repeating shuffle.
39082 Shuffle = X86ISD::VPERMILPI;
39083 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39084 PermuteImm = 0;
// One immediate bit per element selects the low/high f64 within its lane.
39085 for (int i = 0, e = Mask.size(); i != e; ++i) {
39086 int M = Mask[i];
39087 if (M == SM_SentinelUndef)
39088 continue;
39089 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39090 PermuteImm |= (M & 1) << i;
39091 }
39092 return true;
39093 }
39094 }
39095
39096 // We are checking for shuffle match or shift match. Loop twice so we can
39097 // order which we try and match first depending on target preference.
39098 for (unsigned Order = 0; Order < 2; ++Order) {
39099 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39100 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39101 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39102 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39103 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39104 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39105 SmallVector<int, 4> RepeatedMask;
39106 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39107 // Narrow the repeated mask to create 32-bit element permutes.
39108 SmallVector<int, 4> WordMask = RepeatedMask;
39109 if (MaskScalarSizeInBits == 64)
39110 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39111
39112 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39113 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39114 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39115 PermuteImm = getV4X86ShuffleImm(WordMask);
39116 return true;
39117 }
39118 }
39119
39120 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39121 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39122 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39123 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39124 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39125 SmallVector<int, 4> RepeatedMask;
39126 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39127 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39128 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39129
39130 // PSHUFLW: permute lower 4 elements only.
39131 if (isUndefOrInRange(LoMask, 0, 4) &&
39132 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39133 Shuffle = X86ISD::PSHUFLW;
39134 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39135 PermuteImm = getV4X86ShuffleImm(LoMask);
39136 return true;
39137 }
39138
39139 // PSHUFHW: permute upper 4 elements only.
39140 if (isUndefOrInRange(HiMask, 4, 8) &&
39141 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39142 // Offset the HiMask so that we can create the shuffle immediate.
39143 int OffsetHiMask[4];
39144 for (int i = 0; i != 4; ++i)
39145 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39146
39147 Shuffle = X86ISD::PSHUFHW;
39148 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39149 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39150 return true;
39151 }
39152 }
39153 }
39154 } else {
39155 // Attempt to match against bit rotates.
39156 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39157 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39158 Subtarget.hasAVX512())) {
39159 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39160 Subtarget, Mask);
39161 if (0 < RotateAmt) {
39162 Shuffle = X86ISD::VROTLI;
39163 PermuteImm = (unsigned)RotateAmt;
39164 return true;
39165 }
39166 }
39167 }
39168 // Attempt to match against byte/bit shifts.
39169 if (AllowIntDomain &&
39170 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39171 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39172 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39173 int ShiftAmt =
39174 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39175 Zeroable, Subtarget);
39176 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39177 32 <= ShuffleVT.getScalarSizeInBits())) {
39178 // Byte shifts can be slower so only match them on second attempt.
39179 if (Order == 0 &&
39180 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39181 continue;
39182
39183 PermuteImm = (unsigned)ShiftAmt;
39184 return true;
39185 }
39186
39187 }
39188 }
39189
39190 return false;
39191}
39192
// Attempt to match a combined shuffle mask against supported binary
// shuffle instructions.
// On success sets Shuffle to the matched X86ISD/ISD opcode and SrcVT/DstVT
// to the operand/result types, and may rewrite V1/V2 in place (swapping
// operands or substituting undef) to fit the matched instruction form.
// Returns false if no binary pattern matched.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
                               bool AllowFloatDomain, bool AllowIntDomain,
                               SDValue &V1, SDValue &V2, const SDLoc &DL,
                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
                               unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
                               bool IsUnary) {
  unsigned NumMaskElts = Mask.size();
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
  unsigned SizeInBits = MaskVT.getSizeInBits();

  if (MaskVT.is128BitVector()) {
    // {0,0}: duplicate the low 64-bit half - UNPCKL (SSE2) / MOVLHPS (SSE1).
    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
        AllowFloatDomain) {
      V2 = V1;
      // If element 0 was undef, the first operand can be undef too.
      V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
      return true;
    }
    // {1,1}: duplicate the high 64-bit half - UNPCKH (SSE2) / MOVHLPS (SSE1).
    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
        AllowFloatDomain) {
      V2 = V1;
      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
      return true;
    }
    // {0,3}: MOVSD merges the low element of its second operand, so swap
    // the operands to put the merged element in the right place.
    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
        Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
    // {4,1,2,3}: MOVSS - replace the low f32 element with V2's low element.
    if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    // {8,1..7}: MOVSH (FP16) - replace the low f16 element with V2's.
    if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
                                  DAG) &&
        Subtarget.hasFP16()) {
      Shuffle = X86ISD::MOVSH;
      SrcVT = DstVT = MVT::v8f16;
      return true;
    }
  }

  // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
  if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
      ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
      ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
    if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
                             Subtarget)) {
      DstVT = MaskVT;
      return true;
    }
  }
  // TODO: Can we handle this inside matchShuffleWithPACK?
  // v4i32 {0,2,4,6} of two v2i64 sources is a truncating pack if the upper
  // bits of every 64-bit element are known to be redundant.
  if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
      isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
      V1.getScalarValueSizeInBits() == 64 &&
      V2.getScalarValueSizeInBits() == 64) {
    // Use (SSE41) PACKUSWD if the leading zerobits goto the lowest 16-bits.
    unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
    unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
    if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
      SrcVT = MVT::v4i32;
      DstVT = MVT::v8i16;
      Shuffle = X86ISD::PACKUS;
      return true;
    }
    // Use PACKUSBW if the leading zerobits goto the lowest 8-bits.
    if (MinLZV1 >= 56 && MinLZV2 >= 56) {
      SrcVT = MVT::v8i16;
      DstVT = MVT::v16i8;
      Shuffle = X86ISD::PACKUS;
      return true;
    }
    // Use PACKSSWD if the signbits extend to the lowest 16-bits.
    if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
      SrcVT = MVT::v4i32;
      DstVT = MVT::v8i16;
      Shuffle = X86ISD::PACKSS;
      return true;
    }
  }

  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
       (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
    if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
                              Subtarget)) {
      SrcVT = DstVT = MaskVT;
      // AVX1 only has 256-bit FP unpacks, so force the FP type there.
      if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
        SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
      return true;
    }
  }

  // Attempt to match against a OR if we're performing a blend shuffle and the
  // non-blended source element is zero in each case.
  // TODO: Handle cases where V1/V2 sizes doesn't match SizeInBits.
  if (SizeInBits == V1.getValueSizeInBits() &&
      SizeInBits == V2.getValueSizeInBits() &&
      (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
      (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
    bool IsBlend = true;
    unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
    unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
    // Scale factors from mask-element granularity to each operand's own
    // (possibly narrower) element granularity.
    unsigned Scale1 = NumV1Elts / NumMaskElts;
    unsigned Scale2 = NumV2Elts / NumMaskElts;
    // Operand elements that must be zero for OR to act as this blend.
    APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
    APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
    for (unsigned i = 0; i != NumMaskElts; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef)
        continue;
      if (M == SM_SentinelZero) {
        // Result lane is zero: both sources must be zero here.
        DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
        DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
        continue;
      }
      if (M == (int)i) {
        // Lane taken from V1: the V2 side must be zero here.
        DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
        continue;
      }
      if (M == (int)(i + NumMaskElts)) {
        // Lane taken from V2: the V1 side must be zero here.
        DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
        continue;
      }
      // Any other index means this isn't an in-place blend pattern.
      IsBlend = false;
      break;
    }
    if (IsBlend) {
      if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
          DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
        Shuffle = ISD::OR;
        SrcVT = DstVT = MaskVT.changeTypeToInteger();
        return true;
      }
      if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
        // FIXME: handle mismatched sizes?
        // TODO: investigate if `ISD::OR` handling in
        // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
        // Per-element summary: bit i of Zero/One is set iff element i is
        // known all-zeros / all-ones respectively.
        auto computeKnownBitsElementWise = [&DAG](SDValue V) {
          unsigned NumElts = V.getValueType().getVectorNumElements();
          KnownBits Known(NumElts);
          for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
            APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
            KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
            if (PeepholeKnown.isZero())
              Known.Zero.setBit(EltIdx);
            if (PeepholeKnown.isAllOnes())
              Known.One.setBit(EltIdx);
          }
          return Known;
        };

        KnownBits V1Known = computeKnownBitsElementWise(V1);
        KnownBits V2Known = computeKnownBitsElementWise(V2);

        // Re-check each lane: OR still yields the selected source when the
        // other source is known zero, or the selected one is known all-ones
        // (OR with all-ones is all-ones either way).
        for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
          int M = Mask[i];
          if (M == SM_SentinelUndef)
            continue;
          if (M == SM_SentinelZero) {
            IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
            continue;
          }
          if (M == (int)i) {
            IsBlend &= V2Known.Zero[i] || V1Known.One[i];
            continue;
          }
          if (M == (int)(i + NumMaskElts)) {
            IsBlend &= V1Known.Zero[i] || V2Known.One[i];
            continue;
          }
          llvm_unreachable("will not get here.");
        }
        if (IsBlend) {
          Shuffle = ISD::OR;
          SrcVT = DstVT = MaskVT.changeTypeToInteger();
          return true;
        }
      }
    }
  }

  return false;
}
39391
    MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
    bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
    const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
    unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
  // Attempt to match the mask against binary shuffle instructions that take
  // an immediate permute operand (VALIGN, PALIGNR, BLENDI, INSERTPS, SHUFP).
  // On success sets Shuffle/ShuffleVT/PermuteImm and may rewrite V1/V2 in
  // place (swapping or zeroing operands) to fit the matched form.
  unsigned NumMaskElts = Mask.size();
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  // Attempt to match against VALIGND/VALIGNQ rotate.
  if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
      ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
       (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    // VALIGN can't zero elements, so only non-zeroing masks qualify.
    if (!isAnyZero(Mask)) {
      int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
      if (0 < Rotation) {
        Shuffle = X86ISD::VALIGN;
        if (EltSizeInBits == 64)
          ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
        else
          ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
        PermuteImm = Rotation;
        return true;
      }
    }
  }

  // Attempt to match against PALIGNR byte rotate.
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
                         (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
    int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
    if (0 < ByteRotation) {
      Shuffle = X86ISD::PALIGNR;
      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }

  // Attempt to combine to X86ISD::BLENDI.
  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
    uint64_t BlendMask = 0;
    bool ForceV1Zero = false, ForceV2Zero = false;
    SmallVector<int, 8> TargetMask(Mask);
    if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
                            ForceV2Zero, BlendMask)) {
      if (MaskVT == MVT::v16i16) {
        // We can only use v16i16 PBLENDW if the lanes are repeated.
        SmallVector<int, 8> RepeatedMask;
        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
                                        RepeatedMask)) {
          assert(RepeatedMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          // Rebuild the 8-bit immediate from the repeated 128-bit lane mask.
          PermuteImm = 0;
          for (int i = 0; i < 8; ++i)
            if (RepeatedMask[i] >= 8)
              PermuteImm |= 1 << i;
          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
          Shuffle = X86ISD::BLENDI;
          ShuffleVT = MaskVT;
          return true;
        }
      } else {
        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
        PermuteImm = (unsigned)BlendMask;
        Shuffle = X86ISD::BLENDI;
        ShuffleVT = MaskVT;
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS, but only if it has elements that need to
  // be set to zero.
  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
      MaskVT.is128BitVector() && isAnyZero(Mask) &&
      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
    Shuffle = X86ISD::INSERTPS;
    ShuffleVT = MVT::v4f32;
    return true;
  }

  // Attempt to combine to SHUFPD.
  if (AllowFloatDomain && EltSizeInBits == 64 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    bool ForceV1Zero = false, ForceV2Zero = false;
    if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
                               PermuteImm, Mask, Zeroable)) {
      V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
      V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
      Shuffle = X86ISD::SHUFP;
      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
      return true;
    }
  }

  // Attempt to combine to SHUFPS.
  if (AllowFloatDomain && EltSizeInBits == 32 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    SmallVector<int, 4> RepeatedMask;
    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask, to determine if its just
      // referencing one of the vectors, is zeroable or entirely undef.
      // Returns the source vector for that half (or SDValue() on failure)
      // and writes the two shuffle indices into S0/S1.
      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
        int M0 = RepeatedMask[Offset];
        int M1 = RepeatedMask[Offset + 1];

        if (isUndefInRange(RepeatedMask, Offset, 2)) {
          return DAG.getUNDEF(MaskVT);
        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
          return getZeroVector(MaskVT, Subtarget, DAG, DL);
        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
          // Both indices reference V1.
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V1;
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
          // Both indices reference V2.
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V2;
        }

        return SDValue();
      };

      int ShufMask[4] = {-1, -1, -1, -1};
      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (Lo && Hi) {
        V1 = Lo;
        V2 = Hi;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
      MaskVT.is128BitVector() &&
      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
    Shuffle = X86ISD::INSERTPS;
    ShuffleVT = MVT::v4f32;
    return true;
  }

  return false;
}
39553
39555 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39556 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39557 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39558 const X86Subtarget &Subtarget);
39559
39560/// Combine an arbitrary chain of shuffles into a single instruction if
39561/// possible.
39562///
39563/// This is the leaf of the recursive combine below. When we have found some
39564/// chain of single-use x86 shuffle instructions and accumulated the combined
39565/// shuffle mask represented by them, this will try to pattern match that mask
39566/// into either a single instruction if there is a special purpose instruction
39567/// for this operation, or into a PSHUFB instruction which is a fully general
39568/// instruction but should only be used to replace chains over a certain depth.
39570 ArrayRef<int> BaseMask, int Depth,
39571 bool HasVariableMask,
39572 bool AllowVariableCrossLaneMask,
39573 bool AllowVariablePerLaneMask,
39574 SelectionDAG &DAG,
39575 const X86Subtarget &Subtarget) {
39576 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39577 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39578 "Unexpected number of shuffle inputs!");
39579
39580 SDLoc DL(Root);
39581 MVT RootVT = Root.getSimpleValueType();
39582 unsigned RootSizeInBits = RootVT.getSizeInBits();
39583 unsigned NumRootElts = RootVT.getVectorNumElements();
39584
39585 // Canonicalize shuffle input op to the requested type.
39586 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39587 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39588 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39589 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39590 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39591 return DAG.getBitcast(VT, Op);
39592 };
39593
39594 // Find the inputs that enter the chain. Note that multiple uses are OK
39595 // here, we're not going to remove the operands we find.
39596 bool UnaryShuffle = (Inputs.size() == 1);
39597 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39598 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39599 : peekThroughBitcasts(Inputs[1]));
39600
39601 MVT VT1 = V1.getSimpleValueType();
39602 MVT VT2 = V2.getSimpleValueType();
39603 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39604 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39605
39606 SDValue Res;
39607
39608 unsigned NumBaseMaskElts = BaseMask.size();
39609 if (NumBaseMaskElts == 1) {
39610 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39611 return CanonicalizeShuffleInput(RootVT, V1);
39612 }
39613
39614 bool OptForSize = DAG.shouldOptForSize();
39615 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39616 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39617 (RootVT.isFloatingPoint() && Depth >= 1) ||
39618 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39619
39620 // Don't combine if we are a AVX512/EVEX target and the mask element size
39621 // is different from the root element size - this would prevent writemasks
39622 // from being reused.
39623 bool IsMaskedShuffle = false;
39624 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39625 if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
39626 Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39627 IsMaskedShuffle = true;
39628 }
39629 }
39630
39631 // If we are shuffling a splat (and not introducing zeros) then we can just
39632 // use it directly. This works for smaller elements as well as they already
39633 // repeat across each mask element.
39634 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39635 V1.getValueSizeInBits() >= RootSizeInBits &&
39636 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39637 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39638 return CanonicalizeShuffleInput(RootVT, V1);
39639 }
39640
39641 SmallVector<int, 64> Mask(BaseMask);
39642
39643 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39644 // etc. can be simplified.
39645 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39646 SmallVector<int> ScaledMask, IdentityMask;
39647 unsigned NumElts = VT1.getVectorNumElements();
39648 if (Mask.size() <= NumElts &&
39649 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39650 for (unsigned i = 0; i != NumElts; ++i)
39651 IdentityMask.push_back(i);
39652 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39653 V2))
39654 return CanonicalizeShuffleInput(RootVT, V1);
39655 }
39656 }
39657
39658 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39659 if (RootVT.is512BitVector() &&
39660 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39661 // If the upper subvectors are zeroable, then an extract+insert is more
39662 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39663 // to zero the upper subvectors.
39664 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39665 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39666 return SDValue(); // Nothing to do!
39667 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39668 "Unexpected lane shuffle");
39669 Res = CanonicalizeShuffleInput(RootVT, V1);
39670 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39671 bool UseZero = isAnyZero(Mask);
39672 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39673 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39674 }
39675
39676 // Narrow shuffle mask to v4x128.
39677 SmallVector<int, 4> ScaledMask;
39678 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39679 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39680
39681 // Try to lower to vshuf64x2/vshuf32x4.
39682 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39683 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39684 SelectionDAG &DAG) {
39685 int PermMask[4] = {-1, -1, -1, -1};
39686 // Ensure elements came from the same Op.
39687 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39688 for (int i = 0; i < 4; ++i) {
39689 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39690 if (ScaledMask[i] < 0)
39691 continue;
39692
39693 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39694 unsigned OpIndex = i / 2;
39695 if (Ops[OpIndex].isUndef())
39696 Ops[OpIndex] = Op;
39697 else if (Ops[OpIndex] != Op)
39698 return SDValue();
39699
39700 PermMask[i] = ScaledMask[i] % 4;
39701 }
39702
39703 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39704 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39705 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39706 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
39707 };
39708
39709 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39710 // doesn't work because our mask is for 128 bits and we don't have an MVT
39711 // to match that.
39712 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39713 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39714 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39715 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39716 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39717 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39718 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39719 ScaledMask[1] == (ScaledMask[3] % 2));
39720
39721 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39722 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39723 return SDValue(); // Nothing to do!
39724 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39725 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39726 return DAG.getBitcast(RootVT, V);
39727 }
39728 }
39729
39730 // Handle 128-bit lane shuffles of 256-bit vectors.
39731 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39732 // If the upper half is zeroable, then an extract+insert is more optimal
39733 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39734 // zero the upper half.
39735 if (isUndefOrZero(Mask[1])) {
39736 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39737 return SDValue(); // Nothing to do!
39738 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39739 Res = CanonicalizeShuffleInput(RootVT, V1);
39740 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39741 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39742 256);
39743 }
39744
39745 // If we're inserting the low subvector, an insert-subvector 'concat'
39746 // pattern is quicker than VPERM2X128.
39747 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39748 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39749 !Subtarget.hasAVX2()) {
39750 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39751 return SDValue(); // Nothing to do!
39752 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39753 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39754 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39755 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39756 }
39757
39758 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39759 return SDValue(); // Nothing to do!
39760
39761 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39762 // we need to use the zeroing feature.
39763 // Prefer blends for sequential shuffles unless we are optimizing for size.
39764 if (UnaryShuffle &&
39765 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39766 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39767 unsigned PermMask = 0;
39768 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39769 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39770 return DAG.getNode(
39771 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39772 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39773 }
39774
39775 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39776 return SDValue(); // Nothing to do!
39777
39778 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39779 if (!UnaryShuffle && !IsMaskedShuffle) {
39780 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39781 "Unexpected shuffle sentinel value");
39782 // Prefer blends to X86ISD::VPERM2X128.
39783 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39784 unsigned PermMask = 0;
39785 PermMask |= ((Mask[0] & 3) << 0);
39786 PermMask |= ((Mask[1] & 3) << 4);
39787 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39788 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39789 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39790 CanonicalizeShuffleInput(RootVT, LHS),
39791 CanonicalizeShuffleInput(RootVT, RHS),
39792 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39793 }
39794 }
39795 }
39796
39797 // For masks that have been widened to 128-bit elements or more,
39798 // narrow back down to 64-bit elements.
39799 if (BaseMaskEltSizeInBits > 64) {
39800 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39801 int MaskScale = BaseMaskEltSizeInBits / 64;
39802 SmallVector<int, 64> ScaledMask;
39803 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39804 Mask = std::move(ScaledMask);
39805 }
39806
39807 // For masked shuffles, we're trying to match the root width for better
39808 // writemask folding, attempt to scale the mask.
39809 // TODO - variable shuffles might need this to be widened again.
39810 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39811 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39812 int MaskScale = NumRootElts / Mask.size();
39813 SmallVector<int, 64> ScaledMask;
39814 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39815 Mask = std::move(ScaledMask);
39816 }
39817
39818 unsigned NumMaskElts = Mask.size();
39819 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39820 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39821
39822 // Determine the effective mask value type.
39823 FloatDomain &= (32 <= MaskEltSizeInBits);
39824 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39825 : MVT::getIntegerVT(MaskEltSizeInBits);
39826 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39827
39828 // Only allow legal mask types.
39829 if (!TLI.isTypeLegal(MaskVT))
39830 return SDValue();
39831
39832 // Attempt to match the mask against known shuffle patterns.
39833 MVT ShuffleSrcVT, ShuffleVT;
39834 unsigned Shuffle, PermuteImm;
39835
39836 // Which shuffle domains are permitted?
39837 // Permit domain crossing at higher combine depths.
39838 // TODO: Should we indicate which domain is preferred if both are allowed?
39839 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39840 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39841 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39842
39843 // Determine zeroable mask elements.
39844 APInt KnownUndef, KnownZero;
39845 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39846 APInt Zeroable = KnownUndef | KnownZero;
39847
39848 if (UnaryShuffle) {
39849 // Attempt to match against broadcast-from-vector.
39850 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39851 if ((Subtarget.hasAVX2() ||
39852 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39853 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39854 if (isUndefOrEqual(Mask, 0)) {
39855 if (V1.getValueType() == MaskVT &&
39857 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39858 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39859 return SDValue(); // Nothing to do!
39860 Res = V1.getOperand(0);
39861 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39862 return DAG.getBitcast(RootVT, Res);
39863 }
39864 if (Subtarget.hasAVX2()) {
39865 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39866 return SDValue(); // Nothing to do!
39867 Res = CanonicalizeShuffleInput(MaskVT, V1);
39868 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39869 return DAG.getBitcast(RootVT, Res);
39870 }
39871 }
39872 }
39873
39874 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39875 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39876 (!IsMaskedShuffle ||
39877 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39878 if (Depth == 0 && Root.getOpcode() == Shuffle)
39879 return SDValue(); // Nothing to do!
39880 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39881 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39882 return DAG.getBitcast(RootVT, Res);
39883 }
39884
39885 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39886 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39887 PermuteImm) &&
39888 (!IsMaskedShuffle ||
39889 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39890 if (Depth == 0 && Root.getOpcode() == Shuffle)
39891 return SDValue(); // Nothing to do!
39892 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39893 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39894 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39895 return DAG.getBitcast(RootVT, Res);
39896 }
39897 }
39898
39899 // Attempt to combine to INSERTPS, but only if the inserted element has come
39900 // from a scalar.
39901 // TODO: Handle other insertions here as well?
39902 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39903 Subtarget.hasSSE41() &&
39904 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39905 if (MaskEltSizeInBits == 32) {
39906 SDValue SrcV1 = V1, SrcV2 = V2;
39907 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39908 DAG) &&
39909 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39910 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39911 return SDValue(); // Nothing to do!
39912 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39913 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39914 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39915 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39916 return DAG.getBitcast(RootVT, Res);
39917 }
39918 }
39919 if (MaskEltSizeInBits == 64 &&
39920 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39921 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39922 V2.getScalarValueSizeInBits() <= 32) {
39923 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39924 return SDValue(); // Nothing to do!
39925 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39926 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39927 CanonicalizeShuffleInput(MVT::v4f32, V1),
39928 CanonicalizeShuffleInput(MVT::v4f32, V2),
39929 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39930 return DAG.getBitcast(RootVT, Res);
39931 }
39932 }
39933
39934 SDValue NewV1 = V1; // Save operands in case early exit happens.
39935 SDValue NewV2 = V2;
39936 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39937 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39938 ShuffleVT, UnaryShuffle) &&
39939 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39940 if (Depth == 0 && Root.getOpcode() == Shuffle)
39941 return SDValue(); // Nothing to do!
39942 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39943 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39944 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39945 return DAG.getBitcast(RootVT, Res);
39946 }
39947
39948 NewV1 = V1; // Save operands in case early exit happens.
39949 NewV2 = V2;
39950 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39951 AllowIntDomain, NewV1, NewV2, DL, DAG,
39952 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39953 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39954 if (Depth == 0 && Root.getOpcode() == Shuffle)
39955 return SDValue(); // Nothing to do!
39956 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39957 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39958 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39959 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39960 return DAG.getBitcast(RootVT, Res);
39961 }
39962
39963 // Typically from here on, we need an integer version of MaskVT.
39964 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39965 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39966
39967 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39968 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39969 uint64_t BitLen, BitIdx;
39970 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39971 Zeroable)) {
39972 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39973 return SDValue(); // Nothing to do!
39974 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39975 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39976 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39977 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39978 return DAG.getBitcast(RootVT, Res);
39979 }
39980
39981 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39982 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39983 return SDValue(); // Nothing to do!
39984 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39985 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39986 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39987 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39988 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39989 return DAG.getBitcast(RootVT, Res);
39990 }
39991 }
39992
39993 // Match shuffle against TRUNCATE patterns.
39994 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39995 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39996 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39997 Subtarget)) {
39998 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39999 ShuffleSrcVT.getVectorNumElements();
40000 unsigned Opc =
40001 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40002 if (Depth == 0 && Root.getOpcode() == Opc)
40003 return SDValue(); // Nothing to do!
40004 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40005 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40006 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40007 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40008 return DAG.getBitcast(RootVT, Res);
40009 }
40010
40011 // Do we need a more general binary truncation pattern?
40012 if (RootSizeInBits < 512 &&
40013 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40014 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40015 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40016 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40017 // Bail if this was already a truncation or PACK node.
40018 // We sometimes fail to match PACK if we demand known undef elements.
40019 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
40020 Root.getOpcode() == X86ISD::PACKSS ||
40021 Root.getOpcode() == X86ISD::PACKUS))
40022 return SDValue(); // Nothing to do!
40023 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40024 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40025 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40026 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40027 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40028 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40029 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40030 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40031 return DAG.getBitcast(RootVT, Res);
40032 }
40033 }
40034
40035 // Don't try to re-form single instruction chains under any circumstances now
40036 // that we've done encoding canonicalization for them.
40037 if (Depth < 1)
40038 return SDValue();
40039
40040 // Depth threshold above which we can efficiently use variable mask shuffles.
40041 int VariableCrossLaneShuffleDepth =
40042 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40043 int VariablePerLaneShuffleDepth =
40044 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40045 AllowVariableCrossLaneMask &=
40046 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
40047 AllowVariablePerLaneMask &=
40048 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40049 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40050 // higher depth before combining them.
40051 bool AllowBWIVPERMV3 =
40052 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40053
40054 // If root was a VPERMV3 node, always allow a variable shuffle.
40055 if (Root.getOpcode() == X86ISD::VPERMV3)
40056 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40057
40058 bool MaskContainsZeros = isAnyZero(Mask);
40059
40060 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40061 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40062 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40063 if (Subtarget.hasAVX2() &&
40064 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40065 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40066 Res = CanonicalizeShuffleInput(MaskVT, V1);
40067 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40068 return DAG.getBitcast(RootVT, Res);
40069 }
40070 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40071 if ((Subtarget.hasAVX512() &&
40072 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40073 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40074 (Subtarget.hasBWI() &&
40075 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40076 (Subtarget.hasVBMI() &&
40077 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40078 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40079 V2 = DAG.getUNDEF(MaskVT);
40080 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40081 return DAG.getBitcast(RootVT, Res);
40082 }
40083 }
40084
40085 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40086 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40087 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40088 ((Subtarget.hasAVX512() &&
40089 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40090 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40091 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40092 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40093 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40094 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40095 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40096 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40097 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40098 for (unsigned i = 0; i != NumMaskElts; ++i)
40099 if (Mask[i] == SM_SentinelZero)
40100 Mask[i] = NumMaskElts + i;
40101 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40102 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40103 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40104 return DAG.getBitcast(RootVT, Res);
40105 }
40106
40107 // If that failed and either input is extracted then try to combine as a
40108 // shuffle with the larger type.
40110 Inputs, Root, BaseMask, Depth, HasVariableMask,
40111 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40112 Subtarget))
40113 return WideShuffle;
40114
40115 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40116 // (non-VLX will pad to 512-bit shuffles).
40117 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40118 ((Subtarget.hasAVX512() &&
40119 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40120 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40121 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40122 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40123 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40124 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40125 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40126 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40127 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40128 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40129 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40130 return DAG.getBitcast(RootVT, Res);
40131 }
40132 return SDValue();
40133 }
40134
40135 // See if we can combine a single input shuffle with zeros to a bit-mask,
40136 // which is much simpler than any shuffle.
40137 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40138 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40139 TLI.isTypeLegal(MaskVT)) {
40140 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40141 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40142 APInt UndefElts(NumMaskElts, 0);
40143 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40144 for (unsigned i = 0; i != NumMaskElts; ++i) {
40145 int M = Mask[i];
40146 if (M == SM_SentinelUndef) {
40147 UndefElts.setBit(i);
40148 continue;
40149 }
40150 if (M == SM_SentinelZero)
40151 continue;
40152 EltBits[i] = AllOnes;
40153 }
40154 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40155 Res = CanonicalizeShuffleInput(MaskVT, V1);
40156 unsigned AndOpcode =
40158 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40159 return DAG.getBitcast(RootVT, Res);
40160 }
40161
40162 // If we have a single input shuffle with different shuffle patterns in the
40163 // the 128-bit lanes use the variable mask to VPERMILPS.
40164 // TODO Combine other mask types at higher depths.
40165 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40166 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40167 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40168 SmallVector<SDValue, 16> VPermIdx;
40169 for (int M : Mask) {
40170 SDValue Idx =
40171 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40172 VPermIdx.push_back(Idx);
40173 }
40174 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40175 Res = CanonicalizeShuffleInput(MaskVT, V1);
40176 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40177 return DAG.getBitcast(RootVT, Res);
40178 }
40179
40180 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40181 // to VPERMIL2PD/VPERMIL2PS.
40182 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40183 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40184 MaskVT == MVT::v8f32)) {
40185 // VPERMIL2 Operation.
40186 // Bits[3] - Match Bit.
40187 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40188 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40189 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40190 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40191 SmallVector<int, 8> VPerm2Idx;
40192 unsigned M2ZImm = 0;
40193 for (int M : Mask) {
40194 if (M == SM_SentinelUndef) {
40195 VPerm2Idx.push_back(-1);
40196 continue;
40197 }
40198 if (M == SM_SentinelZero) {
40199 M2ZImm = 2;
40200 VPerm2Idx.push_back(8);
40201 continue;
40202 }
40203 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40204 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40205 VPerm2Idx.push_back(Index);
40206 }
40207 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40208 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40209 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40210 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40211 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40212 return DAG.getBitcast(RootVT, Res);
40213 }
40214
40215 // If we have 3 or more shuffle instructions or a chain involving a variable
40216 // mask, we can replace them with a single PSHUFB instruction profitably.
40217 // Intel's manuals suggest only using PSHUFB if doing so replacing 5
40218 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40219 // more aggressive.
40220 if (UnaryShuffle && AllowVariablePerLaneMask &&
40221 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40222 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40223 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40224 SmallVector<SDValue, 16> PSHUFBMask;
40225 int NumBytes = RootVT.getSizeInBits() / 8;
40226 int Ratio = NumBytes / NumMaskElts;
40227 for (int i = 0; i < NumBytes; ++i) {
40228 int M = Mask[i / Ratio];
40229 if (M == SM_SentinelUndef) {
40230 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40231 continue;
40232 }
40233 if (M == SM_SentinelZero) {
40234 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40235 continue;
40236 }
40237 M = Ratio * M + i % Ratio;
40238 assert((M / 16) == (i / 16) && "Lane crossing detected");
40239 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40240 }
40241 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40242 Res = CanonicalizeShuffleInput(ByteVT, V1);
40243 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40244 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40245 return DAG.getBitcast(RootVT, Res);
40246 }
40247
40248 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40249 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40250 // slower than PSHUFB on targets that support both.
40251 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40252 Subtarget.hasXOP()) {
40253 // VPPERM Mask Operation
40254 // Bits[4:0] - Byte Index (0 - 31)
40255 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40256 SmallVector<SDValue, 16> VPPERMMask;
40257 int NumBytes = 16;
40258 int Ratio = NumBytes / NumMaskElts;
40259 for (int i = 0; i < NumBytes; ++i) {
40260 int M = Mask[i / Ratio];
40261 if (M == SM_SentinelUndef) {
40262 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40263 continue;
40264 }
40265 if (M == SM_SentinelZero) {
40266 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40267 continue;
40268 }
40269 M = Ratio * M + i % Ratio;
40270 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40271 }
40272 MVT ByteVT = MVT::v16i8;
40273 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40274 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40275 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40276 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40277 return DAG.getBitcast(RootVT, Res);
40278 }
40279
40280 // If that failed and either input is extracted then try to combine as a
40281 // shuffle with the larger type.
40283 Inputs, Root, BaseMask, Depth, HasVariableMask,
40284 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40285 return WideShuffle;
40286
40287 // If we have a dual input shuffle then lower to VPERMV3,
40288 // (non-VLX will pad to 512-bit shuffles)
40289 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40290 ((Subtarget.hasAVX512() &&
40291 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40292 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40293 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40294 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40295 MaskVT == MVT::v16i32)) ||
40296 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40297 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40298 MaskVT == MVT::v32i16)) ||
40299 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40300 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40301 MaskVT == MVT::v64i8)))) {
40302 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40303 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40304 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40305 return DAG.getBitcast(RootVT, Res);
40306 }
40307
40308 // Failed to find any combines.
40309 return SDValue();
40310}
40311
40312// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40313// instruction if possible.
40314//
40315// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40316// type size to attempt to combine:
40317// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40318// -->
40319// extract_subvector(shuffle(x,y,m2),0)
// NOTE(review): the declaration line (original line 40320) was lost in this
// extract; per the wrapper comment above and the recursive call below, this is
// presumably `static SDValue combineX86ShuffleChainWithExtract(` — confirm
// against the upstream file.
40321 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40322 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40323 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40324 const X86Subtarget &Subtarget) {
40325 unsigned NumMaskElts = BaseMask.size();
40326 unsigned NumInputs = Inputs.size();
40327 if (NumInputs == 0)
40328 return SDValue();
40329
40330 EVT RootVT = Root.getValueType();
40331 unsigned RootSizeInBits = RootVT.getSizeInBits();
  // Element size implied by the root mask; the assert below guarantees the
  // division is exact.
40332 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40333 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40334
40335 // Peek through subvectors to find widest legal vector.
40336 // TODO: Handle ISD::TRUNCATE
40337 unsigned WideSizeInBits = RootSizeInBits;
40338 for (SDValue Input : Inputs) {
40339 Input = peekThroughBitcasts(Input);
  // Walk up through EXTRACT_SUBVECTOR chains and INSERT_SUBVECTOR-into-undef
  // wrappers to find the original (possibly wider) source vector.
40340 while (1) {
40341 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40342 Input = peekThroughBitcasts(Input.getOperand(0));
40343 continue;
40344 }
40345 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40346 Input.getOperand(0).isUndef()) {
40347 Input = peekThroughBitcasts(Input.getOperand(1));
40348 continue;
40349 }
40350 break;
40351 }
  // Only widen to types the target can legally hold in a register.
40352 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40353 WideSizeInBits < Input.getValueSizeInBits())
40354 WideSizeInBits = Input.getValueSizeInBits();
40355 }
40356
40357 // Bail if we fail to find a source larger than the existing root.
  // Scale = how many root-sized chunks fit in the widest source found.
40358 unsigned Scale = WideSizeInBits / RootSizeInBits;
40359 if (WideSizeInBits <= RootSizeInBits ||
40360 (WideSizeInBits % RootSizeInBits) != 0)
40361 return SDValue();
40362
40363 // Create new mask for larger type.
  // Re-index each mask element so its operand selector (M / NumMaskElts)
  // addresses a Scale-times-wider operand; pad the tail with undef.
40364 SmallVector<int, 64> WideMask(BaseMask);
40365 for (int &M : WideMask) {
40366 if (M < 0)
40367 continue;
40368 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40369 }
40370 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
40371
40372 // Attempt to peek through inputs and adjust mask when we extract from an
40373 // upper subvector.
40374 int AdjustedMasks = 0;
40375 SmallVector<SDValue, 4> WideInputs(Inputs);
40376 for (unsigned I = 0; I != NumInputs; ++I) {
40377 SDValue &Input = WideInputs[I];
40378 Input = peekThroughBitcasts(Input);
40379 while (1) {
40380 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40381 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
  // NOTE(review): original line 40382 (the definition of Idx, presumably
  // the extract's constant start index via getConstantOperandVal(1)) was
  // lost in this extract — confirm against the upstream file.
40383 if (Idx != 0) {
40384 ++AdjustedMasks;
  // Rescale the subvector index from input-element units into
  // root-element units before offsetting the mask.
40385 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40386 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40387
  // Only mask elements referring to operand I ([lo, hi)) are shifted.
40388 int lo = I * WideMask.size();
40389 int hi = (I + 1) * WideMask.size();
40390 for (int &M : WideMask)
40391 if (lo <= M && M < hi)
40392 M += Idx;
40393 }
40394 Input = peekThroughBitcasts(Input.getOperand(0));
40395 continue;
40396 }
40397 // TODO: Handle insertions into upper subvectors.
40398 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40399 Input.getOperand(0).isUndef() &&
40400 isNullConstant(Input.getOperand(2))) {
40401 Input = peekThroughBitcasts(Input.getOperand(1));
40402 continue;
40403 }
40404 break;
40405 }
40406 }
40407
40408 // Remove unused/repeated shuffle source ops.
40409 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40410 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40411
40412 // Bail if we're always extracting from the lowest subvectors,
40413 // combineX86ShuffleChain should match this for the current width, or the
40414 // shuffle still references too many inputs.
40415 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40416 return SDValue();
40417
40418 // Minor canonicalization of the accumulated shuffle mask to make it easier
40419 // to match below. All this does is detect masks with sequential pairs of
40420 // elements, and shrink them to the half-width mask. It does this in a loop
40421 // so it will reduce the size of the mask to the minimal width mask which
40422 // performs an equivalent shuffle.
40423 while (WideMask.size() > 1) {
40424 SmallVector<int, 64> WidenedMask;
40425 if (!canWidenShuffleElements(WideMask, WidenedMask))
40426 break;
40427 WideMask = std::move(WidenedMask);
40428 }
40429
40430 // Canonicalization of binary shuffle masks to improve pattern matching by
40431 // commuting the inputs.
40432 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
  // NOTE(review): original line 40433 (presumably the commuteMask(WideMask)
  // call that mirrors the operand swap below) was lost in this extract —
  // confirm against the upstream file.
40434 std::swap(WideInputs[0], WideInputs[1]);
40435 }
40436
40437 // Increase depth for every upper subvector we've peeked through.
40438 Depth += AdjustedMasks;
40439
40440 // Attempt to combine wider chain.
40441 // TODO: Can we use a better Root?
  // Use whichever remaining input is widest as the stand-in root; the assert
  // checks it really has the widened size computed above.
40442 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40443 WideInputs.back().getValueSizeInBits()
40444 ? WideInputs.front()
40445 : WideInputs.back();
40446 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40447 "WideRootSize mismatch");
40448
  // On success, extract the low root-sized subvector of the wide shuffle,
  // completing the extract_subvector(shuffle(x,y,m2),0) transform.
40449 if (SDValue WideShuffle =
40450 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40451 HasVariableMask, AllowVariableCrossLaneMask,
40452 AllowVariablePerLaneMask, DAG, Subtarget)) {
40453 WideShuffle =
40454 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40455 return DAG.getBitcast(RootVT, WideShuffle);
40456 }
40457
40458 return SDValue();
40459}
40460
40461// Canonicalize the combined shuffle mask chain with horizontal ops.
40462// NOTE: This may update the Ops and Mask.
// NOTE(review): the declaration lines (original lines 40463-40464, carrying
// the function name and the Ops/Mask parameters) were lost in this extract.
// Since the body mutates both Ops and Mask, they are presumably mutable
// array refs — confirm against the upstream file.
40465 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40466 const X86Subtarget &Subtarget) {
40467 if (Mask.empty() || Ops.empty())
40468 return SDValue();
40469
// NOTE(review): original lines 40470 and 40472 (presumably the declaration
// of the BC vector and the loop body collecting peekThroughBitcasts(Op) for
// each op) were lost in this extract — confirm against the upstream file.
40471 for (SDValue Op : Ops)
40473
40474 // All ops must be the same horizop + type.
40475 SDValue BC0 = BC[0];
40476 EVT VT0 = BC0.getValueType();
40477 unsigned Opcode0 = BC0.getOpcode();
40478 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40479 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40480 }))
40481 return SDValue();
40482
  // Classify the common opcode: horizontal add/sub vs PACKSS/PACKUS.
40483 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40484 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40485 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40486 if (!isHoriz && !isPack)
40487 return SDValue();
40488
40489 // Do all ops have a single use?
40490 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40491 return Op.hasOneUse() &&
// NOTE(review): original line 40492 (the rest of this predicate) was lost in
// this extract — confirm against the upstream file.
40493 });
40494
40495 int NumElts = VT0.getVectorNumElements();
40496 int NumLanes = VT0.getSizeInBits() / 128;
40497 int NumEltsPerLane = NumElts / NumLanes;
40498 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40499 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40500 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40501
40502 if (NumEltsPerLane >= 4 &&
40503 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40504 SmallVector<int> LaneMask, ScaledMask;
40505 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40506 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40507 // See if we can remove the shuffle by resorting the HOP chain so that
40508 // the HOP args are pre-shuffled.
40509 // TODO: Generalize to any sized/depth chain.
40510 // TODO: Add support for PACKSS/PACKUS.
40511 if (isHoriz) {
40512 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
  // Maps one scaled mask element to the grandparent HOP operand that
  // produces it, or a null SDValue if the pattern doesn't hold.
40513 auto GetHOpSrc = [&](int M) {
40514 if (M == SM_SentinelUndef)
40515 return DAG.getUNDEF(VT0);
40516 if (M == SM_SentinelZero)
40517 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40518 SDValue Src0 = BC[M / 4];
40519 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40520 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40521 return Src1.getOperand(M % 2);
40522 return SDValue();
40523 };
40524 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40525 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40526 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40527 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40528 if (M0 && M1 && M2 && M3) {
  // Rebuild the two-level HOP tree with the sources pre-shuffled,
  // eliminating the outer shuffle entirely.
40529 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40530 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40531 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40532 }
40533 }
40534 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40535 if (Ops.size() >= 2) {
40536 SDValue LHS, RHS;
  // Greedily bin each referenced HOP source into LHS/RHS, emitting a
  // post-permute index (0-1 selects LHS halves, 2-3 selects RHS halves).
40537 auto GetHOpSrc = [&](int M, int &OutM) {
40538 // TODO: Support SM_SentinelZero
40539 if (M < 0)
40540 return M == SM_SentinelUndef;
40541 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40542 if (!LHS || LHS == Src) {
40543 LHS = Src;
40544 OutM = (M % 2);
40545 return true;
40546 }
40547 if (!RHS || RHS == Src) {
40548 RHS = Src;
40549 OutM = (M % 2) + 2;
40550 return true;
40551 }
40552 return false;
40553 };
40554 int PostMask[4] = {-1, -1, -1, -1};
40555 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40556 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40557 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40558 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40559 LHS = DAG.getBitcast(SrcVT, LHS);
40560 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40561 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40562 // Use SHUFPS for the permute so this will work on SSE2 targets,
40563 // shuffle combining and domain handling will simplify this later on.
40564 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40565 Res = DAG.getBitcast(ShuffleVT, Res);
40566 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40567 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40568 }
40569 }
40570 }
40571 }
40572
  // The remaining folds only handle one or two source ops.
40573 if (2 < Ops.size())
40574 return SDValue();
40575
40576 SDValue BC1 = BC[BC.size() - 1];
40577 if (Mask.size() == VT0.getVectorNumElements()) {
40578 // Canonicalize binary shuffles of horizontal ops that use the
40579 // same sources to an unary shuffle.
40580 // TODO: Try to perform this fold even if the shuffle remains.
40581 if (Ops.size() == 2) {
40582 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40583 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40584 };
40585 // Commute if all BC0's ops are contained in BC1.
40586 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40587 ContainsOps(BC1, BC0.getOperand(1))) {
// NOTE(review): original line 40588 (presumably the commuteMask(Mask) call
// that mirrors the operand swaps below) was lost in this extract — confirm
// against the upstream file.
40589 std::swap(Ops[0], Ops[1]);
40590 std::swap(BC0, BC1);
40591 }
40592
40593 // If BC1 can be represented by BC0, then convert to unary shuffle.
40594 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40595 ContainsOps(BC0, BC1.getOperand(1))) {
40596 for (int &M : Mask) {
40597 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40598 continue;
  // Redirect a BC1-relative element to the equivalent BC0 half.
40599 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40600 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40601 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40602 M += NumHalfEltsPerLane;
40603 }
40604 }
40605 }
40606
40607 // Canonicalize unary horizontal ops to only refer to lower halves.
40608 for (int i = 0; i != NumElts; ++i) {
40609 int &M = Mask[i];
40610 if (isUndefOrZero(M))
40611 continue;
40612 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40613 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40614 M -= NumHalfEltsPerLane;
40615 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40616 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40617 M -= NumHalfEltsPerLane;
40618 }
40619 }
40620
40621 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40622 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40623 // represents the LHS/RHS inputs for the lower/upper halves.
40624 SmallVector<int, 16> TargetMask128, WideMask128;
40625 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40626 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40627 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40628 bool SingleOp = (Ops.size() == 1);
40629 if (isPack || OneUseOps ||
40630 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
  // WideMask128[i] in [0,2) selects a BC0 operand, in [2,4) a BC1 operand;
  // bit 0 picks which of the HOP's two source operands.
40631 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40632 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40633 Lo = Lo.getOperand(WideMask128[0] & 1);
40634 Hi = Hi.getOperand(WideMask128[1] & 1);
40635 if (SingleOp) {
40636 SDValue Undef = DAG.getUNDEF(SrcVT);
40637 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40638 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40639 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40640 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40641 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40642 }
40643 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40644 }
40645 }
40646
40647 // If we are post-shuffling a 256-bit hop and not requiring the upper
40648 // elements, then try to narrow to a 128-bit hop directly.
40649 SmallVector<int, 16> WideMask64;
40650 if (Ops.size() == 1 && NumLanes == 2 &&
40651 scaleShuffleElements(Mask, 4, WideMask64) &&
40652 isUndefInRange(WideMask64, 2, 2)) {
40653 int M0 = WideMask64[0];
40654 int M1 = WideMask64[1];
40655 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
// NOTE(review): original line 40656 (presumably the definition of HalfVT,
// the half-width vector type used for the narrowed hop below) was lost in
// this extract — confirm against the upstream file.
40657 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
40658 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
40659 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
40660 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
40661 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
  // Pad the 128-bit result back to 256 bits (upper half is undef).
40662 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
40663 }
40664 }
40665
40666 return SDValue();
40667}
40668
40669// Attempt to constant fold all of the constant source ops.
40670// Returns true if the entire shuffle is folded to a constant.
40671// TODO: Extend this to merge multiple constant Ops and update the mask.
// NOTE(review): the declaration line (original line 40672, carrying the
// function name and the leading VT/Ops parameters referenced by the body)
// was lost in this extract — confirm against the upstream file.
40673 ArrayRef<int> Mask,
40674 bool HasVariableMask,
40675 SelectionDAG &DAG, const SDLoc &DL,
40676 const X86Subtarget &Subtarget) {
40677 unsigned SizeInBits = VT.getSizeInBits();
40678 unsigned NumMaskElts = Mask.size();
  // Bit width of a single mask element in the shuffled value.
40679 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40680 unsigned NumOps = Ops.size();
40681
40682 // Extract constant bits from each source op.
  // Bail out (no fold) unless EVERY source op is a constant we can read.
40683 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40684 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40685 for (unsigned I = 0; I != NumOps; ++I)
40686 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40687 RawBitsOps[I],
40688 /*AllowWholeUndefs*/ true,
40689 /*AllowPartialUndefs*/ true))
40690 return SDValue();
40691
40692 // If we're optimizing for size, only fold if at least one of the constants is
40693 // only used once or the combined shuffle has included a variable mask
40694 // shuffle, this is to avoid constant pool bloat.
40695 bool IsOptimizingSize = DAG.shouldOptForSize();
40696 if (IsOptimizingSize && !HasVariableMask &&
40697 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40698 return SDValue();
40699
40700 // Shuffle the constant bits according to the mask.
  // Each result element is classified as exactly one of undef/zero/constant;
  // the assert after the loop checks the classification is total.
40701 APInt UndefElts(NumMaskElts, 0);
40702 APInt ZeroElts(NumMaskElts, 0);
40703 APInt ConstantElts(NumMaskElts, 0);
40704 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40705 APInt::getZero(MaskSizeInBits));
40706 for (unsigned i = 0; i != NumMaskElts; ++i) {
40707 int M = Mask[i];
40708 if (M == SM_SentinelUndef) {
40709 UndefElts.setBit(i);
40710 continue;
40711 } else if (M == SM_SentinelZero) {
40712 ZeroElts.setBit(i);
40713 continue;
40714 }
40715 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40716
  // Decompose M into (source op, element within that op).
40717 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40718 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40719
40720 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40721 if (SrcUndefElts[SrcMaskIdx]) {
40722 UndefElts.setBit(i);
40723 continue;
40724 }
40725
40726 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40727 APInt &Bits = SrcEltBits[SrcMaskIdx];
40728 if (!Bits) {
40729 ZeroElts.setBit(i);
40730 continue;
40731 }
40732
40733 ConstantElts.setBit(i);
40734 ConstantBitData[i] = Bits;
40735 }
40736 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40737
40738 // Attempt to create a zero vector.
40739 if ((UndefElts | ZeroElts).isAllOnes())
40740 return getZeroVector(VT, Subtarget, DAG, DL);
40741
40742 // Create the constant data.
  // Prefer an FP element type for FP shuffles so the constant stays in the
  // same register domain.
40743 MVT MaskSVT;
40744 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40745 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40746 else
40747 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40748
40749 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40750 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40751 return SDValue();
40752
40753 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40754 return DAG.getBitcast(VT, CstOp);
40755}
40756
// Anonymous enum of X86 shuffle-combining constants.
// NOTE(review): the enumerator lines (original lines 40760-40761) were lost in
// this extract; presumably they define the recursion bound (MaxDepth) consumed
// by combineX86ShufflesRecursively below — confirm against the upstream file.
40757namespace llvm {
40758 namespace X86 {
40759 enum {
40762 } // namespace X86
40763} // namespace llvm
40764
40765/// Fully generic combining of x86 shuffle instructions.
40766///
40767/// This should be the last combine run over the x86 shuffle instructions. Once
40768/// they have been fully optimized, this will recursively consider all chains
40769/// of single-use shuffle instructions, build a generic model of the cumulative
40770/// shuffle operation, and check for simpler instructions which implement this
40771/// operation. We use this primarily for two purposes:
40772///
40773/// 1) Collapse generic shuffles to specialized single instructions when
40774/// equivalent. In most cases, this is just an encoding size win, but
40775/// sometimes we will collapse multiple generic shuffles into a single
40776/// special-purpose shuffle.
40777/// 2) Look for sequences of shuffle instructions with 3 or more total
40778/// instructions, and replace them with the slightly more expensive SSSE3
40779/// PSHUFB instruction if available. We do this as the last combining step
40780/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40781/// a suitable short sequence of other instructions. The PSHUFB will either
40782/// use a register or have to read from memory and so is slightly (but only
40783/// slightly) more expensive than the other shuffle instructions.
40784///
40785/// Because this is inherently a quadratic operation (for each shuffle in
40786/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40787/// This should never be an issue in practice as the shuffle lowering doesn't
40788/// produce sequences of more than 8 instructions.
40789///
40790/// FIXME: We will currently miss some cases where the redundant shuffling
40791/// would simplify under the threshold for PSHUFB formation because of
40792/// combine-ordering. To fix this, we should do the redundant instruction
40793/// combining in this recursive walk.
// NOTE(review): this listing has gaps - the
// 'static SDValue combineX86ShufflesRecursively(' signature line (40794)
// and several statement lines (e.g. 40815, 40896-40897, 41056, 41064,
// 41095, 41104, 41124, 41164, 41181, 41201, 41213-41214, 41225) are elided
// by the extraction. Verify against upstream before editing.
40795 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40796 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40797 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40798 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40799 const X86Subtarget &Subtarget) {
40800 assert(!RootMask.empty() &&
40801 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40802 "Illegal shuffle root mask");
40803 MVT RootVT = Root.getSimpleValueType();
40804 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40805 unsigned RootSizeInBits = RootVT.getSizeInBits();
40806 SDLoc DL(Root);
40807
40808 // Bound the depth of our recursive combine because this is ultimately
40809 // quadratic in nature.
40810 if (Depth >= MaxDepth)
40811 return SDValue();
40812
40813 // Directly rip through bitcasts to find the underlying operand.
// NOTE(review): the peekThroughBitcasts call on Op (line 40815) is elided.
40814 SDValue Op = SrcOps[SrcOpIndex];
40816
40817 EVT VT = Op.getValueType();
40818 if (!VT.isVector() || !VT.isSimple())
40819 return SDValue(); // Bail if we hit a non-simple non-vector.
40820
40821 // FIXME: Just bail on f16 for now.
40822 if (VT.getVectorElementType() == MVT::f16)
40823 return SDValue();
40824
40825 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40826 "Can only combine shuffles upto size of the root op.");
40827
40828 // Create a demanded elts mask from the referenced elements of Op.
40829 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40830 for (int M : RootMask) {
40831 int BaseIdx = RootMask.size() * SrcOpIndex;
40832 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40833 OpDemandedElts.setBit(M - BaseIdx);
40834 }
40835 if (RootSizeInBits != VT.getSizeInBits()) {
40836 // Op is smaller than Root - extract the demanded elts for the subvector.
40837 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40838 unsigned NumOpMaskElts = RootMask.size() / Scale;
40839 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40840 assert(OpDemandedElts
40841 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40842 .isZero() &&
40843 "Out of range elements referenced in root mask");
40844 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40845 }
// Rescale the demanded-elts mask from root-mask granularity to Op's own
// element count.
40846 OpDemandedElts =
40847 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
40848
40849 // Extract target shuffle mask and resolve sentinels and inputs.
40850 SmallVector<int, 64> OpMask;
40851 SmallVector<SDValue, 2> OpInputs;
40852 APInt OpUndef, OpZero;
40853 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40854 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40855 OpZero, DAG, Depth, false)) {
40856 // Shuffle inputs must not be larger than the shuffle result.
40857 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40858 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40859 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40860 }))
40861 return SDValue();
40862 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40863 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40864 !isNullConstant(Op.getOperand(1))) {
// Model a non-zero-offset extract_subvector as an identity shuffle of the
// source vector starting at the extract index.
40865 SDValue SrcVec = Op.getOperand(0);
40866 int ExtractIdx = Op.getConstantOperandVal(1);
40867 unsigned NumElts = VT.getVectorNumElements();
40868 OpInputs.assign({SrcVec});
40869 OpMask.assign(NumElts, SM_SentinelUndef);
40870 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40871 OpZero = OpUndef = APInt::getZero(NumElts);
40872 } else {
40873 return SDValue();
40874 }
40875
40876 // If the shuffle result was smaller than the root, we need to adjust the
40877 // mask indices and pad the mask with undefs.
40878 if (RootSizeInBits > VT.getSizeInBits()) {
40879 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40880 unsigned OpMaskSize = OpMask.size();
40881 if (OpInputs.size() > 1) {
40882 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40883 for (int &M : OpMask) {
40884 if (M < 0)
40885 continue;
40886 int EltIdx = M % OpMaskSize;
40887 int OpIdx = M / OpMaskSize;
40888 M = (PaddedMaskSize * OpIdx) + EltIdx;
40889 }
40890 }
40891 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40892 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40893 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40894 }
40895
// NOTE(review): the declarations of the accumulated 'Mask' and 'Ops'
// (upstream lines 40896-40897) are elided from this listing; both are
// used throughout the remainder of the function.
40898
40899 // We don't need to merge masks if the root is empty.
40900 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40901 if (EmptyRoot) {
40902 // Only resolve zeros if it will remove an input, otherwise we might end
40903 // up in an infinite loop.
40904 bool ResolveKnownZeros = true;
40905 if (!OpZero.isZero()) {
40906 APInt UsedInputs = APInt::getZero(OpInputs.size());
40907 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40908 int M = OpMask[i];
40909 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40910 continue;
40911 UsedInputs.setBit(M / OpMask.size());
40912 if (UsedInputs.isAllOnes()) {
40913 ResolveKnownZeros = false;
40914 break;
40915 }
40916 }
40917 }
40918 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40919 ResolveKnownZeros);
40920
40921 Mask = OpMask;
40922 Ops.append(OpInputs.begin(), OpInputs.end());
40923 } else {
40924 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40925
40926 // Add the inputs to the Ops list, avoiding duplicates.
40927 Ops.append(SrcOps.begin(), SrcOps.end());
40928
40929 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40930 // Attempt to find an existing match.
40931 SDValue InputBC = peekThroughBitcasts(Input);
40932 for (int i = 0, e = Ops.size(); i < e; ++i)
40933 if (InputBC == peekThroughBitcasts(Ops[i]))
40934 return i;
40935 // Match failed - should we replace an existing Op?
40936 if (InsertionPoint >= 0) {
40937 Ops[InsertionPoint] = Input;
40938 return InsertionPoint;
40939 }
40940 // Add to the end of the Ops list.
40941 Ops.push_back(Input);
40942 return Ops.size() - 1;
40943 };
40944
40945 SmallVector<int, 2> OpInputIdx;
40946 for (SDValue OpInput : OpInputs)
40947 OpInputIdx.push_back(
40948 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40949
40950 assert(((RootMask.size() > OpMask.size() &&
40951 RootMask.size() % OpMask.size() == 0) ||
40952 (OpMask.size() > RootMask.size() &&
40953 OpMask.size() % RootMask.size() == 0) ||
40954 OpMask.size() == RootMask.size()) &&
40955 "The smaller number of elements must divide the larger.");
40956
40957 // This function can be performance-critical, so we rely on the power-of-2
40958 // knowledge that we have about the mask sizes to replace div/rem ops with
40959 // bit-masks and shifts.
40960 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
40961 "Non-power-of-2 shuffle mask sizes");
40962 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
40963 "Non-power-of-2 shuffle mask sizes");
40964 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
40965 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
40966
// Exactly one of RootRatio/OpRatio is > 1: it scales the narrower mask up
// to the common MaskWidth.
40967 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40968 unsigned RootRatio =
40969 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40970 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40971 assert((RootRatio == 1 || OpRatio == 1) &&
40972 "Must not have a ratio for both incoming and op masks!");
40973
40974 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40975 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40976 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40977 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
40978 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
40979
40980 Mask.resize(MaskWidth, SM_SentinelUndef);
40981
40982 // Merge this shuffle operation's mask into our accumulated mask. Note that
40983 // this shuffle's mask will be the first applied to the input, followed by
40984 // the root mask to get us all the way to the root value arrangement. The
40985 // reason for this order is that we are recursing up the operation chain.
40986 for (unsigned i = 0; i < MaskWidth; ++i) {
40987 unsigned RootIdx = i >> RootRatioLog2;
40988 if (RootMask[RootIdx] < 0) {
40989 // This is a zero or undef lane, we're done.
40990 Mask[i] = RootMask[RootIdx];
40991 continue;
40992 }
40993
40994 unsigned RootMaskedIdx =
40995 RootRatio == 1
40996 ? RootMask[RootIdx]
40997 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40998
40999 // Just insert the scaled root mask value if it references an input other
41000 // than the SrcOp we're currently inserting.
41001 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41002 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41003 Mask[i] = RootMaskedIdx;
41004 continue;
41005 }
41006
41007 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41008 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41009 if (OpMask[OpIdx] < 0) {
41010 // The incoming lanes are zero or undef, it doesn't matter which ones we
41011 // are using.
41012 Mask[i] = OpMask[OpIdx];
41013 continue;
41014 }
41015
41016 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41017 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41018 : (OpMask[OpIdx] << OpRatioLog2) +
41019 (RootMaskedIdx & (OpRatio - 1));
41020
// Rebase the index onto the slot that AddOp assigned this input in Ops.
41021 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41022 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41023 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41024 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41025
41026 Mask[i] = OpMaskedIdx;
41027 }
41028 }
41029
41030 // Peek through vector widenings and set out of bounds mask indices to undef.
41031 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41032 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
41033 SDValue &Op = Ops[I];
41034 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
41035 isNullConstant(Op.getOperand(2))) {
41036 Op = Op.getOperand(1);
41037 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41038 int Lo = I * Mask.size();
41039 int Hi = (I + 1) * Mask.size();
41040 int NewHi = Lo + (Mask.size() / Scale);
41041 for (int &M : Mask) {
41042 if (Lo <= M && NewHi <= M && M < Hi)
41043 M = SM_SentinelUndef;
41044 }
41045 }
41046 }
41047
41048 // Peek through any free extract_subvector nodes back to root size.
41049 for (SDValue &Op : Ops)
41050 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41051 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41052 isNullConstant(Op.getOperand(1)))
41053 Op = Op.getOperand(0);
41054
41055 // Remove unused/repeated shuffle source ops.
// NOTE(review): the resolveTargetShuffleInputsAndMask(Ops, Mask) call
// (upstream line 41056) is elided from this listing.
41057
41058 // Handle the all undef/zero/ones cases early.
41059 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41060 return DAG.getUNDEF(RootVT);
41061 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41062 return getZeroVector(RootVT, Subtarget, DAG, DL);
41063 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41065 return getOnesVector(RootVT, DAG, DL);
41066
41067 assert(!Ops.empty() && "Shuffle with no inputs detected");
41068 HasVariableMask |= IsOpVariableMask;
41069
41070 // Update the list of shuffle nodes that have been combined so far.
41071 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41072 CombinedNodes.push_back(Op.getNode());
41073
41074 // See if we can recurse into each shuffle source op (if it's a target
41075 // shuffle). The source op should only be generally combined if it either has
41076 // a single use (i.e. current Op) or all its users have already been combined,
41077 // if not then we can still combine but should prevent generation of variable
41078 // shuffles to avoid constant pool bloat.
41079 // Don't recurse if we already have more source ops than we can combine in
41080 // the remaining recursion depth.
41081 if (Ops.size() < (MaxDepth - Depth)) {
41082 for (int i = 0, e = Ops.size(); i < e; ++i) {
41083 // For empty roots, we need to resolve zeroable elements before combining
41084 // them with other shuffles.
41085 SmallVector<int, 64> ResolvedMask = Mask;
41086 if (EmptyRoot)
41087 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41088 bool AllowCrossLaneVar = false;
41089 bool AllowPerLaneVar = false;
41090 if (Ops[i].getNode()->hasOneUse() ||
41091 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41092 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41093 AllowPerLaneVar = AllowVariablePerLaneMask;
41094 }
// NOTE(review): the recursive 'if (SDValue Res =
// combineX86ShufflesRecursively(' line (41095) is elided.
41096 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41097 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41098 Subtarget))
41099 return Res;
41100 }
41101 }
41102
41103 // Attempt to constant fold all of the constant source ops.
// NOTE(review): the 'if (SDValue Cst = combineX86ShufflesConstants('
// line (41104) is elided.
41105 RootVT, Ops, Mask, HasVariableMask, DAG, DL, Subtarget))
41106 return Cst;
41107
41108 // If constant fold failed and we only have constants - then we have
41109 // multiple uses by a single non-variable shuffle - just bail.
41110 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41111 APInt UndefElts;
41112 SmallVector<APInt> RawBits;
41113 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41114 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41115 RawBits,
41116 /*AllowWholeUndefs*/ true,
41117 /*AllowPartialUndefs*/ true);
41118 })) {
41119 return SDValue();
41120 }
41121
41122 // Canonicalize the combined shuffle mask chain with horizontal ops.
41123 // NOTE: This will update the Ops and Mask.
41125 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41126 return DAG.getBitcast(RootVT, HOp);
41127
41128 // Try to refine our inputs given our knowledge of target shuffle mask.
41129 for (auto I : enumerate(Ops)) {
41130 int OpIdx = I.index();
41131 SDValue &Op = I.value();
41132
41133 // What range of shuffle mask element values results in picking from Op?
41134 int Lo = OpIdx * Mask.size();
41135 int Hi = Lo + Mask.size();
41136
41137 // Which elements of Op do we demand, given the mask's granularity?
41138 APInt OpDemandedElts(Mask.size(), 0);
41139 for (int MaskElt : Mask) {
41140 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41141 int OpEltIdx = MaskElt - Lo;
41142 OpDemandedElts.setBit(OpEltIdx);
41143 }
41144 }
41145
41146 // Is the shuffle result smaller than the root?
41147 if (Op.getValueSizeInBits() < RootSizeInBits) {
41148 // We padded the mask with undefs. But we now need to undo that.
41149 unsigned NumExpectedVectorElts = Mask.size();
41150 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41151 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41152 assert(!OpDemandedElts.extractBits(
41153 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41154 "Demanding the virtual undef widening padding?");
41155 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41156 }
41157
41158 // The Op itself may be of different VT, so we need to scale the mask.
41159 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41160 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41161
41162 // Can this operand be simplified any further, given it's demanded elements?
41163 if (SDValue NewOp =
41165 Op, OpScaledDemandedElts, DAG))
41166 Op = NewOp;
41167 }
41168 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41169
41170 // Widen any subvector shuffle inputs we've collected.
41171 // TODO: Remove this to avoid generating temporary nodes, we should only
41172 // widen once combineX86ShuffleChain has found a match.
41173 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41174 return Op.getValueSizeInBits() < RootSizeInBits;
41175 })) {
41176 for (SDValue &Op : Ops)
41177 if (Op.getValueSizeInBits() < RootSizeInBits)
41178 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41179 RootSizeInBits);
41180 // Reresolve - we might have repeated subvector sources.
41182 }
41183
41184 // We can only combine unary and binary shuffle mask cases.
41185 if (Ops.size() <= 2) {
41186 // Minor canonicalization of the accumulated shuffle mask to make it easier
41187 // to match below. All this does is detect masks with sequential pairs of
41188 // elements, and shrink them to the half-width mask. It does this in a loop
41189 // so it will reduce the size of the mask to the minimal width mask which
41190 // performs an equivalent shuffle.
41191 while (Mask.size() > 1) {
41192 SmallVector<int, 64> WidenedMask;
41193 if (!canWidenShuffleElements(Mask, WidenedMask))
41194 break;
41195 Mask = std::move(WidenedMask);
41196 }
41197
41198 // Canonicalization of binary shuffle masks to improve pattern matching by
41199 // commuting the inputs.
41200 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41202 std::swap(Ops[0], Ops[1]);
41203 }
41204
41205 // Try to combine into a single shuffle instruction.
41206 if (SDValue Shuffle = combineX86ShuffleChain(
41207 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41208 AllowVariablePerLaneMask, DAG, Subtarget))
41209 return Shuffle;
41210
41211 // If all the operands come from the same larger vector, fallthrough and try
41212 // to use combineX86ShuffleChainWithExtract.
// NOTE(review): the definitions of LHS/RHS from Ops (upstream lines
// 41213-41214) are elided from this listing.
41215 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41216 (RootSizeInBits / Mask.size()) != 64 ||
41217 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41218 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41219 LHS.getOperand(0) != RHS.getOperand(0))
41220 return SDValue();
41221 }
41222
41223 // If that failed and any input is extracted then try to combine as a
41224 // shuffle with the larger type.
41226 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41227 AllowVariablePerLaneMask, DAG, Subtarget);
41228}
41229
41230/// Helper entry wrapper to combineX86ShufflesRecursively.
/// Starts the recursive combine on \p Op with a trivial identity root mask
/// ({0}), no previously-combined source nodes, depth 0 and the target's
/// maximum combine depth; variable cross-lane and per-lane masks are both
/// permitted at the top level.
// NOTE(review): the signature line (41231) and the leading 'return
// combineX86ShufflesRecursively(' call line (41233) are elided from this
// listing - verify against upstream before editing.
41232 const X86Subtarget &Subtarget) {
41234 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41235 /*HasVarMask*/ false,
41236 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41237 Subtarget);
41238}
41239
41240/// Get the PSHUF-style mask from PSHUF node.
41241///
41242/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41243/// PSHUF-style masks that can be reused with such instructions.
// NOTE(review): the function signature line (41244) and the local 'Ops' /
// 'Mask' declarations (41246-41247) are elided from this listing - verify
// against upstream before editing.
41245 MVT VT = N.getSimpleValueType();
41248 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41249 (void)HaveMask;
41250 assert(HaveMask);
41251
41252 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41253 // matter. Check that the upper masks are repeats and remove them.
41254 if (VT.getSizeInBits() > 128) {
41255 int LaneElts = 128 / VT.getScalarSizeInBits();
41256#ifndef NDEBUG
41257 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41258 for (int j = 0; j < LaneElts; ++j)
41259 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41260 "Mask doesn't repeat in high 128-bit lanes!");
41261#endif
41262 Mask.resize(LaneElts);
41263 }
41264
// Normalize to a 4-element, 0-based mask: PSHUFLW keeps the low 4 words,
// PSHUFHW keeps the high 4 words rebased to start at 0.
41265 switch (N.getOpcode()) {
41266 case X86ISD::PSHUFD:
41267 return Mask;
41268 case X86ISD::PSHUFLW:
41269 Mask.resize(4);
41270 return Mask;
41271 case X86ISD::PSHUFHW:
41272 Mask.erase(Mask.begin(), Mask.begin() + 4);
41273 for (int &M : Mask)
41274 M -= 4;
41275 return Mask;
41276 default:
41277 llvm_unreachable("No valid shuffle instruction found!");
41278 }
41279}
41280
41281/// Search for a combinable shuffle across a chain ending in pshufd.
41282///
41283/// We walk up the chain and look for a combinable shuffle, skipping over
41284/// shuffles that we could hoist this shuffle's transformation past without
41285/// altering anything.
// NOTE(review): the function signature lines (41286-41287) and the 'Chain'
// stack declaration (41296) are elided from this listing - verify against
// upstream before editing.
41288 const SDLoc &DL,
41289 SelectionDAG &DAG) {
41290 assert(N.getOpcode() == X86ISD::PSHUFD &&
41291 "Called with something other than an x86 128-bit half shuffle!");
41292
41293 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41294 // of the shuffles in the chain so that we can form a fresh chain to replace
41295 // this one.
41297 SDValue V = N.getOperand(0);
41298 for (; V.hasOneUse(); V = V.getOperand(0)) {
41299 switch (V.getOpcode()) {
41300 default:
41301 return SDValue(); // Nothing combined!
41302
41303 case ISD::BITCAST:
41304 // Skip bitcasts as we always know the type for the target specific
41305 // instructions.
41306 continue;
41307
41308 case X86ISD::PSHUFD:
41309 // Found another dword shuffle.
41310 break;
41311
41312 case X86ISD::PSHUFLW:
41313 // Check that the low words (being shuffled) are the identity in the
41314 // dword shuffle, and the high words are self-contained.
41315 if (Mask[0] != 0 || Mask[1] != 1 ||
41316 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41317 return SDValue();
41318
41319 Chain.push_back(V);
41320 continue;
41321
41322 case X86ISD::PSHUFHW:
41323 // Check that the high words (being shuffled) are the identity in the
41324 // dword shuffle, and the low words are self-contained.
41325 if (Mask[2] != 2 || Mask[3] != 3 ||
41326 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41327 return SDValue();
41328
41329 Chain.push_back(V);
41330 continue;
41331
41332 case X86ISD::UNPCKL:
41333 case X86ISD::UNPCKH:
41334 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41335 // shuffle into a preceding word shuffle.
41336 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41337 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41338 return SDValue();
41339
41340 // Search for a half-shuffle which we can combine with.
41341 unsigned CombineOp =
41342 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41343 if (V.getOperand(0) != V.getOperand(1) ||
41344 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41345 return SDValue();
41346 Chain.push_back(V);
41347 V = V.getOperand(0);
41348 do {
41349 switch (V.getOpcode()) {
41350 default:
41351 return SDValue(); // Nothing to combine.
41352
41353 case X86ISD::PSHUFLW:
41354 case X86ISD::PSHUFHW:
41355 if (V.getOpcode() == CombineOp)
41356 break;
41357
41358 Chain.push_back(V);
41359
41360 [[fallthrough]];
41361 case ISD::BITCAST:
41362 V = V.getOperand(0);
41363 continue;
41364 }
41365 break;
41366 } while (V.hasOneUse());
41367 break;
41368 }
41369 // Break out of the loop if we break out of the switch.
41370 break;
41371 }
41372
41373 if (!V.hasOneUse())
41374 // We fell out of the loop without finding a viable combining instruction.
41375 return SDValue();
41376
41377 // Merge this node's mask and our incoming mask.
// NOTE(review): the 'VMask' definition (upstream line 41378, presumably
// VMask = getPSHUFShuffleMask(V)) is elided from this listing.
41379 for (int &M : Mask)
41380 M = VMask[M];
41381 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41382 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41383
41384 // Rebuild the chain around this new shuffle.
41385 while (!Chain.empty()) {
41386 SDValue W = Chain.pop_back_val();
41387
41388 if (V.getValueType() != W.getOperand(0).getValueType())
41389 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41390
41391 switch (W.getOpcode()) {
41392 default:
41393 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41394
41395 case X86ISD::UNPCKL:
41396 case X86ISD::UNPCKH:
41397 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41398 break;
41399
41400 case X86ISD::PSHUFD:
41401 case X86ISD::PSHUFLW:
41402 case X86ISD::PSHUFHW:
41403 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41404 break;
41405 }
41406 }
41407 if (V.getValueType() != N.getValueType())
41408 V = DAG.getBitcast(N.getValueType(), V);
41409
41410 // Return the new chain to replace N.
41411 return V;
41412}
41413
41414// Attempt to commute shufps LHS loads:
41415// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
// NOTE(review): the function signature line (41416) is elided from this
// listing - verify the name and parameter list against upstream before
// editing.
41417 SelectionDAG &DAG) {
41418 // TODO: Add vXf64 support.
41419 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41420 return SDValue();
41421
41422 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41423 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41424 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41425 return SDValue();
41426 SDValue N0 = V.getOperand(0);
41427 SDValue N1 = V.getOperand(1);
41428 unsigned Imm = V.getConstantOperandVal(2);
41429 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
// NOTE(review): the second half of this condition (upstream line 41431,
// continuing the '||') is elided from this listing.
41430 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41432 return SDValue();
// Swapping the SHUFP operands swaps the two 4-bit halves of the selector.
41433 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41434 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41435 DAG.getTargetConstant(Imm, DL, MVT::i8));
41436 };
41437
41438 switch (N.getOpcode()) {
41439 case X86ISD::VPERMILPI:
41440 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41441 unsigned Imm = N.getConstantOperandVal(1);
41442 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41443 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41444 }
41445 break;
41446 case X86ISD::SHUFP: {
41447 SDValue N0 = N.getOperand(0);
41448 SDValue N1 = N.getOperand(1);
41449 unsigned Imm = N.getConstantOperandVal(2);
// The immediate XOR patches the outer selector for whichever operand(s)
// were commuted: both halves (0xAA), low half only (0x0A), or high half
// only (0xA0).
41450 if (N0 == N1) {
41451 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41452 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41453 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41454 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41455 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41456 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41457 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41458 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41459 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41460 }
41461 break;
41462 }
41463 }
41464
41465 return SDValue();
41466}
41467
41468// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41469// iff we don't demand the same element index for both X and Y.
// NOTE(review): the first line of the parameter list (upstream line 41471,
// carrying the function name, VT, N0/N1 and BlendMask parameters) is elided
// from this listing - verify against upstream before editing.
41470static SDValue
41472 const APInt &DemandedElts, SelectionDAG &DAG,
41473 const X86Subtarget &Subtarget, const SDLoc &DL) {
41474 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41475 if (!N0.hasOneUse() || !N1.hasOneUse())
41476 return SDValue();
41477
// NOTE(review): the definitions of BC0/BC1 (upstream lines 41479-41480,
// presumably peeking through one-use bitcasts of N0/N1) are elided.
41478 unsigned NumElts = VT.getVectorNumElements();
41481
41482 // See if both operands are shuffles, and that we can scale the shuffle masks
41483 // to the same width as the blend mask.
41484 // TODO: Support SM_SentinelZero?
41485 SmallVector<SDValue, 2> Ops0, Ops1;
41486 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41487 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41488 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41489 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41490 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41491 return SDValue();
41492
41493 // Determine the demanded elts from both permutes.
41494 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41495 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41496 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41497 Demanded1,
41498 /*AllowUndefElts=*/true) ||
41499 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41500 DemandedRHS0, /*AllowUndefElts=*/true) ||
41501 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41502 DemandedRHS1, /*AllowUndefElts=*/true))
41503 return SDValue();
41504
41505 // Confirm that we only use a single operand from both permutes and that we
41506 // don't demand the same index from both.
41507 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41508 DemandedLHS0.intersects(DemandedLHS1))
41509 return SDValue();
41510
41511 // Use the permute demanded elts masks as the new blend mask.
41512 // Create the new permute mask as a blend of the 2 original permute masks.
41513 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41514 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41515 for (unsigned I = 0; I != NumElts; ++I) {
41516 if (Demanded0[I]) {
41517 int M = ScaledMask0[I];
41518 if (0 <= M) {
41519 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41520 "BlendMask demands LHS AND RHS");
41521 NewBlendMask[M] = M;
41522 NewPermuteMask[I] = M;
41523 }
41524 } else if (Demanded1[I]) {
41525 int M = ScaledMask1[I];
41526 if (0 <= M) {
41527 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41528 "BlendMask demands LHS AND RHS");
41529 NewBlendMask[M] = M + NumElts;
41530 NewPermuteMask[I] = M;
41531 }
41532 }
41533 }
41534 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41535 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41536
41537 // v16i16 shuffles can explode in complexity very easily, only accept them if
41538 // the blend mask is the same in the 128-bit subvectors (or can widen to
41539 // v8i32) and the permute can be widened as well.
41540 if (VT == MVT::v16i16) {
41541 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41542 !canWidenShuffleElements(NewBlendMask))
41543 return SDValue();
41544 if (!canWidenShuffleElements(NewPermuteMask))
41545 return SDValue();
41546 }
41547
41548 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41549 // widened to a lane permute (vperm2f128).
// NOTE(review): the first clause of this condition (upstream line 41551,
// a lane-crossing-mask check continuing into 41552) is elided.
41550 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41552 NewPermuteMask) &&
41553 !canScaleShuffleElements(NewPermuteMask, 2))
41554 return SDValue();
41555
// Emit the rewritten form: blend the raw permute sources first, then apply
// the combined unary permute.
41556 SDValue NewBlend =
41557 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
41558 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
41559 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
41560 NewPermuteMask);
41561}
41562
41563// TODO - move this to TLI like isBinOp?
41564static bool isUnaryOp(unsigned Opcode) {
41565 switch (Opcode) {
41566 case ISD::CTLZ:
41567 case ISD::CTTZ:
41568 case ISD::CTPOP:
41569 return true;
41570 }
41571 return false;
41572}
41573
41574// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41575// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
// Returns the rewritten value, or SDValue() if no canonicalization applies.
// NOTE(review): the first line of the function signature (name and leading
// parameters) is elided from this source listing.
41577 const SDLoc &DL) {
41578 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41579 EVT ShuffleVT = N.getValueType();
41580 unsigned Opc = N.getOpcode();
41581
41582 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
41583 // AllZeros/AllOnes constants are freely shuffled and will peek through
41584 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41585 // merge with target shuffles if it has one use so shuffle combining is
41586 // likely to kick in. Shuffles of splats are expected to be removed.
41587 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41588 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41591 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
41592 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
41593 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41594 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41595 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41596 };
41597 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41598 // Ensure we only shuffle whole vector src elements, unless its a logical
41599 // binops where we can more aggressively move shuffles from dst to src.
41600 return isLogicOp(BinOp) ||
41601 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41602 };
41603
41604 switch (Opc) {
41605 // Unary and Unary+Permute Shuffles.
41606 case X86ISD::PSHUFB: {
41607 // Don't merge PSHUFB if it contains zero'd elements.
41608 SmallVector<int> Mask;
 // NOTE(review): the declaration of 'Ops' is elided from this listing.
41610 if (!getTargetShuffleMask(N, false, Ops, Mask))
41611 break;
41612 [[fallthrough]];
41613 }
41614 case X86ISD::VBROADCAST:
41615 case X86ISD::MOVDDUP:
41616 case X86ISD::PSHUFD:
41617 case X86ISD::PSHUFHW:
41618 case X86ISD::PSHUFLW:
41619 case X86ISD::VPERMI:
41620 case X86ISD::VPERMILPI: {
41621 if (N.getOperand(0).getValueType() == ShuffleVT &&
41622 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41623 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41624 unsigned SrcOpcode = N0.getOpcode();
41625 EVT OpVT = N0.getValueType();
41626 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
 // NOTE(review): Op00/Op01 (the source binop's operands) are declared on
 // lines elided from this listing.
41629 bool FoldShuf = Opc != X86ISD::VPERMI;
41630 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
41631 IsMergeableWithShuffle(Op01, FoldShuf)) {
41632 SDValue LHS, RHS;
41633 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41634 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41635 if (N.getNumOperands() == 2) {
41636 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41637 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41638 } else {
41639 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41640 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41641 }
41642 return DAG.getBitcast(ShuffleVT,
41643 DAG.getNode(SrcOpcode, DL, OpVT,
41644 DAG.getBitcast(OpVT, LHS),
41645 DAG.getBitcast(OpVT, RHS)));
41646 }
41647 }
 // SHUFFLE(SINT_TO_FP(X)) -> SINT_TO_FP(SHUFFLE(X)) when the integer source
 // and FP result have matching scalar widths (comparison partially elided
 // from this listing).
41648 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
41649 OpVT.getScalarSizeInBits() ==
41651 SDValue Op00 = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
41652 SDValue Res =
41653 N.getNumOperands() == 2
41654 ? DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1))
41655 : DAG.getNode(Opc, DL, ShuffleVT, Op00);
41656 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
41657 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
41658 }
41659 }
41660 break;
41661 }
41662 // Binary and Binary+Permute Shuffles.
41663 case X86ISD::INSERTPS: {
41664 // Don't merge INSERTPS if it contains zero'd elements.
41665 unsigned InsertPSMask = N.getConstantOperandVal(2);
41666 unsigned ZeroMask = InsertPSMask & 0xF;
41667 if (ZeroMask != 0)
41668 break;
41669 [[fallthrough]];
41670 }
41671 case X86ISD::MOVSD:
41672 case X86ISD::MOVSS:
41673 case X86ISD::BLENDI:
41674 case X86ISD::SHUFP:
41675 case X86ISD::UNPCKH:
41676 case X86ISD::UNPCKL: {
41677 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41678 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41679 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41680 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41681 unsigned SrcOpcode = N0.getOpcode();
41682 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41683 N0.getValueType() == N1.getValueType() &&
41684 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41685 IsSafeToMoveShuffle(N1, SrcOpcode)) {
 // NOTE(review): Op00/Op01/Op10/Op11 (the operands of both source binops)
 // are declared on lines elided from this listing.
41690 // Ensure the total number of shuffles doesn't increase by folding this
41691 // shuffle through to the source ops.
41692 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41693 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41694 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41695 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41696 SDValue LHS, RHS;
41697 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41698 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41699 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41700 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41701 if (N.getNumOperands() == 3) {
41702 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41703 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41704 } else {
41705 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41706 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41707 }
41708 EVT OpVT = N0.getValueType();
41709 return DAG.getBitcast(ShuffleVT,
41710 DAG.getNode(SrcOpcode, DL, OpVT,
41711 DAG.getBitcast(OpVT, LHS),
41712 DAG.getBitcast(OpVT, RHS)));
41713 }
41714 }
41715 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41716 N0.getValueType() == N1.getValueType() &&
41717 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41718 IsSafeToMoveShuffle(N1, SrcOpcode)) {
 // NOTE(review): Op00/Op10 are declared on lines elided from this listing.
41721 SDValue Res;
41722 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41723 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41724 if (N.getNumOperands() == 3) {
41725 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41726 } else {
41727 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41728 }
41729 EVT OpVT = N0.getValueType();
41730 return DAG.getBitcast(
41731 ShuffleVT,
41732 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
41733 }
41734 // TODO: We can generalize this for other shuffles/conversions.
 // UNPCKL(CVTPH2PS(X),CVTPH2PS(Y)) -> CVTPH2PS(UNPCKL(X,Y)).
41735 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
41736 N1.getOpcode() == SrcOpcode &&
41737 N0.getValueType() == N1.getValueType() &&
41738 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
41739 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
41740 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41741 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41742 EVT OpSrcVT = N0.getOperand(0).getValueType();
41743 EVT OpDstVT = N0.getValueType();
41744 SDValue Res =
41745 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
41746 return DAG.getBitcast(ShuffleVT,
41747 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
41748 }
41749 }
41750 break;
41751 }
41752 }
41753 return SDValue();
41754}
41755
41756/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
// Returns the rewritten value, or SDValue() if the fold does not apply.
// NOTE(review): the first line of the signature (function name and the
// 'SDValue V' parameter) is elided from this listing; V is asserted below to
// be an X86ISD::VPERM2X128 node.
41758 SelectionDAG &DAG,
41759 const SDLoc &DL) {
41760 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41761
41762 MVT VT = V.getSimpleValueType();
41763 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41764 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41765 unsigned SrcOpc0 = Src0.getOpcode();
41766 unsigned SrcOpc1 = Src1.getOpcode();
41767 EVT SrcVT0 = Src0.getValueType();
41768 EVT SrcVT1 = Src1.getValueType();
41769
 // Both sources must agree on opcode and type, unless the second source is
 // undef (single-source lane permute).
41770 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41771 return SDValue();
41772
41773 switch (SrcOpc0) {
 // vperm2x128(movddup(x),movddup(y)) -> movddup(vperm2x128(x,y)).
41774 case X86ISD::MOVDDUP: {
41775 SDValue LHS = Src0.getOperand(0);
41776 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41777 SDValue Res =
41778 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41779 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41780 return DAG.getBitcast(VT, Res);
41781 }
41782 case X86ISD::VPERMILPI:
41783 // TODO: Handle v4f64 permutes with different low/high lane masks.
41784 if (SrcVT0 == MVT::v4f64) {
41785 uint64_t Mask = Src0.getConstantOperandVal(1);
41786 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41787 break;
41788 }
41789 [[fallthrough]];
 // Ops whose second operand (shift amount / shuffle immediate) is shared by
 // both sources can have the lane permute performed first.
41790 case X86ISD::VSHLI:
41791 case X86ISD::VSRLI:
41792 case X86ISD::VSRAI:
41793 case X86ISD::PSHUFD:
41794 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41795 SDValue LHS = Src0.getOperand(0);
41796 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41797 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41798 V.getOperand(2));
41799 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41800 return DAG.getBitcast(VT, Res);
41801 }
41802 break;
41803 }
41804
41805 return SDValue();
41806}
41807
41808static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
41811 const X86Subtarget &Subtarget);
41812
41813/// Try to combine x86 target specific shuffles.
41815 SelectionDAG &DAG,
41817 const X86Subtarget &Subtarget) {
41818 using namespace SDPatternMatch;
41819
41820 MVT VT = N.getSimpleValueType();
41821 unsigned NumElts = VT.getVectorNumElements();
41823 unsigned Opcode = N.getOpcode();
41824 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41825
41826 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41827 return R;
41828
41829 // Handle specific target shuffles.
41830 switch (Opcode) {
41831 case X86ISD::MOVDDUP: {
41832 SDValue Src = N.getOperand(0);
41833 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41834 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41835 ISD::isNormalLoad(Src.getNode())) {
41836 LoadSDNode *LN = cast<LoadSDNode>(Src);
41837 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41838 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41839 DCI.CombineTo(N.getNode(), Movddup);
41840 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41842 return N; // Return N so it doesn't get rechecked!
41843 }
41844 }
41845
41846 return SDValue();
41847 }
41848 case X86ISD::VBROADCAST: {
41849 SDValue Src = N.getOperand(0);
41850 SDValue BC = peekThroughBitcasts(Src);
41851 EVT SrcVT = Src.getValueType();
41852 EVT BCVT = BC.getValueType();
41853
41854 // If broadcasting from another shuffle, attempt to simplify it.
41855 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41856 if (isTargetShuffle(BC.getOpcode()) &&
41857 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41858 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41859 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41861 for (unsigned i = 0; i != Scale; ++i)
41862 DemandedMask[i] = i;
41864 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41866 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41867 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41868 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41869 DAG.getBitcast(SrcVT, Res));
41870 }
41871
41872 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41873 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41874 if (Src.getOpcode() == ISD::BITCAST &&
41875 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41876 TLI.isTypeLegal(BCVT) &&
41878 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41879 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41881 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41882 }
41883
41884 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41885 // If we're re-broadcasting a smaller type then broadcast with that type and
41886 // bitcast.
41887 // TODO: Do this for any splat?
41888 if (Src.getOpcode() == ISD::BITCAST &&
41889 (BC.getOpcode() == X86ISD::VBROADCAST ||
41891 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41892 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41893 MVT NewVT =
41895 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41896 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41897 }
41898
41899 // Reduce broadcast source vector to lowest 128-bits.
41900 if (SrcVT.getSizeInBits() > 128)
41901 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41902 extract128BitVector(Src, 0, DAG, DL));
41903
41904 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41905 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41906 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
41907 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41908
41909 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41910 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41911 isNullConstant(Src.getOperand(1)) &&
41912 Src.getValueType() ==
41913 Src.getOperand(0).getValueType().getScalarType() &&
41914 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
41915 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41916
41917 // Share broadcast with the longest vector and extract low subvector (free).
41918 // Ensure the same SDValue from the SDNode use is being used.
41919 for (SDNode *User : Src->users())
41920 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41921 Src == User->getOperand(0) &&
41922 User->getValueSizeInBits(0).getFixedValue() >
41923 VT.getFixedSizeInBits()) {
41924 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41925 VT.getSizeInBits());
41926 }
41927
41928 // vbroadcast(scalarload X) -> vbroadcast_load X
41929 // For float loads, extract other uses of the scalar from the broadcast.
41930 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41931 ISD::isNormalLoad(Src.getNode())) {
41932 LoadSDNode *LN = cast<LoadSDNode>(Src);
41933 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41934 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41935 SDValue BcastLd =
41937 LN->getMemoryVT(), LN->getMemOperand());
41938 // If the load value is used only by N, replace it via CombineTo N.
41939 bool NoReplaceExtract = Src.hasOneUse();
41940 DCI.CombineTo(N.getNode(), BcastLd);
41941 if (NoReplaceExtract) {
41942 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41944 } else {
41945 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41946 DAG.getVectorIdxConstant(0, DL));
41947 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41948 }
41949 return N; // Return N so it doesn't get rechecked!
41950 }
41951
41952 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41953 // i16. So shrink it ourselves if we can make a broadcast_load.
41954 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41955 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41956 assert(Subtarget.hasAVX2() && "Expected AVX2");
41957 SDValue TruncIn = Src.getOperand(0);
41958
41959 // If this is a truncate of a non extending load we can just narrow it to
41960 // use a broadcast_load.
41961 if (ISD::isNormalLoad(TruncIn.getNode())) {
41962 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41963 // Unless its volatile or atomic.
41964 if (LN->isSimple()) {
41965 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41966 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41967 SDValue BcastLd = DAG.getMemIntrinsicNode(
41968 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41969 LN->getPointerInfo(), LN->getOriginalAlign(),
41970 LN->getMemOperand()->getFlags());
41971 DCI.CombineTo(N.getNode(), BcastLd);
41972 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41973 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41974 return N; // Return N so it doesn't get rechecked!
41975 }
41976 }
41977
41978 // If this is a truncate of an i16 extload, we can directly replace it.
41979 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41980 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41981 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41982 if (LN->getMemoryVT().getSizeInBits() == 16) {
41983 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41984 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41985 SDValue BcastLd =
41987 LN->getMemoryVT(), LN->getMemOperand());
41988 DCI.CombineTo(N.getNode(), BcastLd);
41989 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41990 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41991 return N; // Return N so it doesn't get rechecked!
41992 }
41993 }
41994
41995 // If this is a truncate of load that has been shifted right, we can
41996 // offset the pointer and use a narrower load.
41997 if (TruncIn.getOpcode() == ISD::SRL &&
41998 TruncIn.getOperand(0).hasOneUse() &&
41999 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42000 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42001 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42002 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42003 // Make sure the shift amount and the load size are divisible by 16.
42004 // Don't do this if the load is volatile or atomic.
42005 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42006 LN->isSimple()) {
42007 unsigned Offset = ShiftAmt / 8;
42008 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42011 SDValue Ops[] = { LN->getChain(), Ptr };
42012 SDValue BcastLd = DAG.getMemIntrinsicNode(
42013 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42015 LN->getOriginalAlign(),
42016 LN->getMemOperand()->getFlags());
42017 DCI.CombineTo(N.getNode(), BcastLd);
42018 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42019 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42020 return N; // Return N so it doesn't get rechecked!
42021 }
42022 }
42023 }
42024
42025 // vbroadcast(vzload X) -> vbroadcast_load X
42026 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42027 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
42028 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42029 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42030 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42031 SDValue BcastLd =
42033 LN->getMemoryVT(), LN->getMemOperand());
42034 DCI.CombineTo(N.getNode(), BcastLd);
42035 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42037 return N; // Return N so it doesn't get rechecked!
42038 }
42039 }
42040
42041 // vbroadcast(vector load X) -> vbroadcast_load
42042 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
42043 SrcVT == MVT::v4i32) &&
42044 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42045 LoadSDNode *LN = cast<LoadSDNode>(Src);
42046 // Unless the load is volatile or atomic.
42047 if (LN->isSimple()) {
42048 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42049 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42050 SDValue BcastLd = DAG.getMemIntrinsicNode(
42051 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42052 LN->getPointerInfo(), LN->getOriginalAlign(),
42053 LN->getMemOperand()->getFlags());
42054 DCI.CombineTo(N.getNode(), BcastLd);
42055 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42057 return N; // Return N so it doesn't get rechecked!
42058 }
42059 }
42060
42061 return SDValue();
42062 }
42063 case X86ISD::VZEXT_MOVL: {
42064 SDValue N0 = N.getOperand(0);
42065
42066 // If this a vzmovl of a full vector load, replace it with a vzload, unless
42067 // the load is volatile.
42068 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42069 auto *LN = cast<LoadSDNode>(N0);
42070 if (SDValue VZLoad =
42071 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42072 DCI.CombineTo(N.getNode(), VZLoad);
42073 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42075 return N;
42076 }
42077 }
42078
42079 // If this a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42080 // and can just use a VZEXT_LOAD.
42081 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42082 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42083 auto *LN = cast<MemSDNode>(N0);
42084 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42085 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42086 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42087 SDValue VZLoad =
42089 LN->getMemoryVT(), LN->getMemOperand());
42090 DCI.CombineTo(N.getNode(), VZLoad);
42091 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42093 return N;
42094 }
42095 }
42096
42097 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42098 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42099 // if the upper bits of the i64 are zero.
42100 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42101 N0.getOperand(0).hasOneUse() &&
42102 N0.getOperand(0).getValueType() == MVT::i64) {
42103 SDValue In = N0.getOperand(0);
42104 APInt Mask = APInt::getHighBitsSet(64, 32);
42105 if (DAG.MaskedValueIsZero(In, Mask)) {
42106 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42107 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42108 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42109 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42110 return DAG.getBitcast(VT, Movl);
42111 }
42112 }
42113
42114 // Load a scalar integer constant directly to XMM instead of transferring an
42115 // immediate value from GPR.
42116 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42117 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42118 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42119 // Create a vector constant - scalar constant followed by zeros.
42120 EVT ScalarVT = N0.getOperand(0).getValueType();
42121 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42122 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42123 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42124 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42125
42126 // Load the vector constant from constant pool.
42127 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42128 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42129 MachinePointerInfo MPI =
42131 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42132 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42134 }
42135 }
42136
42137 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42138 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42139 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42140 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42141 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42143
42144 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42145 isNullConstant(V.getOperand(2))) {
42146 SDValue In = V.getOperand(1);
42148 In.getValueSizeInBits() /
42149 VT.getScalarSizeInBits());
42150 In = DAG.getBitcast(SubVT, In);
42151 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42152 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42153 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42154 V.getOperand(2));
42155 }
42156 }
42157
42158 return SDValue();
42159 }
42160 case X86ISD::BLENDI: {
42161 SDValue N0 = N.getOperand(0);
42162 SDValue N1 = N.getOperand(1);
42163 unsigned EltBits = VT.getScalarSizeInBits();
42164
42165 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42166 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42167 // TODO: Handle MVT::v16i16 repeated blend mask.
42168 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42169 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42170 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42171 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42172 unsigned NewSize = SrcVT.getVectorNumElements();
42173 APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(NumElts);
42174 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42175 return DAG.getBitcast(
42176 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42177 N1.getOperand(0),
42178 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42179 DL, MVT::i8)));
42180 }
42181 }
42182 // Share PSHUFB masks:
42183 // blend(pshufb(x,m1),pshufb(y,m2))
42184 // --> m3 = blend(m1,m2)
42185 // blend(pshufb(x,m3),pshufb(y,m3))
42186 if (N0.hasOneUse() && N1.hasOneUse()) {
42187 SmallVector<int> Mask, ByteMask;
42191 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42192 RHS.getOpcode() == X86ISD::PSHUFB &&
42193 LHS.getOperand(1) != RHS.getOperand(1) &&
42194 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42195 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42196 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42197 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42198 "BLENDI decode mismatch");
42199 MVT ShufVT = LHS.getSimpleValueType();
42200 SDValue MaskLHS = LHS.getOperand(1);
42201 SDValue MaskRHS = RHS.getOperand(1);
42202 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42204 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42205 /*HasVariableMask=*/true, DAG, DL, Subtarget)) {
42206 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42207 LHS.getOperand(0), NewMask);
42208 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42209 RHS.getOperand(0), NewMask);
42210 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42211 DAG.getBitcast(VT, NewLHS),
42212 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42213 }
42214 }
42215 }
42216 }
42217 return SDValue();
42218 }
42219 case X86ISD::SHUFP: {
42220 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42221 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42222 // TODO: Support types other than v4f32.
42223 if (VT == MVT::v4f32) {
42224 bool Updated = false;
42225 SmallVector<int> Mask;
42227 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42228 for (int i = 0; i != 2; ++i) {
42229 SmallVector<SDValue> SubOps;
42230 SmallVector<int> SubMask, SubScaledMask;
42231 SDValue Sub = peekThroughBitcasts(Ops[i]);
42232 // TODO: Scaling might be easier if we specify the demanded elts.
42233 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42234 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42235 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42236 int Ofs = i * 2;
42237 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42238 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42239 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42240 Updated = true;
42241 }
42242 }
42243 }
42244 if (Updated) {
42245 for (int &M : Mask)
42246 M %= 4;
42247 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42248 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42249 }
42250 }
42251 return SDValue();
42252 }
42253 case X86ISD::VPERMI: {
42254 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42255 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42256 SDValue N0 = N.getOperand(0);
42257 SDValue N1 = N.getOperand(1);
42258 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42259 if (N0.getOpcode() == ISD::BITCAST &&
42260 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42261 SDValue Src = N0.getOperand(0);
42262 EVT SrcVT = Src.getValueType();
42263 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42264 return DAG.getBitcast(VT, Res);
42265 }
42266 return SDValue();
42267 }
42268 case X86ISD::SHUF128: {
42269 // If we're permuting the upper 256-bits subvectors of a concatenation, then
42270 // see if we can peek through and access the subvector directly.
42271 if (VT.is512BitVector()) {
42272 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
42273 // upper subvector is used.
42274 SDValue LHS = N->getOperand(0);
42275 SDValue RHS = N->getOperand(1);
42276 uint64_t Mask = N->getConstantOperandVal(2);
42277 SmallVector<SDValue> LHSOps, RHSOps;
42278 SDValue NewLHS, NewRHS;
42279 if ((Mask & 0x0A) == 0x0A &&
42280 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42281 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42282 Mask &= ~0x0A;
42283 }
42284 if ((Mask & 0xA0) == 0xA0 &&
42285 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42286 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42287 Mask &= ~0xA0;
42288 }
42289 if (NewLHS || NewRHS)
42290 return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
42291 NewRHS ? NewRHS : RHS,
42292 DAG.getTargetConstant(Mask, DL, MVT::i8));
42293 }
42294 return SDValue();
42295 }
42296 case X86ISD::VPERM2X128: {
42297 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42298 SDValue LHS = N->getOperand(0);
42299 SDValue RHS = N->getOperand(1);
42300 if (LHS.getOpcode() == ISD::BITCAST &&
42301 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42302 EVT SrcVT = LHS.getOperand(0).getValueType();
42303 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42304 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42305 DAG.getBitcast(SrcVT, LHS),
42306 DAG.getBitcast(SrcVT, RHS),
42307 N->getOperand(2)));
42308 }
42309 }
42310
42311 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42313 return Res;
42314
42315 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42316 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42317 auto FindSubVector128 = [&](unsigned Idx) {
42318 if (Idx > 3)
42319 return SDValue();
42320 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42321 SmallVector<SDValue> SubOps;
42322 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42323 return SubOps[Idx & 1];
42324 unsigned NumElts = Src.getValueType().getVectorNumElements();
42325 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42326 Src.getOperand(1).getValueSizeInBits() == 128 &&
42327 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42328 return Src.getOperand(1);
42329 }
42330 return SDValue();
42331 };
42332 unsigned Imm = N.getConstantOperandVal(2);
42333 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42334 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42335 MVT SubVT = VT.getHalfNumVectorElementsVT();
42336 SubLo = DAG.getBitcast(SubVT, SubLo);
42337 SubHi = DAG.getBitcast(SubVT, SubHi);
42338 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42339 }
42340 }
42341 return SDValue();
42342 }
42343 case X86ISD::PSHUFD:
42344 case X86ISD::PSHUFLW:
42345 case X86ISD::PSHUFHW: {
42346 SDValue N0 = N.getOperand(0);
42347 SDValue N1 = N.getOperand(1);
42348 if (N0->hasOneUse()) {
42350 switch (V.getOpcode()) {
42351 case X86ISD::VSHL:
42352 case X86ISD::VSRL:
42353 case X86ISD::VSRA:
42354 case X86ISD::VSHLI:
42355 case X86ISD::VSRLI:
42356 case X86ISD::VSRAI:
42357 case X86ISD::VROTLI:
42358 case X86ISD::VROTRI: {
42359 MVT InnerVT = V.getSimpleValueType();
42360 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42361 SDValue Res = DAG.getNode(Opcode, DL, VT,
42362 DAG.getBitcast(VT, V.getOperand(0)), N1);
42363 Res = DAG.getBitcast(InnerVT, Res);
42364 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42365 return DAG.getBitcast(VT, Res);
42366 }
42367 break;
42368 }
42369 }
42370 }
42371
42372 Mask = getPSHUFShuffleMask(N);
42373 assert(Mask.size() == 4);
42374 break;
42375 }
42376 case X86ISD::MOVSD:
42377 case X86ISD::MOVSH:
42378 case X86ISD::MOVSS: {
42379 SDValue N0 = N.getOperand(0);
42380 SDValue N1 = N.getOperand(1);
42381
42382 // Canonicalize scalar FPOps:
42383 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42384 // If commutable, allow OP(N1[0], N0[0]).
42385 unsigned Opcode1 = N1.getOpcode();
42386 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42387 Opcode1 == ISD::FDIV) {
42388 SDValue N10 = N1.getOperand(0);
42389 SDValue N11 = N1.getOperand(1);
42390 if (N10 == N0 ||
42391 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42392 if (N10 != N0)
42393 std::swap(N10, N11);
42394 MVT SVT = VT.getVectorElementType();
42395 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42396 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42397 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42398 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42399 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42400 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42401 }
42402 }
42403
42404 return SDValue();
42405 }
42406 case X86ISD::INSERTPS: {
42407 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42408 SDValue Op0 = N.getOperand(0);
42409 SDValue Op1 = N.getOperand(1);
42410 unsigned InsertPSMask = N.getConstantOperandVal(2);
42411 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42412 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42413 unsigned ZeroMask = InsertPSMask & 0xF;
42414
42415 // If we zero out all elements from Op0 then we don't need to reference it.
42416 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42417 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42418 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42419
42420 // If we zero out the element from Op1 then we don't need to reference it.
42421 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42422 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42423 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42424
42425 // Attempt to merge insertps Op1 with an inner target shuffle node.
42426 SmallVector<int, 8> TargetMask1;
42428 APInt KnownUndef1, KnownZero1;
42429 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42430 KnownZero1)) {
42431 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42432 // Zero/UNDEF insertion - zero out element and remove dependency.
42433 InsertPSMask |= (1u << DstIdx);
42434 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42435 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42436 }
42437 // Update insertps mask srcidx and reference the source input directly.
42438 int M = TargetMask1[SrcIdx];
42439 assert(0 <= M && M < 8 && "Shuffle index out of range");
42440 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42441 Op1 = Ops1[M < 4 ? 0 : 1];
42442 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42443 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42444 }
42445
42446 // Attempt to merge insertps Op0 with an inner target shuffle node.
42447 SmallVector<int, 8> TargetMask0;
42449 APInt KnownUndef0, KnownZero0;
42450 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42451 KnownZero0)) {
42452 bool Updated = false;
42453 bool UseInput00 = false;
42454 bool UseInput01 = false;
42455 for (int i = 0; i != 4; ++i) {
42456 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42457 // No change if element is already zero or the inserted element.
42458 continue;
42459 }
42460
42461 if (KnownUndef0[i] || KnownZero0[i]) {
42462 // If the target mask is undef/zero then we must zero the element.
42463 InsertPSMask |= (1u << i);
42464 Updated = true;
42465 continue;
42466 }
42467
42468 // The input vector element must be inline.
42469 int M = TargetMask0[i];
42470 if (M != i && M != (i + 4))
42471 return SDValue();
42472
42473 // Determine which inputs of the target shuffle we're using.
42474 UseInput00 |= (0 <= M && M < 4);
42475 UseInput01 |= (4 <= M);
42476 }
42477
42478 // If we're not using both inputs of the target shuffle then use the
42479 // referenced input directly.
42480 if (UseInput00 && !UseInput01) {
42481 Updated = true;
42482 Op0 = Ops0[0];
42483 } else if (!UseInput00 && UseInput01) {
42484 Updated = true;
42485 Op0 = Ops0[1];
42486 }
42487
42488 if (Updated)
42489 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42490 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42491 }
42492
42493 // If we're inserting an element from a vbroadcast load, fold the
42494 // load into the X86insertps instruction. We need to convert the scalar
42495 // load to a vector and clear the source lane of the INSERTPS control.
42496 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42497 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42498 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42499 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42500 MemIntr->getBasePtr(),
42501 MemIntr->getMemOperand());
42502 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42504 Load),
42505 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42506 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42507 return Insert;
42508 }
42509 }
42510
42511 return SDValue();
42512 }
42513 case X86ISD::VPERMV3: {
42514 // Combine VPERMV3 to widened VPERMV if the two source operands can be
42515 // freely concatenated.
42516 if (VT.is128BitVector() ||
42517 (VT.is256BitVector() && Subtarget.useAVX512Regs())) {
42518 SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
42519 MVT WideVT = VT.getDoubleNumVectorElementsVT();
42520 if (SDValue ConcatSrc =
42521 combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) {
42522 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
42523 DL, WideVT.getSizeInBits());
42524 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
42525 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
42526 DAG.getVectorIdxConstant(0, DL));
42527 }
42528 }
42531 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42532 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42533 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
42534 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
42535 MVT MaskVT = N.getOperand(1).getSimpleValueType();
42536 // Canonicalize to VPERMV if both sources are the same.
42537 if (V1 == V2) {
42538 for (int &M : Mask)
42539 M = (M < 0 ? M : M & (Mask.size() - 1));
42540 SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
42541 /*IsMask=*/true);
42542 return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, N.getOperand(0));
42543 }
42544 // If sources are half width, then concat and use VPERMV with adjusted
42545 // mask.
42546 SDValue Ops[2];
42547 MVT HalfVT = VT.getHalfNumVectorElementsVT();
42548 if (sd_match(V1,
42549 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
42550 sd_match(V2,
42551 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
42552 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
42553 if (SDValue ConcatSrc =
42554 combineConcatVectorOps(DL, VT, Ops, DAG, DCI, Subtarget)) {
42555 for (int &M : Mask)
42556 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
42557 SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
42558 /*IsMask=*/true);
42559 return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, ConcatSrc);
42560 }
42561 }
42562 // Commute foldable source to the RHS.
42563 if (isShuffleFoldableLoad(N.getOperand(0)) &&
42564 !isShuffleFoldableLoad(N.getOperand(2))) {
42566 SDValue NewMask =
42567 getConstVector(Mask, MaskVT, DAG, DL, /*IsMask=*/true);
42568 return DAG.getNode(X86ISD::VPERMV3, DL, VT, N.getOperand(2), NewMask,
42569 N.getOperand(0));
42570 }
42571 }
42572 return SDValue();
42573 }
42574 default:
42575 return SDValue();
42576 }
42577
42578 // Nuke no-op shuffles that show up after combining.
42579 if (isNoopShuffleMask(Mask))
42580 return N.getOperand(0);
42581
42582 // Look for simplifications involving one or two shuffle instructions.
42583 SDValue V = N.getOperand(0);
42584 switch (N.getOpcode()) {
42585 default:
42586 break;
42587 case X86ISD::PSHUFLW:
42588 case X86ISD::PSHUFHW:
42589 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42590
42591 // See if this reduces to a PSHUFD which is no more expensive and can
42592 // combine with more operations. Note that it has to at least flip the
42593 // dwords as otherwise it would have been removed as a no-op.
42594 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
42595 int DMask[] = {0, 1, 2, 3};
42596 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42597 DMask[DOffset + 0] = DOffset + 1;
42598 DMask[DOffset + 1] = DOffset + 0;
42599 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
42600 V = DAG.getBitcast(DVT, V);
42601 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42602 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42603 return DAG.getBitcast(VT, V);
42604 }
42605
42606 // Look for shuffle patterns which can be implemented as a single unpack.
42607 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42608 // only works when we have a PSHUFD followed by two half-shuffles.
42609 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42610 (V.getOpcode() == X86ISD::PSHUFLW ||
42611 V.getOpcode() == X86ISD::PSHUFHW) &&
42612 V.getOpcode() != N.getOpcode() &&
42613 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42614 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42615 if (D.getOpcode() == X86ISD::PSHUFD) {
42618 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42619 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42620 int WordMask[8];
42621 for (int i = 0; i < 4; ++i) {
42622 WordMask[i + NOffset] = Mask[i] + NOffset;
42623 WordMask[i + VOffset] = VMask[i] + VOffset;
42624 }
42625 // Map the word mask through the DWord mask.
42626 int MappedMask[8];
42627 for (int i = 0; i < 8; ++i)
42628 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42629 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42630 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42631 // We can replace all three shuffles with an unpack.
42632 V = DAG.getBitcast(VT, D.getOperand(0));
42633 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42635 DL, VT, V, V);
42636 }
42637 }
42638 }
42639
42640 break;
42641
42642 case X86ISD::PSHUFD:
42643 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
42644 return NewN;
42645
42646 break;
42647 }
42648
42649 return SDValue();
42650}
42651
42652/// Checks if the shuffle mask takes subsequent elements
42653/// alternately from two vectors.
42654/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
42655static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42656
42657 int ParitySrc[2] = {-1, -1};
42658 unsigned Size = Mask.size();
42659 for (unsigned i = 0; i != Size; ++i) {
42660 int M = Mask[i];
42661 if (M < 0)
42662 continue;
42663
42664 // Make sure we are using the matching element from the input.
42665 if ((M % Size) != i)
42666 return false;
42667
42668 // Make sure we use the same input for all elements of the same parity.
42669 int Src = M / Size;
42670 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42671 return false;
42672 ParitySrc[i % 2] = Src;
42673 }
42674
42675 // Make sure each input is used.
42676 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42677 return false;
42678
42679 Op0Even = ParitySrc[0] == 0;
42680 return true;
42681}
42682
42683/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
42684/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
42685/// are written to the parameters \p Opnd0 and \p Opnd1.
42686///
42687/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
42688/// so it is easier to generically match. We also insert dummy vector shuffle
42689/// nodes for the operands which explicitly discard the lanes which are unused
42690/// by this operation to try to flow through the rest of the combiner
42691/// the fact that they're unused.
42692static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42693 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42694 bool &IsSubAdd) {
42695
42696 EVT VT = N->getValueType(0);
42697 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42698 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42700 return false;
42701
42702 // We only handle target-independent shuffles.
42703 // FIXME: It would be easy and harmless to use the target shuffle mask
42704 // extraction tool to support more.
42705 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42706 return false;
42707
42708 SDValue V1 = N->getOperand(0);
42709 SDValue V2 = N->getOperand(1);
42710
42711 // Make sure we have an FADD and an FSUB.
42712 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42713 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42714 V1.getOpcode() == V2.getOpcode())
42715 return false;
42716
42717 // If there are other uses of these operations we can't fold them.
42718 if (!V1->hasOneUse() || !V2->hasOneUse())
42719 return false;
42720
42721 // Ensure that both operations have the same operands. Note that we can
42722 // commute the FADD operands.
42723 SDValue LHS, RHS;
42724 if (V1.getOpcode() == ISD::FSUB) {
42725 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42726 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42727 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42728 return false;
42729 } else {
42730 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42731 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42732 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42733 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42734 return false;
42735 }
42736
42737 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42738 bool Op0Even;
42739 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42740 return false;
42741
42742 // It's a subadd if the vector in the even parity is an FADD.
42743 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42744 : V2->getOpcode() == ISD::FADD;
42745
42746 Opnd0 = LHS;
42747 Opnd1 = RHS;
42748 return true;
42749}
42750
42751/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42753 const X86Subtarget &Subtarget,
42754 SelectionDAG &DAG) {
42755 // We only handle target-independent shuffles.
42756 // FIXME: It would be easy and harmless to use the target shuffle mask
42757 // extraction tool to support more.
42758 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42759 return SDValue();
42760
42761 MVT VT = N->getSimpleValueType(0);
42762 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42763 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42764 return SDValue();
42765
42766 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c).
42767 SDValue Op0 = N->getOperand(0);
42768 SDValue Op1 = N->getOperand(1);
42769 SDValue FMAdd = Op0, FMSub = Op1;
42770 if (FMSub.getOpcode() != X86ISD::FMSUB)
42771 std::swap(FMAdd, FMSub);
42772
42773 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42774 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42775 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42776 FMAdd.getOperand(2) != FMSub.getOperand(2))
42777 return SDValue();
42778
42779 // Check for correct shuffle mask.
42780 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42781 bool Op0Even;
42782 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42783 return SDValue();
42784
42785 // FMAddSub takes zeroth operand from FMSub node.
42786 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42787 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42788 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42789 FMAdd.getOperand(2));
42790}
42791
42792/// Try to combine a shuffle into a target-specific add-sub or
42793/// mul-add-sub node.
42795 const X86Subtarget &Subtarget,
42796 SelectionDAG &DAG) {
42797 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
42798 return V;
42799
42800 SDValue Opnd0, Opnd1;
42801 bool IsSubAdd;
42802 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42803 return SDValue();
42804
42805 MVT VT = N->getSimpleValueType(0);
42806
42807 // Try to generate X86ISD::FMADDSUB node here.
42808 SDValue Opnd2;
42809 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42810 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42811 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42812 }
42813
42814 if (IsSubAdd)
42815 return SDValue();
42816
42817 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42818 // the ADDSUB idiom has been successfully recognized. There are no known
42819 // X86 targets with 512-bit ADDSUB instructions!
42820 if (VT.is512BitVector())
42821 return SDValue();
42822
42823 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42824 // the ADDSUB idiom has been successfully recognized. There are no known
42825 // X86 targets with FP16 ADDSUB instructions!
42826 if (VT.getVectorElementType() == MVT::f16)
42827 return SDValue();
42828
42829 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42830}
42831
42832// We are looking for a shuffle where both sources are concatenated with undef
42833// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42834// if we can express this as a single-source shuffle, that's preferable.
42836 SelectionDAG &DAG,
42837 const X86Subtarget &Subtarget) {
42838 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42839 return SDValue();
42840
42841 EVT VT = N->getValueType(0);
42842
42843 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42844 if (!VT.is128BitVector() && !VT.is256BitVector())
42845 return SDValue();
42846
42847 if (VT.getVectorElementType() != MVT::i32 &&
42848 VT.getVectorElementType() != MVT::i64 &&
42849 VT.getVectorElementType() != MVT::f32 &&
42850 VT.getVectorElementType() != MVT::f64)
42851 return SDValue();
42852
42853 SDValue N0 = N->getOperand(0);
42854 SDValue N1 = N->getOperand(1);
42855
42856 // Check that both sources are concats with undef.
42857 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42858 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42859 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42860 !N1.getOperand(1).isUndef())
42861 return SDValue();
42862
42863 // Construct the new shuffle mask. Elements from the first source retain their
42864 // index, but elements from the second source no longer need to skip an undef.
42866 int NumElts = VT.getVectorNumElements();
42867
42868 auto *SVOp = cast<ShuffleVectorSDNode>(N);
42869 for (int Elt : SVOp->getMask())
42870 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42871
42873 N1.getOperand(0));
42874 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42875}
42876
42877/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42878/// low half of each source vector and does not set any high half elements in
42879/// the destination vector, narrow the shuffle to half its original size.
42881 EVT VT = Shuf->getValueType(0);
42882 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
42883 return SDValue();
42884 if (!VT.is256BitVector() && !VT.is512BitVector())
42885 return SDValue();
42886
42887 // See if we can ignore all of the high elements of the shuffle.
42888 ArrayRef<int> Mask = Shuf->getMask();
42889 if (!isUndefUpperHalf(Mask))
42890 return SDValue();
42891
42892 // Check if the shuffle mask accesses only the low half of each input vector
42893 // (half-index output is 0 or 2).
42894 int HalfIdx1, HalfIdx2;
42895 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42896 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42897 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42898 return SDValue();
42899
42900 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42901 // The trick is knowing that all of the insert/extract are actually free
42902 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42903 // of narrow inputs into a narrow output, and that is always cheaper than
42904 // the wide shuffle that we started with.
42905 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42906 Shuf->getOperand(1), HalfMask, HalfIdx1,
42907 HalfIdx2, false, DAG, /*UseConcat*/ true);
42908}
42909
42912 const X86Subtarget &Subtarget) {
42913 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42914 if (SDValue V = narrowShuffle(Shuf, DAG))
42915 return V;
42916
42917 // If we have legalized the vector types, look for blends of FADD and FSUB
42918 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42919 SDLoc dl(N);
42920 EVT VT = N->getValueType(0);
42921 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42922 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
42923 if (SDValue AddSub =
42924 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
42925 return AddSub;
42926
42927 // Attempt to combine into a vector load/broadcast.
42929 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42930 return LD;
42931
42932 // For AVX2, we sometimes want to combine
42933 // (vector_shuffle <mask> (concat_vectors t1, undef)
42934 // (concat_vectors t2, undef))
42935 // Into:
42936 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42937 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42938 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
42939 return ShufConcat;
42940
42941 if (isTargetShuffle(N->getOpcode())) {
42942 SDValue Op(N, 0);
42943 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
42944 return Shuffle;
42945
42946 // Try recursively combining arbitrary sequences of x86 shuffle
42947 // instructions into higher-order shuffles. We do this after combining
42948 // specific PSHUF instruction sequences into their minimal form so that we
42949 // can evaluate how many specialized shuffle instructions are involved in
42950 // a particular chain.
42951 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42952 return Res;
42953
42954 // Simplify source operands based on shuffle mask.
42955 // TODO - merge this into combineX86ShufflesRecursively.
42956 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42957 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42958 return SDValue(N, 0);
42959
42960 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42961 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42962 // Perform this after other shuffle combines to allow inner shuffles to be
42963 // combined away first.
42964 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
42965 return BinOp;
42966 }
42967
42968 return SDValue();
42969}
42970
42971// Simplify variable target shuffle masks based on the demanded elements.
42972// TODO: Handle DemandedBits in mask indices as well?
42974 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42975 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42976 // If we're demanding all elements don't bother trying to simplify the mask.
42977 unsigned NumElts = DemandedElts.getBitWidth();
42978 if (DemandedElts.isAllOnes())
42979 return false;
42980
42981 SDValue Mask = Op.getOperand(MaskIndex);
42982 if (!Mask.hasOneUse())
42983 return false;
42984
42985 // Attempt to generically simplify the variable shuffle mask.
42986 APInt MaskUndef, MaskZero;
42987 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42988 Depth + 1))
42989 return true;
42990
42991 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42992 // TODO: Support other types from getTargetShuffleMaskIndices?
42994 EVT BCVT = BC.getValueType();
42995 auto *Load = dyn_cast<LoadSDNode>(BC);
42996 if (!Load || !Load->getBasePtr().hasOneUse())
42997 return false;
42998
42999 const Constant *C = getTargetConstantFromNode(Load);
43000 if (!C)
43001 return false;
43002
43003 Type *CTy = C->getType();
43004 if (!CTy->isVectorTy() ||
43005 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43006 return false;
43007
43008 // Handle scaling for i64 elements on 32-bit targets.
43009 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43010 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43011 return false;
43012 unsigned Scale = NumCstElts / NumElts;
43013
43014 // Simplify mask if we have an undemanded element that is not undef.
43015 bool Simplified = false;
43016 SmallVector<Constant *, 32> ConstVecOps;
43017 for (unsigned i = 0; i != NumCstElts; ++i) {
43018 Constant *Elt = C->getAggregateElement(i);
43019 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43020 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43021 Simplified = true;
43022 continue;
43023 }
43024 ConstVecOps.push_back(Elt);
43025 }
43026 if (!Simplified)
43027 return false;
43028
43029 // Generate new constant pool entry + legalize immediately for the load.
43030 SDLoc DL(Op);
43031 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43032 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43033 SDValue NewMask = TLO.DAG.getLoad(
43034 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43036 Load->getAlign());
43037 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43038}
43039
43041 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43042 TargetLoweringOpt &TLO, unsigned Depth) const {
43043 int NumElts = DemandedElts.getBitWidth();
43044 unsigned Opc = Op.getOpcode();
43045 EVT VT = Op.getValueType();
43046
43047 // Handle special case opcodes.
43048 switch (Opc) {
43049 case X86ISD::PMULDQ:
43050 case X86ISD::PMULUDQ: {
43051 APInt LHSUndef, LHSZero;
43052 APInt RHSUndef, RHSZero;
43053 SDValue LHS = Op.getOperand(0);
43054 SDValue RHS = Op.getOperand(1);
43055 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43056 Depth + 1))
43057 return true;
43058 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43059 Depth + 1))
43060 return true;
43061 // Multiply by zero.
43062 KnownZero = LHSZero | RHSZero;
43063 break;
43064 }
43065 case X86ISD::VPMADDUBSW:
43066 case X86ISD::VPMADDWD: {
43067 APInt LHSUndef, LHSZero;
43068 APInt RHSUndef, RHSZero;
43069 SDValue LHS = Op.getOperand(0);
43070 SDValue RHS = Op.getOperand(1);
43071 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43072
43073 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43074 Depth + 1))
43075 return true;
43076 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43077 Depth + 1))
43078 return true;
43079
43080 // TODO: Multiply by zero.
43081
43082 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43083 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43084 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43085 Depth + 1))
43086 return true;
43087 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43088 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43089 Depth + 1))
43090 return true;
43091 break;
43092 }
43093 case X86ISD::PSADBW: {
43094 SDValue LHS = Op.getOperand(0);
43095 SDValue RHS = Op.getOperand(1);
43096 assert(VT.getScalarType() == MVT::i64 &&
43097 LHS.getValueType() == RHS.getValueType() &&
43098 LHS.getValueType().getScalarType() == MVT::i8 &&
43099 "Unexpected PSADBW types");
43100
43101 // Aggressively peek through ops to get at the demanded elts.
43102 if (!DemandedElts.isAllOnes()) {
43103 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43104 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43106 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43108 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43109 if (NewLHS || NewRHS) {
43110 NewLHS = NewLHS ? NewLHS : LHS;
43111 NewRHS = NewRHS ? NewRHS : RHS;
43112 return TLO.CombineTo(
43113 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43114 }
43115 }
43116 break;
43117 }
43118 case X86ISD::VSHL:
43119 case X86ISD::VSRL:
43120 case X86ISD::VSRA: {
43121 // We only need the bottom 64-bits of the (128-bit) shift amount.
43122 SDValue Amt = Op.getOperand(1);
43123 MVT AmtVT = Amt.getSimpleValueType();
43124 assert(AmtVT.is128BitVector() && "Unexpected value type");
43125
43126 // If we reuse the shift amount just for sse shift amounts then we know that
43127 // only the bottom 64-bits are only ever used.
43128 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43129 unsigned UseOpc = Use->getOpcode();
43130 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43131 UseOpc == X86ISD::VSRA) &&
43132 Use->getOperand(0) != Amt;
43133 });
43134
43135 APInt AmtUndef, AmtZero;
43136 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43137 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43138 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43139 Depth + 1, AssumeSingleUse))
43140 return true;
43141 [[fallthrough]];
43142 }
43143 case X86ISD::VSHLI:
43144 case X86ISD::VSRLI:
43145 case X86ISD::VSRAI: {
43146 SDValue Src = Op.getOperand(0);
43147 APInt SrcUndef;
43148 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43149 Depth + 1))
43150 return true;
43151
43152 // Fold shift(0,x) -> 0
43153 if (DemandedElts.isSubsetOf(KnownZero))
43154 return TLO.CombineTo(
43155 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43156
43157 // Aggressively peek through ops to get at the demanded elts.
43158 if (!DemandedElts.isAllOnes())
43160 Src, DemandedElts, TLO.DAG, Depth + 1))
43161 return TLO.CombineTo(
43162 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43163 break;
43164 }
43165 case X86ISD::VPSHA:
43166 case X86ISD::VPSHL:
43167 case X86ISD::VSHLV:
43168 case X86ISD::VSRLV:
43169 case X86ISD::VSRAV: {
43170 APInt LHSUndef, LHSZero;
43171 APInt RHSUndef, RHSZero;
43172 SDValue LHS = Op.getOperand(0);
43173 SDValue RHS = Op.getOperand(1);
43174 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43175 Depth + 1))
43176 return true;
43177
43178 // Fold shift(0,x) -> 0
43179 if (DemandedElts.isSubsetOf(LHSZero))
43180 return TLO.CombineTo(
43181 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43182
43183 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43184 Depth + 1))
43185 return true;
43186
43187 KnownZero = LHSZero;
43188 break;
43189 }
43190 case X86ISD::PCMPEQ:
43191 case X86ISD::PCMPGT: {
43192 APInt LHSUndef, LHSZero;
43193 APInt RHSUndef, RHSZero;
43194 SDValue LHS = Op.getOperand(0);
43195 SDValue RHS = Op.getOperand(1);
43196 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43197 Depth + 1))
43198 return true;
43199 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43200 Depth + 1))
43201 return true;
43202 break;
43203 }
43204 case X86ISD::KSHIFTL: {
43205 SDValue Src = Op.getOperand(0);
43206 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43207 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43208 unsigned ShiftAmt = Amt->getZExtValue();
43209
43210 if (ShiftAmt == 0)
43211 return TLO.CombineTo(Op, Src);
43212
43213 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43214 // single shift. We can do this if the bottom bits (which are shifted
43215 // out) are never demanded.
43216 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43217 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43218 unsigned C1 = Src.getConstantOperandVal(1);
43219 unsigned NewOpc = X86ISD::KSHIFTL;
43220 int Diff = ShiftAmt - C1;
43221 if (Diff < 0) {
43222 Diff = -Diff;
43223 NewOpc = X86ISD::KSHIFTR;
43224 }
43225
43226 SDLoc dl(Op);
43227 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43228 return TLO.CombineTo(
43229 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43230 }
43231 }
43232
43233 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43234 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43235 Depth + 1))
43236 return true;
43237
43238 KnownUndef <<= ShiftAmt;
43239 KnownZero <<= ShiftAmt;
43240 KnownZero.setLowBits(ShiftAmt);
43241 break;
43242 }
43243 case X86ISD::KSHIFTR: {
43244 SDValue Src = Op.getOperand(0);
43245 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43246 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43247 unsigned ShiftAmt = Amt->getZExtValue();
43248
43249 if (ShiftAmt == 0)
43250 return TLO.CombineTo(Op, Src);
43251
43252 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43253 // single shift. We can do this if the top bits (which are shifted
43254 // out) are never demanded.
43255 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43256 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43257 unsigned C1 = Src.getConstantOperandVal(1);
43258 unsigned NewOpc = X86ISD::KSHIFTR;
43259 int Diff = ShiftAmt - C1;
43260 if (Diff < 0) {
43261 Diff = -Diff;
43262 NewOpc = X86ISD::KSHIFTL;
43263 }
43264
43265 SDLoc dl(Op);
43266 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43267 return TLO.CombineTo(
43268 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43269 }
43270 }
43271
43272 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43273 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43274 Depth + 1))
43275 return true;
43276
43277 KnownUndef.lshrInPlace(ShiftAmt);
43278 KnownZero.lshrInPlace(ShiftAmt);
43279 KnownZero.setHighBits(ShiftAmt);
43280 break;
43281 }
43282 case X86ISD::ANDNP: {
43283 // ANDNP = (~LHS & RHS);
43284 SDValue LHS = Op.getOperand(0);
43285 SDValue RHS = Op.getOperand(1);
43286
43287 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43288 APInt UndefElts;
43289 SmallVector<APInt> EltBits;
43290 int NumElts = VT.getVectorNumElements();
43291 int EltSizeInBits = VT.getScalarSizeInBits();
43292 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43293 APInt OpElts = DemandedElts;
43294 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43295 EltBits)) {
43296 OpBits.clearAllBits();
43297 OpElts.clearAllBits();
43298 for (int I = 0; I != NumElts; ++I) {
43299 if (!DemandedElts[I])
43300 continue;
43301 if (UndefElts[I]) {
43302 // We can't assume an undef src element gives an undef dst - the
43303 // other src might be zero.
43304 OpBits.setAllBits();
43305 OpElts.setBit(I);
43306 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43307 (!Invert && !EltBits[I].isZero())) {
43308 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43309 OpElts.setBit(I);
43310 }
43311 }
43312 }
43313 return std::make_pair(OpBits, OpElts);
43314 };
43315 APInt BitsLHS, EltsLHS;
43316 APInt BitsRHS, EltsRHS;
43317 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43318 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43319
43320 APInt LHSUndef, LHSZero;
43321 APInt RHSUndef, RHSZero;
43322 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43323 Depth + 1))
43324 return true;
43325 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43326 Depth + 1))
43327 return true;
43328
43329 if (!DemandedElts.isAllOnes()) {
43330 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43331 TLO.DAG, Depth + 1);
43332 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43333 TLO.DAG, Depth + 1);
43334 if (NewLHS || NewRHS) {
43335 NewLHS = NewLHS ? NewLHS : LHS;
43336 NewRHS = NewRHS ? NewRHS : RHS;
43337 return TLO.CombineTo(
43338 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43339 }
43340 }
43341 break;
43342 }
43343 case X86ISD::CVTSI2P:
43344 case X86ISD::CVTUI2P:
43345 case X86ISD::CVTPH2PS:
43346 case X86ISD::CVTPS2PH: {
43347 SDValue Src = Op.getOperand(0);
43348 EVT SrcVT = Src.getValueType();
43349 APInt SrcUndef, SrcZero;
43350 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43351 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43352 Depth + 1))
43353 return true;
43354 break;
43355 }
43356 case X86ISD::PACKSS:
43357 case X86ISD::PACKUS: {
43358 SDValue N0 = Op.getOperand(0);
43359 SDValue N1 = Op.getOperand(1);
43360
43361 APInt DemandedLHS, DemandedRHS;
43362 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43363
43364 APInt LHSUndef, LHSZero;
43365 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43366 Depth + 1))
43367 return true;
43368 APInt RHSUndef, RHSZero;
43369 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43370 Depth + 1))
43371 return true;
43372
43373 // TODO - pass on known zero/undef.
43374
43375 // Aggressively peek through ops to get at the demanded elts.
43376 // TODO - we should do this for all target/faux shuffles ops.
43377 if (!DemandedElts.isAllOnes()) {
43378 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43379 TLO.DAG, Depth + 1);
43380 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43381 TLO.DAG, Depth + 1);
43382 if (NewN0 || NewN1) {
43383 NewN0 = NewN0 ? NewN0 : N0;
43384 NewN1 = NewN1 ? NewN1 : N1;
43385 return TLO.CombineTo(Op,
43386 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43387 }
43388 }
43389 break;
43390 }
43391 case X86ISD::HADD:
43392 case X86ISD::HSUB:
43393 case X86ISD::FHADD:
43394 case X86ISD::FHSUB: {
43395 SDValue N0 = Op.getOperand(0);
43396 SDValue N1 = Op.getOperand(1);
43397
43398 APInt DemandedLHS, DemandedRHS;
43399 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43400
43401 APInt LHSUndef, LHSZero;
43402 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43403 Depth + 1))
43404 return true;
43405 APInt RHSUndef, RHSZero;
43406 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43407 Depth + 1))
43408 return true;
43409
43410 // TODO - pass on known zero/undef.
43411
43412 // Aggressively peek through ops to get at the demanded elts.
43413 // TODO: Handle repeated operands.
43414 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43415 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43416 TLO.DAG, Depth + 1);
43417 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43418 TLO.DAG, Depth + 1);
43419 if (NewN0 || NewN1) {
43420 NewN0 = NewN0 ? NewN0 : N0;
43421 NewN1 = NewN1 ? NewN1 : N1;
43422 return TLO.CombineTo(Op,
43423 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43424 }
43425 }
43426 break;
43427 }
43428 case X86ISD::VTRUNC:
43429 case X86ISD::VTRUNCS:
43430 case X86ISD::VTRUNCUS: {
43431 SDValue Src = Op.getOperand(0);
43432 MVT SrcVT = Src.getSimpleValueType();
43433 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43434 APInt SrcUndef, SrcZero;
43435 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43436 Depth + 1))
43437 return true;
43438 KnownZero = SrcZero.zextOrTrunc(NumElts);
43439 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43440 break;
43441 }
43442 case X86ISD::BLENDI: {
43443 SmallVector<int, 16> BlendMask;
43444 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43446 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43447 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43448 return TLO.CombineTo(Op, R);
43449 break;
43450 }
43451 case X86ISD::BLENDV: {
43452 APInt SelUndef, SelZero;
43453 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43454 SelZero, TLO, Depth + 1))
43455 return true;
43456
43457 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43458 APInt LHSUndef, LHSZero;
43459 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43460 LHSZero, TLO, Depth + 1))
43461 return true;
43462
43463 APInt RHSUndef, RHSZero;
43464 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43465 RHSZero, TLO, Depth + 1))
43466 return true;
43467
43468 KnownZero = LHSZero & RHSZero;
43469 KnownUndef = LHSUndef & RHSUndef;
43470 break;
43471 }
43472 case X86ISD::VZEXT_MOVL: {
43473 // If upper demanded elements are already zero then we have nothing to do.
43474 SDValue Src = Op.getOperand(0);
43475 APInt DemandedUpperElts = DemandedElts;
43476 DemandedUpperElts.clearLowBits(1);
43477 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43478 return TLO.CombineTo(Op, Src);
43479 break;
43480 }
43481 case X86ISD::VZEXT_LOAD: {
43482 // If upper demanded elements are not demanded then simplify to a
43483 // scalar_to_vector(load()).
43485 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43486 SDLoc DL(Op);
43487 auto *Mem = cast<MemSDNode>(Op);
43488 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43489 Mem->getMemOperand());
43490 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43491 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43492 }
43493 break;
43494 }
43495 case X86ISD::VBROADCAST: {
43496 SDValue Src = Op.getOperand(0);
43497 MVT SrcVT = Src.getSimpleValueType();
43498 // Don't bother broadcasting if we just need the 0'th element.
43499 if (DemandedElts == 1) {
43500 if (!SrcVT.isVector())
43501 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
43502 else if (Src.getValueType() != VT)
43503 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43504 SDLoc(Op));
43505 return TLO.CombineTo(Op, Src);
43506 }
43507 if (!SrcVT.isVector())
43508 break;
43509 APInt SrcUndef, SrcZero;
43510 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43511 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43512 Depth + 1))
43513 return true;
43514 // Aggressively peek through src to get at the demanded elt.
43515 // TODO - we should do this for all target/faux shuffles ops.
43517 Src, SrcElts, TLO.DAG, Depth + 1))
43518 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43519 break;
43520 }
43521 case X86ISD::VPERMV:
43522 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43523 Depth))
43524 return true;
43525 break;
43526 case X86ISD::PSHUFB:
43527 case X86ISD::VPERMV3:
43528 case X86ISD::VPERMILPV:
43529 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43530 Depth))
43531 return true;
43532 break;
43533 case X86ISD::VPPERM:
43534 case X86ISD::VPERMIL2:
43535 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43536 Depth))
43537 return true;
43538 break;
43539 }
43540
43541 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43542 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43543 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43544 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43545 DemandedElts.lshr(NumElts / 2) == 0) {
43546 unsigned SizeInBits = VT.getSizeInBits();
43547 unsigned ExtSizeInBits = SizeInBits / 2;
43548
43549 // See if 512-bit ops only use the bottom 128-bits.
43550 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43551 ExtSizeInBits = SizeInBits / 4;
43552
43553 switch (Opc) {
43554 // Scalar broadcast.
43555 case X86ISD::VBROADCAST: {
43556 SDLoc DL(Op);
43557 SDValue Src = Op.getOperand(0);
43558 if (Src.getValueSizeInBits() > ExtSizeInBits)
43559 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43560 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43561 ExtSizeInBits / VT.getScalarSizeInBits());
43562 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43563 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43564 TLO.DAG, DL, ExtSizeInBits));
43565 }
43567 SDLoc DL(Op);
43568 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43569 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43570 ExtSizeInBits / VT.getScalarSizeInBits());
43571 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43572 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43573 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43574 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43575 MemIntr->getMemOperand());
43577 Bcst.getValue(1));
43578 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43579 TLO.DAG, DL, ExtSizeInBits));
43580 }
43581 // Subvector broadcast.
43583 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43584 EVT MemVT = MemIntr->getMemoryVT();
43585 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43586 SDLoc DL(Op);
43587 SDValue Ld =
43588 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43589 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43591 Ld.getValue(1));
43592 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43593 TLO.DAG, DL, ExtSizeInBits));
43594 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43595 SDLoc DL(Op);
43596 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43597 ExtSizeInBits / VT.getScalarSizeInBits());
43598 if (SDValue BcstLd =
43599 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43600 return TLO.CombineTo(Op,
43601 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43602 TLO.DAG, DL, ExtSizeInBits));
43603 }
43604 break;
43605 }
43606 // Byte shifts by immediate.
43607 case X86ISD::VSHLDQ:
43608 case X86ISD::VSRLDQ:
43609 // Shift by uniform.
43610 case X86ISD::VSHL:
43611 case X86ISD::VSRL:
43612 case X86ISD::VSRA:
43613 // Shift by immediate.
43614 case X86ISD::VSHLI:
43615 case X86ISD::VSRLI:
43616 case X86ISD::VSRAI: {
43617 SDLoc DL(Op);
43618 SDValue Ext0 =
43619 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43620 SDValue ExtOp =
43621 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43622 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43623 SDValue Insert =
43624 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43625 return TLO.CombineTo(Op, Insert);
43626 }
43627 case X86ISD::VPERMI: {
43628 // Simplify PERMPD/PERMQ to extract_subvector.
43629 // TODO: This should be done in shuffle combining.
43630 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43632 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43633 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43634 SDLoc DL(Op);
43635 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43636 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43637 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43638 return TLO.CombineTo(Op, Insert);
43639 }
43640 }
43641 break;
43642 }
43643 case X86ISD::VPERM2X128: {
43644 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43645 SDLoc DL(Op);
43646 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
43647 if (LoMask & 0x8)
43648 return TLO.CombineTo(
43649 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43650 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43651 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43652 SDValue ExtOp =
43653 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43654 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43655 SDValue Insert =
43656 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43657 return TLO.CombineTo(Op, Insert);
43658 }
43659 // Conversions.
43660 // TODO: Add more CVT opcodes when we have test coverage.
43661 case X86ISD::CVTTP2SI:
43662 case X86ISD::CVTTP2UI:
43663 case X86ISD::CVTPH2PS: {
43664 SDLoc DL(Op);
43665 unsigned Scale = SizeInBits / ExtSizeInBits;
43666 SDValue SrcOp = Op.getOperand(0);
43667 MVT SrcVT = SrcOp.getSimpleValueType();
43668 unsigned SrcExtSize =
43669 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
43671 ExtSizeInBits / VT.getScalarSizeInBits());
43672 SDValue ExtOp = TLO.DAG.getNode(
43673 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
43674 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43675 SDValue Insert =
43676 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43677 return TLO.CombineTo(Op, Insert);
43678 }
43679 // Zero upper elements.
43680 case X86ISD::VZEXT_MOVL:
43681 // Variable blend.
43682 case X86ISD::BLENDV:
43683 // Target unary shuffles by immediate:
43684 case X86ISD::PSHUFD:
43685 case X86ISD::PSHUFLW:
43686 case X86ISD::PSHUFHW:
43687 case X86ISD::VPERMILPI:
43688 // (Non-Lane Crossing) Target Shuffles.
43689 case X86ISD::VPERMILPV:
43690 case X86ISD::VPERMIL2:
43691 case X86ISD::PSHUFB:
43692 case X86ISD::UNPCKL:
43693 case X86ISD::UNPCKH:
43694 case X86ISD::BLENDI:
43695 // Integer ops.
43696 case X86ISD::PACKSS:
43697 case X86ISD::PACKUS:
43698 case X86ISD::PCMPEQ:
43699 case X86ISD::PCMPGT:
43700 case X86ISD::PMULUDQ:
43701 case X86ISD::PMULDQ:
43702 case X86ISD::VSHLV:
43703 case X86ISD::VSRLV:
43704 case X86ISD::VSRAV:
43705 // Float ops.
43706 case X86ISD::FMAX:
43707 case X86ISD::FMIN:
43708 case X86ISD::FMAXC:
43709 case X86ISD::FMINC:
43710 case X86ISD::FRSQRT:
43711 case X86ISD::FRCP:
43712 // Horizontal Ops.
43713 case X86ISD::HADD:
43714 case X86ISD::HSUB:
43715 case X86ISD::FHADD:
43716 case X86ISD::FHSUB: {
43717 SDLoc DL(Op);
43719 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43720 SDValue SrcOp = Op.getOperand(i);
43721 EVT SrcVT = SrcOp.getValueType();
43722 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43723 "Unsupported vector size");
43724 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43725 ExtSizeInBits)
43726 : SrcOp);
43727 }
43728 MVT ExtVT = VT.getSimpleVT();
43729 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43730 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43731 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43732 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43733 SDValue Insert =
43734 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43735 return TLO.CombineTo(Op, Insert);
43736 }
43737 }
43738 }
43739
43740 // For splats, unless we *only* demand the 0'th element,
43741 // stop attempts at simplification here, we aren't going to improve things,
43742 // this is better than any potential shuffle.
43743 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43744 return false;
43745
43746 // Get target/faux shuffle mask.
43747 APInt OpUndef, OpZero;
43748 SmallVector<int, 64> OpMask;
43749 SmallVector<SDValue, 2> OpInputs;
43750 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43751 OpZero, TLO.DAG, Depth, false))
43752 return false;
43753
43754 // Shuffle inputs must be the same size as the result.
43755 if (OpMask.size() != (unsigned)NumElts ||
43756 llvm::any_of(OpInputs, [VT](SDValue V) {
43757 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43758 !V.getValueType().isVector();
43759 }))
43760 return false;
43761
43762 KnownZero = OpZero;
43763 KnownUndef = OpUndef;
43764
43765 // Check if shuffle mask can be simplified to undef/zero/identity.
43766 int NumSrcs = OpInputs.size();
43767 for (int i = 0; i != NumElts; ++i)
43768 if (!DemandedElts[i])
43769 OpMask[i] = SM_SentinelUndef;
43770
43771 if (isUndefInRange(OpMask, 0, NumElts)) {
43772 KnownUndef.setAllBits();
43773 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43774 }
43775 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43776 KnownZero.setAllBits();
43777 return TLO.CombineTo(
43778 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43779 }
43780 for (int Src = 0; Src != NumSrcs; ++Src)
43781 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43782 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43783
43784 // Attempt to simplify inputs.
43785 for (int Src = 0; Src != NumSrcs; ++Src) {
43786 // TODO: Support inputs of different types.
43787 if (OpInputs[Src].getValueType() != VT)
43788 continue;
43789
43790 int Lo = Src * NumElts;
43791 APInt SrcElts = APInt::getZero(NumElts);
43792 for (int i = 0; i != NumElts; ++i)
43793 if (DemandedElts[i]) {
43794 int M = OpMask[i] - Lo;
43795 if (0 <= M && M < NumElts)
43796 SrcElts.setBit(M);
43797 }
43798
43799 // TODO - Propagate input undef/zero elts.
43800 APInt SrcUndef, SrcZero;
43801 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43802 TLO, Depth + 1))
43803 return true;
43804 }
43805
43806 // If we don't demand all elements, then attempt to combine to a simpler
43807 // shuffle.
43808 // We need to convert the depth to something combineX86ShufflesRecursively
43809 // can handle - so pretend its Depth == 0 again, and reduce the max depth
43810 // to match. This prevents combineX86ShuffleChain from returning a
43811 // combined shuffle that's the same as the original root, causing an
43812 // infinite loop.
43813 if (!DemandedElts.isAllOnes()) {
43814 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43815
43816 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43817 for (int i = 0; i != NumElts; ++i)
43818 if (DemandedElts[i])
43819 DemandedMask[i] = i;
43820
43822 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43823 /*HasVarMask*/ false,
43824 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43825 Subtarget);
43826 if (NewShuffle)
43827 return TLO.CombineTo(Op, NewShuffle);
43828 }
43829
43830 return false;
43831}
43832
43834 SDValue Op, const APInt &OriginalDemandedBits,
43835 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43836 unsigned Depth) const {
43837 EVT VT = Op.getValueType();
43838 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43839 unsigned Opc = Op.getOpcode();
43840 switch(Opc) {
43841 case X86ISD::VTRUNC: {
43842 KnownBits KnownOp;
43843 SDValue Src = Op.getOperand(0);
43844 MVT SrcVT = Src.getSimpleValueType();
43845
43846 // Simplify the input, using demanded bit information.
43847 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43848 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43849 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43850 return true;
43851 break;
43852 }
43853 case X86ISD::PMULDQ:
43854 case X86ISD::PMULUDQ: {
43855 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43856 KnownBits KnownLHS, KnownRHS;
43857 SDValue LHS = Op.getOperand(0);
43858 SDValue RHS = Op.getOperand(1);
43859
43860 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43861 // FIXME: Can we bound this better?
43862 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43863 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43864 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43865
43866 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43867 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43868 DemandedMaskLHS = DemandedMask;
43869 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43870 DemandedMaskRHS = DemandedMask;
43871
43872 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43873 KnownLHS, TLO, Depth + 1))
43874 return true;
43875 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43876 KnownRHS, TLO, Depth + 1))
43877 return true;
43878
43879 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
43880 KnownRHS = KnownRHS.trunc(32);
43881 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43882 KnownRHS.getConstant().isOne()) {
43883 SDLoc DL(Op);
43884 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43885 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43886 }
43887
43888 // Aggressively peek through ops to get at the demanded low bits.
43890 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43892 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43893 if (DemandedLHS || DemandedRHS) {
43894 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43895 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43896 return TLO.CombineTo(
43897 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43898 }
43899 break;
43900 }
43901 case X86ISD::ANDNP: {
43902 KnownBits Known2;
43903 SDValue Op0 = Op.getOperand(0);
43904 SDValue Op1 = Op.getOperand(1);
43905
43906 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
43907 Known, TLO, Depth + 1))
43908 return true;
43909
43910 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
43911 OriginalDemandedElts, Known2, TLO, Depth + 1))
43912 return true;
43913
43914 // If the RHS is a constant, see if we can simplify it.
43915 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
43916 OriginalDemandedElts, TLO))
43917 return true;
43918
43919 // ANDNP = (~Op0 & Op1);
43920 Known.One &= Known2.Zero;
43921 Known.Zero |= Known2.One;
43922 break;
43923 }
43924 case X86ISD::VSHLI: {
43925 SDValue Op0 = Op.getOperand(0);
43926 SDValue Op1 = Op.getOperand(1);
43927
43928 unsigned ShAmt = Op1->getAsZExtVal();
43929 if (ShAmt >= BitWidth)
43930 break;
43931
43932 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43933
43934 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43935 // single shift. We can do this if the bottom bits (which are shifted
43936 // out) are never demanded.
43937 if (Op0.getOpcode() == X86ISD::VSRLI &&
43938 OriginalDemandedBits.countr_zero() >= ShAmt) {
43939 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43940 if (Shift2Amt < BitWidth) {
43941 int Diff = ShAmt - Shift2Amt;
43942 if (Diff == 0)
43943 return TLO.CombineTo(Op, Op0.getOperand(0));
43944
43945 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43946 SDValue NewShift = TLO.DAG.getNode(
43947 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43948 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43949 return TLO.CombineTo(Op, NewShift);
43950 }
43951 }
43952
43953 // If we are only demanding sign bits then we can use the shift source directly.
43954 unsigned NumSignBits =
43955 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43956 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43957 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43958 return TLO.CombineTo(Op, Op0);
43959
43960 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43961 TLO, Depth + 1))
43962 return true;
43963
43964 Known.Zero <<= ShAmt;
43965 Known.One <<= ShAmt;
43966
43967 // Low bits known zero.
43968 Known.Zero.setLowBits(ShAmt);
43969
43970 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
43971 // Attempt to avoid multi-use ops if we don't need anything from them.
43972 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43973 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
43974 SDValue NewOp =
43975 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
43976 return TLO.CombineTo(Op, NewOp);
43977 }
43978 }
43979 return false;
43980 }
43981 case X86ISD::VSRLI: {
43982 SDValue Op0 = Op.getOperand(0);
43983 SDValue Op1 = Op.getOperand(1);
43984
43985 unsigned ShAmt = Op1->getAsZExtVal();
43986 if (ShAmt >= BitWidth)
43987 break;
43988
43989 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43990
43991 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43992 TLO, Depth + 1))
43993 return true;
43994
43995 Known.Zero.lshrInPlace(ShAmt);
43996 Known.One.lshrInPlace(ShAmt);
43997
43998 // High bits known zero.
43999 Known.Zero.setHighBits(ShAmt);
44000
44001 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44002 // Attempt to avoid multi-use ops if we don't need anything from them.
44003 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44004 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44005 SDValue NewOp =
44006 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44007 return TLO.CombineTo(Op, NewOp);
44008 }
44009 }
44010 return false;
44011 }
44012 case X86ISD::VSRAI: {
44013 SDValue Op0 = Op.getOperand(0);
44014 SDValue Op1 = Op.getOperand(1);
44015
44016 unsigned ShAmt = Op1->getAsZExtVal();
44017 if (ShAmt >= BitWidth)
44018 break;
44019
44020 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44021
44022 // If we just want the sign bit then we don't need to shift it.
44023 if (OriginalDemandedBits.isSignMask())
44024 return TLO.CombineTo(Op, Op0);
44025
44026 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44027 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44028 SDValue Op00 = Op0.getOperand(0);
44029 unsigned NumSignBits =
44030 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44031 if (ShAmt < NumSignBits)
44032 return TLO.CombineTo(Op, Op00);
44033 }
44034
44035 // If any of the demanded bits are produced by the sign extension, we also
44036 // demand the input sign bit.
44037 if (OriginalDemandedBits.countl_zero() < ShAmt)
44038 DemandedMask.setSignBit();
44039
44040 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44041 TLO, Depth + 1))
44042 return true;
44043
44044 Known.Zero.lshrInPlace(ShAmt);
44045 Known.One.lshrInPlace(ShAmt);
44046
44047 // If the input sign bit is known to be zero, or if none of the top bits
44048 // are demanded, turn this into an unsigned shift right.
44049 if (Known.Zero[BitWidth - ShAmt - 1] ||
44050 OriginalDemandedBits.countl_zero() >= ShAmt)
44051 return TLO.CombineTo(
44052 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44053
44054 // High bits are known one.
44055 if (Known.One[BitWidth - ShAmt - 1])
44056 Known.One.setHighBits(ShAmt);
44057
44058 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44059 // Attempt to avoid multi-use ops if we don't need anything from them.
44060 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44061 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44062 SDValue NewOp =
44063 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44064 return TLO.CombineTo(Op, NewOp);
44065 }
44066 }
44067 return false;
44068 }
44069 case X86ISD::BLENDV: {
44070 SDValue Sel = Op.getOperand(0);
44071 SDValue LHS = Op.getOperand(1);
44072 SDValue RHS = Op.getOperand(2);
44073
44074 APInt SignMask = APInt::getSignMask(BitWidth);
44076 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44078 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44080 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44081
44082 if (NewSel || NewLHS || NewRHS) {
44083 NewSel = NewSel ? NewSel : Sel;
44084 NewLHS = NewLHS ? NewLHS : LHS;
44085 NewRHS = NewRHS ? NewRHS : RHS;
44086 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44087 NewSel, NewLHS, NewRHS));
44088 }
44089 break;
44090 }
44091 case X86ISD::PEXTRB:
44092 case X86ISD::PEXTRW: {
44093 SDValue Vec = Op.getOperand(0);
44094 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44095 MVT VecVT = Vec.getSimpleValueType();
44096 unsigned NumVecElts = VecVT.getVectorNumElements();
44097
44098 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44099 unsigned Idx = CIdx->getZExtValue();
44100 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44101
44102 // If we demand no bits from the vector then we must have demanded
44103 // bits from the implict zext - simplify to zero.
44104 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44105 if (DemandedVecBits == 0)
44106 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44107
44108 APInt KnownUndef, KnownZero;
44109 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44110 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44111 KnownZero, TLO, Depth + 1))
44112 return true;
44113
44114 KnownBits KnownVec;
44115 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44116 KnownVec, TLO, Depth + 1))
44117 return true;
44118
44120 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44121 return TLO.CombineTo(
44122 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44123
44124 Known = KnownVec.zext(BitWidth);
44125 return false;
44126 }
44127 break;
44128 }
44129 case X86ISD::PINSRB:
44130 case X86ISD::PINSRW: {
44131 SDValue Vec = Op.getOperand(0);
44132 SDValue Scl = Op.getOperand(1);
44133 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44134 MVT VecVT = Vec.getSimpleValueType();
44135
44136 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44137 unsigned Idx = CIdx->getZExtValue();
44138 if (!OriginalDemandedElts[Idx])
44139 return TLO.CombineTo(Op, Vec);
44140
44141 KnownBits KnownVec;
44142 APInt DemandedVecElts(OriginalDemandedElts);
44143 DemandedVecElts.clearBit(Idx);
44144 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44145 KnownVec, TLO, Depth + 1))
44146 return true;
44147
44148 KnownBits KnownScl;
44149 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44150 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44151 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44152 return true;
44153
44154 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44155 Known = KnownVec.intersectWith(KnownScl);
44156 return false;
44157 }
44158 break;
44159 }
44160 case X86ISD::PACKSS:
44161 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44162 // sign bit then we can just ask for the source operands sign bit.
44163 // TODO - add known bits handling.
44164 if (OriginalDemandedBits.isSignMask()) {
44165 APInt DemandedLHS, DemandedRHS;
44166 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44167
44168 KnownBits KnownLHS, KnownRHS;
44169 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44170 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44171 KnownLHS, TLO, Depth + 1))
44172 return true;
44173 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44174 KnownRHS, TLO, Depth + 1))
44175 return true;
44176
44177 // Attempt to avoid multi-use ops if we don't need anything from them.
44179 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44181 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44182 if (DemandedOp0 || DemandedOp1) {
44183 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44184 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44185 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44186 }
44187 }
44188 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44189 break;
44190 case X86ISD::VBROADCAST: {
44191 SDValue Src = Op.getOperand(0);
44192 MVT SrcVT = Src.getSimpleValueType();
44193 APInt DemandedElts = APInt::getOneBitSet(
44194 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44195 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44196 TLO, Depth + 1))
44197 return true;
44198 // If we don't need the upper bits, attempt to narrow the broadcast source.
44199 // Don't attempt this on AVX512 as it might affect broadcast folding.
44200 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44201 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44202 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44203 Src->hasOneUse()) {
44204 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44205 SDValue NewSrc =
44206 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44207 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44208 SDValue NewBcst =
44209 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44210 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44211 }
44212 break;
44213 }
44214 case X86ISD::PCMPGT:
44215 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44216 // iff we only need the sign bit then we can use R directly.
44217 if (OriginalDemandedBits.isSignMask() &&
44218 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44219 return TLO.CombineTo(Op, Op.getOperand(1));
44220 break;
44221 case X86ISD::MOVMSK: {
44222 SDValue Src = Op.getOperand(0);
44223 MVT SrcVT = Src.getSimpleValueType();
44224 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44225 unsigned NumElts = SrcVT.getVectorNumElements();
44226
44227 // If we don't need the sign bits at all just return zero.
44228 if (OriginalDemandedBits.countr_zero() >= NumElts)
44229 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44230
44231 // See if we only demand bits from the lower 128-bit vector.
44232 if (SrcVT.is256BitVector() &&
44233 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44234 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44235 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44236 }
44237
44238 // Only demand the vector elements of the sign bits we need.
44239 APInt KnownUndef, KnownZero;
44240 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44241 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44242 TLO, Depth + 1))
44243 return true;
44244
44245 Known.Zero = KnownZero.zext(BitWidth);
44246 Known.Zero.setHighBits(BitWidth - NumElts);
44247
44248 // MOVMSK only uses the MSB from each vector element.
44249 KnownBits KnownSrc;
44250 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44251 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44252 Depth + 1))
44253 return true;
44254
44255 if (KnownSrc.One[SrcBits - 1])
44256 Known.One.setLowBits(NumElts);
44257 else if (KnownSrc.Zero[SrcBits - 1])
44258 Known.Zero.setLowBits(NumElts);
44259
44260 // Attempt to avoid multi-use os if we don't need anything from it.
44262 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44263 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44264 return false;
44265 }
44266 case X86ISD::TESTP: {
44267 SDValue Op0 = Op.getOperand(0);
44268 SDValue Op1 = Op.getOperand(1);
44269 MVT OpVT = Op0.getSimpleValueType();
44270 assert((OpVT.getVectorElementType() == MVT::f32 ||
44271 OpVT.getVectorElementType() == MVT::f64) &&
44272 "Illegal vector type for X86ISD::TESTP");
44273
44274 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44275 KnownBits KnownSrc;
44276 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44277 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44278 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44279 AssumeSingleUse) ||
44280 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44281 AssumeSingleUse);
44282 }
44283 case X86ISD::CMOV: {
44284 KnownBits Known2;
44285 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44286 OriginalDemandedElts, Known2, TLO, Depth + 1))
44287 return true;
44288 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44289 OriginalDemandedElts, Known, TLO, Depth + 1))
44290 return true;
44291
44292 // Only known if known in both the LHS and RHS.
44293 Known = Known.intersectWith(Known2);
44294 break;
44295 }
44296 case X86ISD::BEXTR:
44297 case X86ISD::BEXTRI: {
44298 SDValue Op0 = Op.getOperand(0);
44299 SDValue Op1 = Op.getOperand(1);
44300
44301 // Only bottom 16-bits of the control bits are required.
44302 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44303 // NOTE: SimplifyDemandedBits won't do this for constants.
44304 uint64_t Val1 = Cst1->getZExtValue();
44305 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44306 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44307 SDLoc DL(Op);
44308 return TLO.CombineTo(
44309 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44310 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44311 }
44312
44313 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44314 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44315
44316 // If the length is 0, the result is 0.
44317 if (Length == 0) {
44318 Known.setAllZero();
44319 return false;
44320 }
44321
44322 if ((Shift + Length) <= BitWidth) {
44323 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44324 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44325 return true;
44326
44327 Known = Known.extractBits(Length, Shift);
44328 Known = Known.zextOrTrunc(BitWidth);
44329 return false;
44330 }
44331 } else {
44332 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44333 KnownBits Known1;
44334 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44335 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44336 return true;
44337
44338 // If the length is 0, replace with 0.
44339 KnownBits LengthBits = Known1.extractBits(8, 8);
44340 if (LengthBits.isZero())
44341 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44342 }
44343
44344 break;
44345 }
44346 case X86ISD::PDEP: {
44347 SDValue Op0 = Op.getOperand(0);
44348 SDValue Op1 = Op.getOperand(1);
44349
44350 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44351 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44352
44353 // If the demanded bits has leading zeroes, we don't demand those from the
44354 // mask.
44355 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44356 return true;
44357
44358 // The number of possible 1s in the mask determines the number of LSBs of
44359 // operand 0 used. Undemanded bits from the mask don't matter so filter
44360 // them before counting.
44361 KnownBits Known2;
44362 uint64_t Count = (~Known.Zero & LoMask).popcount();
44363 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44364 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44365 return true;
44366
44367 // Zeroes are retained from the mask, but not ones.
44368 Known.One.clearAllBits();
44369 // The result will have at least as many trailing zeros as the non-mask
44370 // operand since bits can only map to the same or higher bit position.
44371 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44372 return false;
44373 }
44374 }
44375
44377 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44378}
44379
44381 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
44382 SelectionDAG &DAG, unsigned Depth) const {
44383 int NumElts = DemandedElts.getBitWidth();
44384 unsigned Opc = Op.getOpcode();
44385 EVT VT = Op.getValueType();
44386
44387 switch (Opc) {
44388 case X86ISD::PINSRB:
44389 case X86ISD::PINSRW: {
44390 // If we don't demand the inserted element, return the base vector.
44391 SDValue Vec = Op.getOperand(0);
44392 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44393 MVT VecVT = Vec.getSimpleValueType();
44394 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44395 !DemandedElts[CIdx->getZExtValue()])
44396 return Vec;
44397 break;
44398 }
44399 case X86ISD::VSHLI: {
44400 // If we are only demanding sign bits then we can use the shift source
44401 // directly.
44402 SDValue Op0 = Op.getOperand(0);
44403 unsigned ShAmt = Op.getConstantOperandVal(1);
44404 unsigned BitWidth = DemandedBits.getBitWidth();
44405 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
44406 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
44407 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44408 return Op0;
44409 break;
44410 }
44411 case X86ISD::VSRAI:
44412 // iff we only need the sign bit then we can use the source directly.
44413 // TODO: generalize where we only demand extended signbits.
44414 if (DemandedBits.isSignMask())
44415 return Op.getOperand(0);
44416 break;
44417 case X86ISD::PCMPGT:
44418 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44419 // iff we only need the sign bit then we can use R directly.
44420 if (DemandedBits.isSignMask() &&
44421 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44422 return Op.getOperand(1);
44423 break;
44424 case X86ISD::BLENDV: {
44425 // BLENDV: Cond (MSB) ? LHS : RHS
44426 SDValue Cond = Op.getOperand(0);
44427 SDValue LHS = Op.getOperand(1);
44428 SDValue RHS = Op.getOperand(2);
44429
44430 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
44431 if (CondKnown.isNegative())
44432 return LHS;
44433 if (CondKnown.isNonNegative())
44434 return RHS;
44435 break;
44436 }
44437 case X86ISD::ANDNP: {
44438 // ANDNP = (~LHS & RHS);
44439 SDValue LHS = Op.getOperand(0);
44440 SDValue RHS = Op.getOperand(1);
44441
44442 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
44443 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
44444
44445 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
44446 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
44447 // this context, so return RHS.
44448 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
44449 return RHS;
44450 break;
44451 }
44452 }
44453
44454 APInt ShuffleUndef, ShuffleZero;
44455 SmallVector<int, 16> ShuffleMask;
44457 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
44458 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
44459 // If all the demanded elts are from one operand and are inline,
44460 // then we can use the operand directly.
44461 int NumOps = ShuffleOps.size();
44462 if (ShuffleMask.size() == (unsigned)NumElts &&
44464 return VT.getSizeInBits() == V.getValueSizeInBits();
44465 })) {
44466
44467 if (DemandedElts.isSubsetOf(ShuffleUndef))
44468 return DAG.getUNDEF(VT);
44469 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
44470 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
44471
44472 // Bitmask that indicates which ops have only been accessed 'inline'.
44473 APInt IdentityOp = APInt::getAllOnes(NumOps);
44474 for (int i = 0; i != NumElts; ++i) {
44475 int M = ShuffleMask[i];
44476 if (!DemandedElts[i] || ShuffleUndef[i])
44477 continue;
44478 int OpIdx = M / NumElts;
44479 int EltIdx = M % NumElts;
44480 if (M < 0 || EltIdx != i) {
44481 IdentityOp.clearAllBits();
44482 break;
44483 }
44484 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
44485 if (IdentityOp == 0)
44486 break;
44487 }
44488 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
44489 "Multiple identity shuffles detected");
44490
44491 if (IdentityOp != 0)
44492 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
44493 }
44494 }
44495
44497 Op, DemandedBits, DemandedElts, DAG, Depth);
44498}
44499
44501 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44502 bool PoisonOnly, unsigned Depth) const {
44503 unsigned NumElts = DemandedElts.getBitWidth();
44504
44505 switch (Op.getOpcode()) {
44506 case X86ISD::PSHUFD:
44507 case X86ISD::VPERMILPI:
44508 case X86ISD::VPERMV3: {
44511 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
44512 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
44513 APInt::getZero(NumElts));
44514 for (auto M : enumerate(Mask)) {
44515 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
44516 continue;
44517 if (M.value() == SM_SentinelUndef)
44518 return false;
44519 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
44520 "Shuffle mask index out of range");
44521 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
44522 }
44523 for (auto Op : enumerate(Ops))
44524 if (!DemandedSrcElts[Op.index()].isZero() &&
44526 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
44527 return false;
44528 return true;
44529 }
44530 break;
44531 }
44532 }
44534 Op, DemandedElts, DAG, PoisonOnly, Depth);
44535}
44536
44538 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44539 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44540
44541 switch (Op.getOpcode()) {
44542 // SSE vector multiplies are either inbounds or saturate.
44543 case X86ISD::VPMADDUBSW:
44544 case X86ISD::VPMADDWD:
44545 // SSE vector shifts handle out of bounds shift amounts.
44546 case X86ISD::VSHLI:
44547 case X86ISD::VSRLI:
44548 case X86ISD::VSRAI:
44549 return false;
44550 case X86ISD::PSHUFD:
44551 case X86ISD::VPERMILPI:
44552 case X86ISD::VPERMV3:
44553 case X86ISD::UNPCKH:
44554 case X86ISD::UNPCKL:
44555 return false;
44556 // SSE comparisons handle all fcmp cases.
44557 // TODO: Add PCMPEQ/GT and CMPM/MM with test coverage.
44558 case X86ISD::CMPP:
44559 return false;
44561 switch (Op->getConstantOperandVal(0)) {
44562 case Intrinsic::x86_sse2_pmadd_wd:
44563 case Intrinsic::x86_avx2_pmadd_wd:
44564 case Intrinsic::x86_avx512_pmaddw_d_512:
44565 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
44566 case Intrinsic::x86_avx2_pmadd_ub_sw:
44567 case Intrinsic::x86_avx512_pmaddubs_w_512:
44568 return false;
44569 }
44570 }
44572 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44573}
44574
44576 const APInt &DemandedElts,
44577 APInt &UndefElts,
44578 const SelectionDAG &DAG,
44579 unsigned Depth) const {
44580 unsigned NumElts = DemandedElts.getBitWidth();
44581 unsigned Opc = Op.getOpcode();
44582
44583 switch (Opc) {
44584 case X86ISD::VBROADCAST:
44586 UndefElts = APInt::getZero(NumElts);
44587 return true;
44588 }
44589
44590 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44591 DAG, Depth);
44592}
44593
44594// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44595// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
44596static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44597 bool AllowTruncate) {
44598 switch (Src.getOpcode()) {
44599 case ISD::TRUNCATE:
44600 if (!AllowTruncate)
44601 return false;
44602 [[fallthrough]];
44603 case ISD::SETCC:
44604 return Src.getOperand(0).getValueSizeInBits() == Size;
44605 case ISD::FREEZE:
44606 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate);
44607 case ISD::AND:
44608 case ISD::XOR:
44609 case ISD::OR:
44610 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44611 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44612 case ISD::SELECT:
44613 case ISD::VSELECT:
44614 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44615 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44616 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44617 case ISD::BUILD_VECTOR:
44618 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44619 ISD::isBuildVectorAllOnes(Src.getNode());
44620 }
44621 return false;
44622}
44623
44624// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44625static unsigned getAltBitOpcode(unsigned Opcode) {
44626 switch(Opcode) {
44627 // clang-format off
44628 case ISD::AND: return X86ISD::FAND;
44629 case ISD::OR: return X86ISD::FOR;
44630 case ISD::XOR: return X86ISD::FXOR;
44631 case X86ISD::ANDNP: return X86ISD::FANDN;
44632 // clang-format on
44633 }
44634 llvm_unreachable("Unknown bitwise opcode");
44635}
44636
44637// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44639 const SDLoc &DL) {
44640 EVT SrcVT = Src.getValueType();
44641 if (SrcVT != MVT::v4i1)
44642 return SDValue();
44643
44644 switch (Src.getOpcode()) {
44645 case ISD::SETCC:
44646 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44647 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44648 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44649 SDValue Op0 = Src.getOperand(0);
44650 if (ISD::isNormalLoad(Op0.getNode()))
44651 return DAG.getBitcast(MVT::v4f32, Op0);
44652 if (Op0.getOpcode() == ISD::BITCAST &&
44653 Op0.getOperand(0).getValueType() == MVT::v4f32)
44654 return Op0.getOperand(0);
44655 }
44656 break;
44657 case ISD::AND:
44658 case ISD::XOR:
44659 case ISD::OR: {
44660 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44661 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44662 if (Op0 && Op1)
44663 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44664 Op1);
44665 break;
44666 }
44667 }
44668 return SDValue();
44669}
44670
44671// Helper to push sign extension of vXi1 SETCC result through bitops.
44673 SDValue Src, const SDLoc &DL) {
44674 switch (Src.getOpcode()) {
44675 case ISD::SETCC:
44676 case ISD::FREEZE:
44677 case ISD::TRUNCATE:
44678 case ISD::BUILD_VECTOR:
44679 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44680 case ISD::AND:
44681 case ISD::XOR:
44682 case ISD::OR:
44683 return DAG.getNode(
44684 Src.getOpcode(), DL, SExtVT,
44685 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44686 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44687 case ISD::SELECT:
44688 case ISD::VSELECT:
44689 return DAG.getSelect(
44690 DL, SExtVT, Src.getOperand(0),
44691 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44692 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44693 }
44694 llvm_unreachable("Unexpected node type for vXi1 sign extension");
44695}
44696
44697// Try to match patterns such as
44698// (i16 bitcast (v16i1 x))
44699// ->
44700// (i16 movmsk (16i8 sext (v16i1 x)))
44701// before the illegal vector is scalarized on subtargets that don't have legal
44702// vxi1 types.
44704 const SDLoc &DL,
44705 const X86Subtarget &Subtarget) {
44706 EVT SrcVT = Src.getValueType();
44707 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44708 return SDValue();
44709
44710 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44711 // legalization destroys the v4i32 type.
44712 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44713 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44714 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44715 DAG.getBitcast(MVT::v4f32, V));
44716 return DAG.getZExtOrTrunc(V, DL, VT);
44717 }
44718 }
44719
44720 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44721 // movmskb even with avx512. This will be better than truncating to vXi1 and
44722 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44723 // vpcmpeqb/vpcmpgtb.
44724 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44725 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44726 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44727 Src.getOperand(0).getValueType() == MVT::v64i8);
44728
44729 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44730 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44731 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44732 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44733 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44734 EVT CmpVT = Src.getOperand(0).getValueType();
44735 EVT EltVT = CmpVT.getVectorElementType();
44736 if (CmpVT.getSizeInBits() <= 256 &&
44737 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44738 PreferMovMsk = true;
44739 }
44740
44741 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44742 // MOVMSK is supported in SSE2 or later.
44743 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44744 return SDValue();
44745
44746 // If the upper ops of a concatenation are undef, then try to bitcast the
44747 // lower op and extend.
44748 SmallVector<SDValue, 4> SubSrcOps;
44749 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44750 SubSrcOps.size() >= 2) {
44751 SDValue LowerOp = SubSrcOps[0];
44752 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44753 if (LowerOp.getOpcode() == ISD::SETCC &&
44754 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44755 EVT SubVT = VT.getIntegerVT(
44756 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44757 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44758 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44759 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44760 }
44761 }
44762 }
44763
44764 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44765 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44766 // v8i16 and v16i16.
44767 // For these two cases, we can shuffle the upper element bytes to a
44768 // consecutive sequence at the start of the vector and treat the results as
44769 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44770 // for v16i16 this is not the case, because the shuffle is expensive, so we
44771 // avoid sign-extending to this type entirely.
44772 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44773 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44774 MVT SExtVT;
44775 bool PropagateSExt = false;
44776 switch (SrcVT.getSimpleVT().SimpleTy) {
44777 default:
44778 return SDValue();
44779 case MVT::v2i1:
44780 SExtVT = MVT::v2i64;
44781 break;
44782 case MVT::v4i1:
44783 SExtVT = MVT::v4i32;
44784 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44785 // sign-extend to a 256-bit operation to avoid truncation.
44786 if (Subtarget.hasAVX() &&
44787 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44788 SExtVT = MVT::v4i64;
44789 PropagateSExt = true;
44790 }
44791 break;
44792 case MVT::v8i1:
44793 SExtVT = MVT::v8i16;
44794 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44795 // sign-extend to a 256-bit operation to match the compare.
44796 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44797 // 256-bit because the shuffle is cheaper than sign extending the result of
44798 // the compare.
44799 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44800 checkBitcastSrcVectorSize(Src, 512, true))) {
44801 SExtVT = MVT::v8i32;
44802 PropagateSExt = true;
44803 }
44804 break;
44805 case MVT::v16i1:
44806 SExtVT = MVT::v16i8;
44807 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44808 // it is not profitable to sign-extend to 256-bit because this will
44809 // require an extra cross-lane shuffle which is more expensive than
44810 // truncating the result of the compare to 128-bits.
44811 break;
44812 case MVT::v32i1:
44813 SExtVT = MVT::v32i8;
44814 break;
44815 case MVT::v64i1:
44816 // If we have AVX512F, but not AVX512BW and the input is truncated from
44817 // v64i8 checked earlier. Then split the input and make two pmovmskbs.
44818 if (Subtarget.hasAVX512()) {
44819 if (Subtarget.hasBWI())
44820 return SDValue();
44821 SExtVT = MVT::v64i8;
44822 break;
44823 }
44824 // Split if this is a <64 x i8> comparison result.
44825 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44826 SExtVT = MVT::v64i8;
44827 break;
44828 }
44829 return SDValue();
44830 };
44831
44832 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44833 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44834
44835 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44836 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44837 } else {
44838 if (SExtVT == MVT::v8i16) {
44839 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
44840 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
44841 }
44842 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44843 }
44844
44845 EVT IntVT =
44847 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44848 return DAG.getBitcast(VT, V);
44849}
44850
44851// Convert a vXi1 constant build vector to the same width scalar integer.
44853 EVT SrcVT = Op.getValueType();
44854 assert(SrcVT.getVectorElementType() == MVT::i1 &&
44855 "Expected a vXi1 vector");
44857 "Expected a constant build vector");
44858
44859 APInt Imm(SrcVT.getVectorNumElements(), 0);
44860 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44861 SDValue In = Op.getOperand(Idx);
44862 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
44863 Imm.setBit(Idx);
44864 }
44865 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44866 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44867}
44868
44871 const X86Subtarget &Subtarget) {
44872 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44873
44874 if (!DCI.isBeforeLegalizeOps())
44875 return SDValue();
44876
44877 // Only do this if we have k-registers.
44878 if (!Subtarget.hasAVX512())
44879 return SDValue();
44880
44881 EVT DstVT = N->getValueType(0);
44882 SDValue Op = N->getOperand(0);
44883 EVT SrcVT = Op.getValueType();
44884
44885 if (!Op.hasOneUse())
44886 return SDValue();
44887
44888 // Look for logic ops.
44889 if (Op.getOpcode() != ISD::AND &&
44890 Op.getOpcode() != ISD::OR &&
44891 Op.getOpcode() != ISD::XOR)
44892 return SDValue();
44893
44894 // Make sure we have a bitcast between mask registers and a scalar type.
44895 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44896 DstVT.isScalarInteger()) &&
44897 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44898 SrcVT.isScalarInteger()))
44899 return SDValue();
44900
44901 SDValue LHS = Op.getOperand(0);
44902 SDValue RHS = Op.getOperand(1);
44903
44904 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44905 LHS.getOperand(0).getValueType() == DstVT)
44906 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44907 DAG.getBitcast(DstVT, RHS));
44908
44909 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44910 RHS.getOperand(0).getValueType() == DstVT)
44911 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44912 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44913
44914 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44915 // Most of these have to move a constant from the scalar domain anyway.
44918 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44919 DAG.getBitcast(DstVT, LHS), RHS);
44920 }
44921
44922 return SDValue();
44923}
44924
44926 const X86Subtarget &Subtarget) {
44927 SDLoc DL(BV);
44928 unsigned NumElts = BV->getNumOperands();
44929 SDValue Splat = BV->getSplatValue();
44930
44931 // Build MMX element from integer GPR or SSE float values.
44932 auto CreateMMXElement = [&](SDValue V) {
44933 if (V.isUndef())
44934 return DAG.getUNDEF(MVT::x86mmx);
44935 if (V.getValueType().isFloatingPoint()) {
44936 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44937 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44938 V = DAG.getBitcast(MVT::v2i64, V);
44939 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44940 }
44941 V = DAG.getBitcast(MVT::i32, V);
44942 } else {
44943 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44944 }
44945 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44946 };
44947
44948 // Convert build vector ops to MMX data in the bottom elements.
44950
44951 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44952
44953 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44954 if (Splat) {
44955 if (Splat.isUndef())
44956 return DAG.getUNDEF(MVT::x86mmx);
44957
44958 Splat = CreateMMXElement(Splat);
44959
44960 if (Subtarget.hasSSE1()) {
44961 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44962 if (NumElts == 8)
44963 Splat = DAG.getNode(
44964 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44965 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44966 TLI.getPointerTy(DAG.getDataLayout())),
44967 Splat, Splat);
44968
44969 // Use PSHUFW to repeat 16-bit elements.
44970 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44971 return DAG.getNode(
44972 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44973 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44974 TLI.getPointerTy(DAG.getDataLayout())),
44975 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44976 }
44977 Ops.append(NumElts, Splat);
44978 } else {
44979 for (unsigned i = 0; i != NumElts; ++i)
44980 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44981 }
44982
44983 // Use tree of PUNPCKLs to build up general MMX vector.
44984 while (Ops.size() > 1) {
44985 unsigned NumOps = Ops.size();
44986 unsigned IntrinOp =
44987 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44988 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44989 : Intrinsic::x86_mmx_punpcklbw));
44990 SDValue Intrin = DAG.getTargetConstant(
44991 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44992 for (unsigned i = 0; i != NumOps; i += 2)
44993 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44994 Ops[i], Ops[i + 1]);
44995 Ops.resize(NumOps / 2);
44996 }
44997
44998 return Ops[0];
44999}
45000
45001// Recursive function that attempts to find if a bool vector node was originally
45002// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45003// integer. If so, replace the scalar ops with bool vector equivalents back down
45004// the chain.
45006                                          SelectionDAG &DAG,
45007                                          const X86Subtarget &Subtarget,
45008                                          unsigned Depth = 0) {
45010    return SDValue(); // Limit search depth.
45011
45012  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45013  unsigned Opc = V.getOpcode();
45014  switch (Opc) {
45015  case ISD::BITCAST: {
45016    // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45017    SDValue Src = V.getOperand(0);
45018    EVT SrcVT = Src.getValueType();
45019    if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45020      return DAG.getBitcast(VT, Src);
45021    break;
45022  }
45023  case ISD::Constant: {
       // Scalar all-zeros / all-ones constants map directly to the equivalent
       // bool vector constants.
45024    auto *C = cast<ConstantSDNode>(V);
45025    if (C->isZero())
45026      return DAG.getConstant(0, DL, VT);
45027    if (C->isAllOnes())
45028      return DAG.getAllOnesConstant(DL, VT);
45029    break;
45030  }
45031  case ISD::TRUNCATE: {
45032    // If we find a suitable source, a truncated scalar becomes a subvector.
45033    SDValue Src = V.getOperand(0);
45034    EVT NewSrcVT =
45035        EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45036    if (TLI.isTypeLegal(NewSrcVT))
45037      if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45038                                                  Subtarget, Depth + 1))
45039        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45040                           DAG.getVectorIdxConstant(0, DL));
45041    break;
45042  }
45043  case ISD::ANY_EXTEND:
45044  case ISD::ZERO_EXTEND: {
45045    // If we find a suitable source, an extended scalar becomes a subvector.
45046    SDValue Src = V.getOperand(0);
45047    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45048                                    Src.getScalarValueSizeInBits());
45049    if (TLI.isTypeLegal(NewSrcVT))
45050      if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45051                                                  Subtarget, Depth + 1))
45052        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45053                           Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45054                                                  : DAG.getConstant(0, DL, VT),
45055                           N0, DAG.getVectorIdxConstant(0, DL));
45056    break;
45057  }
45058  case ISD::OR:
45059  case ISD::XOR: {
45060    // If we find suitable sources, we can just move the op to the vector
45061    // domain.
45062    if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45063                                                Subtarget, Depth + 1))
45064      if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45065                                                  Subtarget, Depth + 1))
45066        return DAG.getNode(Opc, DL, VT, N0, N1);
45067    break;
45068  }
45069  case ISD::SHL: {
45070    // If we find a suitable source, a SHL becomes a KSHIFTL.
45071    SDValue Src0 = V.getOperand(0);
       // KSHIFT of v8i1 requires DQI, and of v32i1/v64i1 requires BWI.
45072    if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45073        ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45074      break;
45075
45076    if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45077      if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45078                                                  Depth + 1))
45079        return DAG.getNode(
45080            X86ISD::KSHIFTL, DL, VT, N0,
45081            DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45082    break;
45083  }
45084  }
45085
45086  // Does the inner bitcast already exist?
       // Reusing an existing node avoids creating duplicate bitcasts mid-chain.
45087  if (Depth > 0)
45088    if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45089      return SDValue(Alt, 0);
45090
45091  return SDValue();
45092}
45093
45096                              const X86Subtarget &Subtarget) {
       // Generic DAG combine for ISD::BITCAST: tries, in turn, vXi1<->scalar
       // mask rewrites, MMX-specific lowerings, broadcast-load domain fixes,
       // and moving integer logic ops into the FP domain.
45097  SDValue N0 = N->getOperand(0);
45098  EVT VT = N->getValueType(0);
45099  EVT SrcVT = N0.getValueType();
45100  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45101
45102  // Try to match patterns such as
45103  // (i16 bitcast (v16i1 x))
45104  // ->
45105  // (i16 movmsk (16i8 sext (v16i1 x)))
45106  // before the setcc result is scalarized on subtargets that don't have legal
45107  // vxi1 types.
45108  if (DCI.isBeforeLegalize()) {
45109    SDLoc dl(N);
45110    if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45111      return V;
45112
45113    // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45114    // type, widen both sides to avoid a trip through memory.
45115    if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45116        Subtarget.hasAVX512()) {
45117      N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45118      N0 = DAG.getBitcast(MVT::v8i1, N0);
45119      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45120                         DAG.getVectorIdxConstant(0, dl));
45121    }
45122
45123    // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45124    // type, widen both sides to avoid a trip through memory.
45125    if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45126        Subtarget.hasAVX512()) {
45127      // Use zeros for the widening if we already have some zeroes. This can
45128      // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45129      // stream of this.
45130      // FIXME: It might make sense to detect a concat_vectors with a mix of
45131      // zeroes and undef and turn it into insert_subvector for i1 vectors as
45132      // a separate combine. What we can't do is canonicalize the operands of
45133      // such a concat or we'll get into a loop with SimplifyDemandedBits.
45134      if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45135        SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45136        if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45137          SrcVT = LastOp.getValueType();
45138          unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45139          SmallVector<SDValue, 4> Ops(N0->ops());
45140          Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45141          N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45142          N0 = DAG.getBitcast(MVT::i8, N0);
45143          return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45144        }
45145      }
45146
45147      unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45148      SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45149      Ops[0] = N0;
45150      N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45151      N0 = DAG.getBitcast(MVT::i8, N0);
45152      return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45153    }
45154  } else if (DCI.isAfterLegalizeDAG()) {
45155    // If we're bitcasting from iX to vXi1, see if the integer originally
45156    // began as a vXi1 and whether we can remove the bitcast entirely.
45157    if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45158        SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45159      if (SDValue V =
45160              combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45161        return V;
45162    }
45163  }
45164
45165  // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45166  // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45167  // due to insert_subvector legalization on KNL. By promoting the copy to i16
45168  // we can help with known bits propagation from the vXi1 domain to the
45169  // scalar domain.
45170  if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45171      !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45172      N0.getOperand(0).getValueType() == MVT::v16i1 &&
45174    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45175                       DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45176
45177  // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45178  // and the vbroadcast_load are both integer or both fp. In some cases this
45179  // will remove the bitcast entirely.
45180  if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45181      VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45182    auto *BCast = cast<MemIntrinsicSDNode>(N0);
45183    unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45184    unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45185    // Don't swap i8/i16 since don't have fp types that size.
45186    if (MemSize >= 32) {
45187      MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45188                                       : MVT::getIntegerVT(MemSize);
45189      MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45190                                        : MVT::getIntegerVT(SrcVTSize);
45191      LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45192
45193      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45194      SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45195      SDValue ResNode =
45197                                  MemVT, BCast->getMemOperand());
       // Keep the chain users of the old broadcast-load pointed at the new one.
45198      DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45199      return DAG.getBitcast(VT, ResNode);
45200    }
45201  }
45202
45203  // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45204  if (VT == MVT::f16 && SrcVT == MVT::i16) {
45205    SDValue Src = peekThroughTruncates(N0);
45206    if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45207        Src.getOperand(0).getValueSizeInBits() == 128 &&
45208        isNullConstant(Src.getOperand(1))) {
45209      SDLoc DL(N);
45210      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45211                         DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45212                         DAG.getVectorIdxConstant(0, DL));
45213    }
45214  }
45215
45216  // Since MMX types are special and don't usually play with other vector types,
45217  // it's better to handle them early to be sure we emit efficient code by
45218  // avoiding store-load conversions.
45219  if (VT == MVT::x86mmx) {
45220    // Detect MMX constant vectors.
45221    APInt UndefElts;
45222    SmallVector<APInt, 1> EltBits;
45223    if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45224                                      /*AllowWholeUndefs*/ true,
45225                                      /*AllowPartialUndefs*/ true)) {
45226      SDLoc DL(N0);
45227      // Handle zero-extension of i32 with MOVD.
45228      if (EltBits[0].countl_zero() >= 32)
45229        return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45230                           DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45231      // Else, bitcast to a double.
45232      // TODO - investigate supporting sext 32-bit immediates on x86_64.
45233      APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45234      return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45235    }
45236
45237    // Detect bitcasts to x86mmx low word.
45238    if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45239        (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45240        N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45241      bool LowUndef = true, AllUndefOrZero = true;
45242      for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45243        SDValue Op = N0.getOperand(i);
45244        LowUndef &= Op.isUndef() || (i >= e/2);
45245        AllUndefOrZero &= isNullConstantOrUndef(Op);
45246      }
45247      if (AllUndefOrZero) {
45248        SDValue N00 = N0.getOperand(0);
45249        SDLoc dl(N00);
       // If the upper half is entirely undef we can any-extend; otherwise we
       // must zero-extend to preserve the known-zero upper elements.
45250        N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45251                       : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45252        return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45253      }
45254    }
45255
45256    // Detect bitcasts of 64-bit build vectors and convert to a
45257    // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45258    // lowest element.
45259    if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45260        (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45261         SrcVT == MVT::v8i8))
45262      return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45263
45264    // Detect bitcasts between element or subvector extraction to x86mmx.
45265    if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45267        isNullConstant(N0.getOperand(1))) {
45268      SDValue N00 = N0.getOperand(0);
45269      if (N00.getValueType().is128BitVector())
45270        return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45271                           DAG.getBitcast(MVT::v2i64, N00));
45272    }
45273
45274    // Detect bitcasts from FP_TO_SINT to x86mmx.
45275    if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
45276      SDLoc DL(N0);
45277      SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
45278                                DAG.getUNDEF(MVT::v2i32));
45279      return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
45280                         DAG.getBitcast(MVT::v2i64, Res));
45281    }
45282  }
45283
45284  // Try to remove a bitcast of constant vXi1 vector. We have to legalize
45285  // most of these to scalar anyway.
45286  if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
45287      SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45289    return combinevXi1ConstantToInteger(N0, DAG);
45290  }
45291
       // Scalar all-ones/zero bitcast to a vXi1 vector folds to the matching
       // vector constant.
45292  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
45293      VT.getVectorElementType() == MVT::i1) {
45294    if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
45295      if (C->isAllOnes())
45296        return DAG.getConstant(1, SDLoc(N0), VT);
45297      if (C->isZero())
45298        return DAG.getConstant(0, SDLoc(N0), VT);
45299    }
45300  }
45301
45302  // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
45303  // Turn it into a sign bit compare that produces a k-register. This avoids
45304  // a trip through a GPR.
45305  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
45306      VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
45308    unsigned NumElts = VT.getVectorNumElements();
45309    SDValue Src = N0;
45310
45311    // Peek through truncate.
45312    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
45313      Src = N0.getOperand(0);
45314
45315    if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
45316      SDValue MovmskIn = Src.getOperand(0);
45317      MVT MovmskVT = MovmskIn.getSimpleValueType();
45318      unsigned MovMskElts = MovmskVT.getVectorNumElements();
45319
45320      // We allow extra bits of the movmsk to be used since they are known zero.
45321      // We can't convert a VPMOVMSKB without avx512bw.
45322      if (MovMskElts <= NumElts &&
45323          (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
45324        EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
45325        MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
45326        SDLoc dl(N);
       // MOVMSK collects sign bits, so element < 0 reproduces each mask bit.
45327        MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
45328        SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
45329                                   DAG.getConstant(0, dl, IntVT), ISD::SETLT);
45330        if (EVT(CmpVT) == VT)
45331          return Cmp;
45332
45333        // Pad with zeroes up to original VT to replace the zeroes that were
45334        // being used from the MOVMSK.
45335        unsigned NumConcats = NumElts / MovMskElts;
45336        SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
45337        Ops[0] = Cmp;
45338        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
45339      }
45340    }
45341  }
45342
45343  // Try to remove bitcasts from input and output of mask arithmetic to
45344  // remove GPR<->K-register crossings.
45345  if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
45346    return V;
45347
45348  // Convert a bitcasted integer logic operation that has one bitcasted
45349  // floating-point operand into a floating-point logic operation. This may
45350  // create a load of a constant, but that is cheaper than materializing the
45351  // constant in an integer register and transferring it to an SSE register or
45352  // transferring the SSE operand to integer register and back.
45353  unsigned FPOpcode;
45354  switch (N0.getOpcode()) {
45355  // clang-format off
45356  case ISD::AND: FPOpcode = X86ISD::FAND; break;
45357  case ISD::OR: FPOpcode = X86ISD::FOR; break;
45358  case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
45359  default: return SDValue();
45360  // clang-format on
45361  }
45362
45363  // Check if we have a bitcast from another integer type as well.
45364  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
45365        (Subtarget.hasSSE2() && VT == MVT::f64) ||
45366        (Subtarget.hasFP16() && VT == MVT::f16) ||
45367        (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
45368         TLI.isTypeLegal(VT))))
45369    return SDValue();
45370
45371  SDValue LogicOp0 = N0.getOperand(0);
45372  SDValue LogicOp1 = N0.getOperand(1);
45373  SDLoc DL0(N0);
45374
45375  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
       // Only profitable when the intermediate values have no other users,
       // hence the hasOneUse checks throughout.
45376  if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
45377      LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
45378      LogicOp0.getOperand(0).getValueType() == VT &&
45379      !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
45380    SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
45381    unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45382    return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
45383  }
45384  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
45385  if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
45386      LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
45387      LogicOp1.getOperand(0).getValueType() == VT &&
45388      !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
45389    SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
45390    unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45391    return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
45392  }
45393
45394  return SDValue();
45395}
45396
45397// (mul (zext a), (sext, b))
45398static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
45399 SDValue &Op1) {
45400 Op0 = Mul.getOperand(0);
45401 Op1 = Mul.getOperand(1);
45402
45403 // The operand1 should be signed extend
45404 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
45405 std::swap(Op0, Op1);
45406
45407 auto IsFreeTruncation = [](SDValue &Op) -> bool {
45408 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
45409 Op.getOpcode() == ISD::SIGN_EXTEND) &&
45410 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
45411 return true;
45412
45413 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
45414 return (BV && BV->isConstant());
45415 };
45416
45417 // (dpbusd (zext a), (sext, b)). Since the first operand should be unsigned
45418 // value, we need to check Op0 is zero extended value. Op1 should be signed
45419 // value, so we just check the signed bits.
45420 if ((IsFreeTruncation(Op0) &&
45421 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
45422 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
45423 return true;
45424
45425 return false;
45426}
45427
45428// Given a ABS node, detect the following pattern:
45429// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
45430// This is useful as it is the input into a SAD pattern.
45431static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
45432 SDValue AbsOp1 = Abs->getOperand(0);
45433 if (AbsOp1.getOpcode() != ISD::SUB)
45434 return false;
45435
45436 Op0 = AbsOp1.getOperand(0);
45437 Op1 = AbsOp1.getOperand(1);
45438
45439 // Check if the operands of the sub are zero-extended from vectors of i8.
45440 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
45441 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
45442 Op1.getOpcode() != ISD::ZERO_EXTEND ||
45443 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
45444 return false;
45445
45446 return true;
45447}
45448
45450                                unsigned &LogBias, const SDLoc &DL,
45451                                const X86Subtarget &Subtarget) {
45452  // Extend or truncate to MVT::i8 first.
45453  MVT Vi8VT =
45454      MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
45455  LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
45456  RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
45457
45458  // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
45459  // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
45460  // The src A, B element type is i8, but the dst C element type is i32.
45461  // When we calculate the reduce stage, we use src vector type vXi8 for it
45462  // so we need logbias 2 to avoid extra 2 stages.
45463  LogBias = 2;
45464
45465  unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
       // AVX512VNNI without VLX only provides the 512-bit form, so widen.
45466  if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
45467    RegSize = std::max(512u, RegSize);
45468
45469  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45470  // fill in the missing vector elements with 0.
45471  unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
45472  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
45473  Ops[0] = LHS;
45474  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45475  SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45476  Ops[0] = RHS;
45477  SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45478
45479  // Actually build the DotProduct, split as 256/512 bits for
45480  // AVXVNNI/AVX512VNNI.
45481  auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45482                      ArrayRef<SDValue> Ops) {
45483    MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45484    return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
45485  };
45486  MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
       // The accumulator starts at zero; VPDPBUSD adds the dot products in.
45487  SDValue Zero = DAG.getConstant(0, DL, DpVT);
45488
45489  return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
45490                          DpBuilder, false);
45491}
45492
45493// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
45494// to these zexts.
45495static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
45496 const SDValue &Zext1, const SDLoc &DL,
45497 const X86Subtarget &Subtarget) {
45498 // Find the appropriate width for the PSADBW.
45499 EVT InVT = Zext0.getOperand(0).getValueType();
45500 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
45501
45502 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45503 // fill in the missing vector elements with 0.
45504 unsigned NumConcat = RegSize / InVT.getSizeInBits();
45505 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
45506 Ops[0] = Zext0.getOperand(0);
45507 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45508 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45509 Ops[0] = Zext1.getOperand(0);
45510 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45511
45512 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45513 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45514 ArrayRef<SDValue> Ops) {
45515 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45516 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
45517 };
45518 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
45519 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
45520 PSADBWBuilder);
45521}
45522
45523// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
45524// PHMINPOSUW.
45526                                         const X86Subtarget &Subtarget) {
45527  // Bail without SSE41.
45528  if (!Subtarget.hasSSE41())
45529    return SDValue();
45530
45531  EVT ExtractVT = Extract->getValueType(0);
45532  if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
45533    return SDValue();
45534
45535  // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
45536  ISD::NodeType BinOp;
45537  SDValue Src = DAG.matchBinOpReduction(
45538      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
45539  if (!Src)
45540    return SDValue();
45541
45542  EVT SrcVT = Src.getValueType();
45543  EVT SrcSVT = SrcVT.getScalarType();
       // Only handle whole-128-bit-multiple sources whose element matches the
       // extracted scalar type.
45544  if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
45545    return SDValue();
45546
45547  SDLoc DL(Extract);
45548  SDValue MinPos = Src;
45549
45550  // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
45551  while (SrcVT.getSizeInBits() > 128) {
45552    SDValue Lo, Hi;
45553    std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
45554    SrcVT = Lo.getValueType();
45555    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
45556  }
45557  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
45558          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
45559         "Unexpected value type");
45560
45561  // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
45562  // to flip the value accordingly.
45563  SDValue Mask;
45564  unsigned MaskEltsBits = ExtractVT.getSizeInBits();
45565  if (BinOp == ISD::SMAX)
45566    Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
45567  else if (BinOp == ISD::SMIN)
45568    Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
45569  else if (BinOp == ISD::UMAX)
45570    Mask = DAG.getAllOnesConstant(DL, SrcVT);
45571
45572  if (Mask)
45573    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45574
45575  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
45576  // shuffling each upper element down and insert zeros. This means that the
45577  // v16i8 UMIN will leave the upper element as zero, performing zero-extension
45578  // ready for the PHMINPOS.
45579  if (ExtractVT == MVT::i8) {
45581        SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
45582        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
45583    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
45584  }
45585
45586  // Perform the PHMINPOS on a v8i16 vector,
45587  MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
45588  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
45589  MinPos = DAG.getBitcast(SrcVT, MinPos);
45590
       // XOR with the same mask again undoes the earlier flip, so the extracted
       // element is the true min/max value.
45591  if (Mask)
45592    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45593
45594  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
45595                     DAG.getVectorIdxConstant(0, DL));
45596}
45597
45598// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
45600                                            const X86Subtarget &Subtarget) {
45601  // Bail without SSE2.
45602  if (!Subtarget.hasSSE2())
45603    return SDValue();
45604
45605  EVT ExtractVT = Extract->getValueType(0);
45606  unsigned BitWidth = ExtractVT.getSizeInBits();
45607  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
45608      ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
45609    return SDValue();
45610
45611  // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
45612  ISD::NodeType BinOp;
45613  SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
       // XOR (parity) is only matched for i1 results.
45614  if (!Match && ExtractVT == MVT::i1)
45615    Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
45616  if (!Match)
45617    return SDValue();
45618
45619  // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
45620  // which we can't support here for now.
45621  if (Match.getScalarValueSizeInBits() != BitWidth)
45622    return SDValue();
45623
45624  SDValue Movmsk;
45625  SDLoc DL(Extract);
45626  EVT MatchVT = Match.getValueType();
45627  unsigned NumElts = MatchVT.getVectorNumElements();
45628  unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45629  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45630  LLVMContext &Ctx = *DAG.getContext();
45631
45632  if (ExtractVT == MVT::i1) {
45633    // Special case for (pre-legalization) vXi1 reductions.
45634    if (NumElts > 64 || !isPowerOf2_32(NumElts))
45635      return SDValue();
45636    if (Match.getOpcode() == ISD::SETCC) {
45637      ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45638      if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45639          (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45640        // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45641        // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45642        X86::CondCode X86CC;
       // Freeze to avoid duplicated poison/undef values diverging.
45643        SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45644        SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45645        APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45646        if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45647                                            DAG, X86CC))
45648          return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45649                             getSETCC(X86CC, V, DL, DAG));
45650      }
45651    }
45652    if (TLI.isTypeLegal(MatchVT)) {
45653      // If this is a legal AVX512 predicate type then we can just bitcast.
45654      EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45655      Movmsk = DAG.getBitcast(MovmskVT, Match);
45656    } else {
45657      // Use combineBitcastvxi1 to create the MOVMSK.
45658      while (NumElts > MaxElts) {
45659        SDValue Lo, Hi;
45660        std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45661        Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45662        NumElts /= 2;
45663      }
45664      EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45665      Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45666    }
45667    if (!Movmsk)
45668      return SDValue();
45669    Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45670  } else {
45671    // FIXME: Better handling of k-registers or 512-bit vectors?
45672    unsigned MatchSizeInBits = Match.getValueSizeInBits();
45673    if (!(MatchSizeInBits == 128 ||
45674          (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45675      return SDValue();
45676
45677    // Make sure this isn't a vector of 1 element. The perf win from using
45678    // MOVMSK diminishes with less elements in the reduction, but it is
45679    // generally better to get the comparison over to the GPRs as soon as
45680    // possible to reduce the number of vector ops.
45681    if (Match.getValueType().getVectorNumElements() < 2)
45682      return SDValue();
45683
45684    // Check that we are extracting a reduction of all sign bits.
45685    if (DAG.ComputeNumSignBits(Match) != BitWidth)
45686      return SDValue();
45687
45688    if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45689      SDValue Lo, Hi;
45690      std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45691      Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45692      MatchSizeInBits = Match.getValueSizeInBits();
45693    }
45694
45695    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45696    MVT MaskSrcVT;
45697    if (64 == BitWidth || 32 == BitWidth)
45699                                  MatchSizeInBits / BitWidth);
45700    else
45701      MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45702
45703    SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45704    Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45705    NumElts = MaskSrcVT.getVectorNumElements();
45706  }
45707  assert((NumElts <= 32 || NumElts == 64) &&
45708         "Not expecting more than 64 elements");
45709
       // The movmsk mask fits in i32 unless all 64 elements survived.
45710  MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45711  if (BinOp == ISD::XOR) {
45712    // parity -> (PARITY(MOVMSK X))
45713    SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45714    return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45715  }
45716
45717  SDValue CmpC;
45718  ISD::CondCode CondCode;
45719  if (BinOp == ISD::OR) {
45720    // any_of -> MOVMSK != 0
45721    CmpC = DAG.getConstant(0, DL, CmpVT);
45722    CondCode = ISD::CondCode::SETNE;
45723  } else {
45724    // all_of -> MOVMSK == ((1 << NumElts) - 1)
45725    CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45726                           DL, CmpVT);
45727    CondCode = ISD::CondCode::SETEQ;
45728  }
45729
45730  // The setcc produces an i8 of 0/1, so extend that to the result width and
45731  // negate to get the final 0/-1 mask value.
45732  EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45733  SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45734  SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45735  return DAG.getNegative(Zext, DL, ExtractVT);
45736}
45737
45739                                       const X86Subtarget &Subtarget) {
45740  if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45741    return SDValue();
45742
45743  EVT ExtractVT = Extract->getValueType(0);
45744  // Verify the type we're extracting is i32, as the output element type of
45745  // vpdpbusd is i32.
45746  if (ExtractVT != MVT::i32)
45747    return SDValue();
45748
45749  EVT VT = Extract->getOperand(0).getValueType();
45751    return SDValue();
45752
45753  // Match shuffle + add pyramid.
45754  ISD::NodeType BinOp;
45755  SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45756
45757  // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45758  // done by vpdpbusd compute a signed 16-bit product that will be sign extended
45759  // before adding into the accumulator.
45760  // TODO:
45761  // We also need to verify that the multiply has at least 2x the number of bits
45762  // of the input. We shouldn't match
45763  // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
45764  // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45765  //   Root = Root.getOperand(0);
45766
45767  // If there was a match, we want Root to be a mul.
45768  if (!Root || Root.getOpcode() != ISD::MUL)
45769    return SDValue();
45770
45771  // Check whether we have an extend and mul pattern
45772  SDValue LHS, RHS;
45773  if (!detectExtMul(DAG, Root, LHS, RHS))
45774    return SDValue();
45775
45776  // Create the dot product instruction.
45777  SDLoc DL(Extract);
45778  unsigned StageBias;
45779  SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45780
45781  // If the original vector was wider than 4 elements, sum over the results
45782  // in the DP vector.
45783  unsigned Stages = Log2_32(VT.getVectorNumElements());
45784  EVT DpVT = DP.getValueType();
45785
45786  if (Stages > StageBias) {
45787    unsigned DpElems = DpVT.getVectorNumElements();
45788
       // Halve the active lanes each step: shuffle the upper half down and add
       // it onto the lower half.
45789    for (unsigned i = Stages - StageBias; i > 0; --i) {
45790      SmallVector<int, 16> Mask(DpElems, -1);
45791      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45792        Mask[j] = MaskEnd + j;
45793
45794      SDValue Shuffle =
45795          DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45796      DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45797    }
45798  }
45799
45800  // Return the lowest ExtractSizeInBits bits.
45801  EVT ResVT =
45802      EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45803                       DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45804  DP = DAG.getBitcast(ResVT, DP);
45805  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45806                     Extract->getOperand(1));
45807}
45808
// Detect a sum-of-absolute-differences reduction feeding an i32/i64
// extract and lower it to the PSADBW instruction (requires SSE2).
// matchBinOpReduction finds the ADD reduction root, detectZextAbsDiff
// verifies the abs(sub(zext-from-i8 ...)) operands, and createPSADBW emits
// the instruction. For sources wider than 8 elements the partial SAD
// results are summed with a shuffle+add pyramid before the scalar is
// extracted. Returns SDValue() when the pattern does not apply.
// NOTE(review): this rendering of the file appears to have dropped the
// signature line and one early type-check line; code is otherwise
// unchanged.
45810                                       const X86Subtarget &Subtarget) {
45811   // PSADBW is only supported on SSE2 and up.
45812   if (!Subtarget.hasSSE2())
45813     return SDValue();
45814
45815   EVT ExtractVT = Extract->getValueType(0);
45816   // Verify the type we're extracting is either i32 or i64.
45817   // FIXME: Could support other types, but this is what we have coverage for.
45818   if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45819     return SDValue();
45820
45821   EVT VT = Extract->getOperand(0).getValueType();
45823     return SDValue();
45824
45825   // Match shuffle + add pyramid.
45826   ISD::NodeType BinOp;
45827   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45828
45829   // The operand is expected to be zero extended from i8
45830   // (verified in detectZextAbsDiff).
45831   // In order to convert to i64 and above, additional any/zero/sign
45832   // extend is expected.
45833   // The zero extend from 32 bit has no mathematical effect on the result.
45834   // Also the sign extend is basically zero extend
45835   // (extends the sign bit which is zero).
45836   // So it is correct to skip the sign/zero extend instruction.
45837   if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
45838                Root.getOpcode() == ISD::ZERO_EXTEND ||
45839                Root.getOpcode() == ISD::ANY_EXTEND))
45840     Root = Root.getOperand(0);
45841
45842   // If there was a match, we want Root to be the ABS node at the root of
45843   // an abs-diff pattern.
45844   if (!Root || Root.getOpcode() != ISD::ABS)
45845     return SDValue();
45846
45847   // Check whether we have an abs-diff pattern feeding into the ABS node.
45848   SDValue Zext0, Zext1;
45849   if (!detectZextAbsDiff(Root, Zext0, Zext1))
45850     return SDValue();
45851
45852   // Create the SAD instruction.
45853   SDLoc DL(Extract);
45854   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45855
45856   // If the original vector was wider than 8 elements, sum over the results
45857   // in the SAD vector.
45858   unsigned Stages = Log2_32(VT.getVectorNumElements());
45859   EVT SadVT = SAD.getValueType();
45860   if (Stages > 3) {
45861     unsigned SadElems = SadVT.getVectorNumElements();
45862
    // Log2 reduction tree: add the upper half of the live elements into
    // the lower half on every step (remaining mask lanes are undef).
45863     for(unsigned i = Stages - 3; i > 0; --i) {
45864       SmallVector<int, 16> Mask(SadElems, -1);
45865       for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45866         Mask[j] = MaskEnd + j;
45867
45868       SDValue Shuffle =
45869           DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45870       SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45871     }
45872   }
45873
45874   unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45875   // Return the lowest ExtractSizeInBits bits.
45876   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45877                                SadVT.getSizeInBits() / ExtractSizeInBits);
45878   SAD = DAG.getBitcast(ResVT, SAD);
45879   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45880                      Extract->getOperand(1));
45881 }
45882
45883// If this extract is from a loaded vector value and will be used as an
45884// integer, that requires a potentially expensive XMM -> GPR transfer.
45885// Additionally, if we can convert to a scalar integer load, that will likely
45886// be folded into a subsequent integer op.
45887// Note: SrcVec might not have a VecVT type, but it must be the same size.
45888// Note: Unlike the related fold for this in DAGCombiner, this is not limited
45889// to a single-use of the loaded vector. For the reasons above, we
45890// expect this to be profitable even if it creates an extra load.
// Narrow an EXTRACT_VECTOR_ELT of a normally-loaded vector into a scalar
// integer load of just the extracted element (see the block comment above
// for the profitability rationale). Only fires after DAG legalization, for
// simple (non-atomic/non-volatile) loads, when the element type matches
// the result type and the extracted value is not likely to be put straight
// back into a vector. Returns the new scalar load, or SDValue().
// NOTE(review): this rendering of the file appears to have dropped two
// signature lines (the parameter list naming N/VecVT/SrcVec/Idx and the
// DCI parameter); code is otherwise unchanged.
45891 static SDValue
45893                              const SDLoc &dl, SelectionDAG &DAG,
45895   assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45896          "Only EXTRACT_VECTOR_ELT supported so far");
45897
45898   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45899   EVT VT = N->getValueType(0);
45900
  // If the extracted scalar feeds a store or is reinserted into a vector,
  // the XMM -> GPR transfer this fold tries to avoid never happens, so the
  // rewrite would not pay off.
45901   bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
45902     return Use->getOpcode() == ISD::STORE ||
45903            Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45904            Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45905   });
45906
45907   auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
45908   if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45909       VecVT.getVectorElementType() == VT &&
45910       VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
45911       DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
    // Compute the address of the selected element and narrow the memory
    // operand info (pointer offset + alignment) to the smaller access.
45912     SDValue NewPtr = TLI.getVectorElementPointer(
45913         DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
45914     unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
45915     MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45916     Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45917     SDValue Load =
45918         DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45919                     LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
    // Keep the new load correctly ordered relative to the original load's
    // chain users.
45920     DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45921     return Load;
45922   }
45923
45924   return SDValue();
45925 }
45926
45927// Attempt to peek through a target shuffle and extract the scalar from the
45928// source.
// Attempt to extract a scalar directly from the inputs of a target
// shuffle, bypassing the shuffle itself. Handles several special sources
// first (broadcast, one-use broadcast-load, scalar_to_vector, truncate),
// then resolves the shuffle mask via getTargetShuffleInputs, remaps the
// constant extraction index through the mask, and extracts from the
// underlying operand with a legal extract / PEXTRW / PEXTRB, falling back
// to combineExtractFromVectorLoad. Returns SDValue() if nothing applies.
// NOTE(review): this rendering of the file appears to have dropped the two
// signature lines, the local Ops/Mask declarations before the
// getTargetShuffleInputs call, and the line opening the
// combineExtractFromVectorLoad call at the end; code is otherwise
// unchanged.
45931                                         const X86Subtarget &Subtarget) {
45932   if (DCI.isBeforeLegalizeOps())
45933     return SDValue();
45934
45935   SDLoc dl(N);
45936   SDValue Src = N->getOperand(0);
45937   SDValue Idx = N->getOperand(1);
45938
45939   EVT VT = N->getValueType(0);
45940   EVT SrcVT = Src.getValueType();
45941   EVT SrcSVT = SrcVT.getVectorElementType();
45942   unsigned SrcEltBits = SrcSVT.getSizeInBits();
45943   unsigned NumSrcElts = SrcVT.getVectorNumElements();
45944
45945   // Don't attempt this for boolean mask vectors or unknown extraction indices.
45946   if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45947     return SDValue();
45948
  // Give up on out-of-range constant indices.
45949   const APInt &IdxC = N->getConstantOperandAPInt(1);
45950   if (IdxC.uge(NumSrcElts))
45951     return SDValue();
45952
45953   SDValue SrcBC = peekThroughBitcasts(Src);
45954
45955   // Handle extract(bitcast(broadcast(scalar_value))).
45956   if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45957     SDValue SrcOp = SrcBC.getOperand(0);
45958     EVT SrcOpVT = SrcOp.getValueType();
45959     if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45960         (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45961       unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45962       unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45963       // TODO support non-zero offsets.
45964       if (Offset == 0) {
45965         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45966         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45967         return SrcOp;
45968       }
45969     }
45970   }
45971
45972   // If we're extracting a single element from a broadcast load and there are
45973   // no other users, just create a single load.
45974   if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45975     auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45976     unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45977     if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45978         VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45979       SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45980                                  MemIntr->getBasePtr(),
45981                                  MemIntr->getPointerInfo(),
45982                                  MemIntr->getOriginalAlign(),
45983                                  MemIntr->getMemOperand()->getFlags());
      // Re-route the broadcast-load's chain output to the new load.
45984       DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45985       return Load;
45986     }
45987   }
45988
45989   // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45990   // TODO: Move to DAGCombine?
45991   if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45992       SrcBC.getValueType().isInteger() &&
45993       (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45994       SrcBC.getScalarValueSizeInBits() ==
45995           SrcBC.getOperand(0).getValueSizeInBits()) {
45996     unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45997     if (IdxC.ult(Scale)) {
      // Shift the requested sub-element down to bit 0, then truncate.
45998       unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45999       SDValue Scl = SrcBC.getOperand(0);
46000       EVT SclVT = Scl.getValueType();
46001       if (Offset) {
46002         Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46003                           DAG.getShiftAmountConstant(Offset, SclVT, dl));
46004       }
46005       Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46006       Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46007       return Scl;
46008     }
46009   }
46010
46011   // Handle extract(truncate(x)) for 0'th index.
46012   // TODO: Treat this as a faux shuffle?
46013   // TODO: When can we use this for general indices?
46014   if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46015       (SrcVT.getSizeInBits() % 128) == 0) {
46016     Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46017     MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46018     return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46019                        Idx);
46020   }
46021
46022   // We can only legally extract other elements from 128-bit vectors and in
46023   // certain circumstances, depending on SSE-level.
46024   // TODO: Investigate float/double extraction if it will be just stored.
46025   auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46026                                                  unsigned Idx) {
46027     EVT VecSVT = VecVT.getScalarType();
    // For 256/512-bit integer vectors, first narrow to the 128-bit lane
    // containing the element and make Idx lane-relative.
46028     if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46029         (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46030          VecSVT == MVT::i64)) {
46031       unsigned EltSizeInBits = VecSVT.getSizeInBits();
46032       unsigned NumEltsPerLane = 128 / EltSizeInBits;
46033       unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46034       unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46035       VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46036       Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46037       Idx &= (NumEltsPerLane - 1);
46038     }
46039     if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46040         ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46041       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46042                          DAG.getBitcast(VecVT, Vec),
46043                          DAG.getVectorIdxConstant(Idx, dl));
46044     }
46045     if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46046         (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46047       unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46048       return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46049                          DAG.getTargetConstant(Idx, dl, MVT::i8));
46050     }
46051     return SDValue();
46052   };
46053
46054   // Resolve the target shuffle inputs and mask.
46057   if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46058     return SDValue();
46059
46060   // Shuffle inputs must be the same size as the result.
46061   if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46062         return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46063       }))
46064     return SDValue();
46065
46066   // Attempt to narrow/widen the shuffle mask to the correct size.
46067   if (Mask.size() != NumSrcElts) {
46068     if ((NumSrcElts % Mask.size()) == 0) {
46069       SmallVector<int, 16> ScaledMask;
46070       int Scale = NumSrcElts / Mask.size();
46071       narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46072       Mask = std::move(ScaledMask);
46073     } else if ((Mask.size() % NumSrcElts) == 0) {
46074       // Simplify Mask based on demanded element.
46075       int ExtractIdx = (int)IdxC.getZExtValue();
46076       int Scale = Mask.size() / NumSrcElts;
46077       int Lo = Scale * ExtractIdx;
46078       int Hi = Scale * (ExtractIdx + 1);
46079       for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46080         if (i < Lo || Hi <= i)
46081           Mask[i] = SM_SentinelUndef;
46082
46083       SmallVector<int, 16> WidenedMask;
46084       while (Mask.size() > NumSrcElts &&
46085              canWidenShuffleElements(Mask, WidenedMask))
46086         Mask = std::move(WidenedMask);
46087     }
46088   }
46089
46090   // If narrowing/widening failed, see if we can extract+zero-extend.
46091   int ExtractIdx;
46092   EVT ExtractVT;
46093   if (Mask.size() == NumSrcElts) {
46094     ExtractIdx = Mask[IdxC.getZExtValue()];
46095     ExtractVT = SrcVT;
46096   } else {
46097     unsigned Scale = Mask.size() / NumSrcElts;
46098     if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46099       return SDValue();
46100     unsigned ScaledIdx = Scale * IdxC.getZExtValue();
    // Extract+zext is only valid if the remaining sub-elements of the
    // demanded element are undef/zero.
46101     if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46102       return SDValue();
46103     ExtractIdx = Mask[ScaledIdx];
46104     EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46105     ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46106     assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46107            "Failed to widen vector type");
46108   }
46109
46110   // If the shuffle source element is undef/zero then we can just accept it.
46111   if (ExtractIdx == SM_SentinelUndef)
46112     return DAG.getUNDEF(VT);
46113
46114   if (ExtractIdx == SM_SentinelZero)
46115     return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46116                                 : DAG.getConstant(0, dl, VT);
46117
  // Map the mask index back to the shuffle operand it selects from and to
  // an operand-relative element index.
46118   SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46119   ExtractIdx = ExtractIdx % Mask.size();
46120   if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46121     return DAG.getZExtOrTrunc(V, dl, VT);
46122
46123   if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46125             N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46126       return V;
46127
46128   return SDValue();
46129 }
46130
46131/// Extracting a scalar FP value from vector element 0 is free, so extract each
46132/// operand first, then perform the math as a scalar op.
// (See the doc comment above: extracting element 0 of an FP vector is
// free, so pull the extract through the vector FP op and do the math as a
// scalar op instead.) Two special cases are handled first — vector FP
// SETCC producing i1, and pre-legalization VSELECT — then a whitelist of
// unary/binary/ternary FP opcodes is scalarized for f16 (with FP16
// support), f32 and f64. Returns SDValue() when nothing applies.
// NOTE(review): this rendering of the file appears to have dropped the
// signature line, the DCI parameter line, the lines creating Ext0 in the
// VSELECT case, and the ExtOps SmallVector declaration; code is otherwise
// unchanged.
46134                                  const X86Subtarget &Subtarget,
46136   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46137   SDValue Vec = ExtElt->getOperand(0);
46138   SDValue Index = ExtElt->getOperand(1);
46139   EVT VT = ExtElt->getValueType(0);
46140   EVT VecVT = Vec.getValueType();
46141
46142   // TODO: If this is a unary/expensive/expand op, allow extraction from a
46143   // non-zero element because the shuffle+scalar op will be cheaper?
46144   if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46145     return SDValue();
46146
46147   // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46148   // extract, the condition code), so deal with those as a special-case.
46149   if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46150     EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46151     if (OpVT != MVT::f32 && OpVT != MVT::f64)
46152       return SDValue();
46153
46154     // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46155     SDLoc DL(ExtElt);
46156     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46157                                Vec.getOperand(0), Index);
46158     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46159                                Vec.getOperand(1), Index);
46160     return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46161   }
46162
  // Only scalar f16 (when the target has FP16), f32 and f64 results are
  // handled from here on.
46163   if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46164       VT != MVT::f64)
46165     return SDValue();
46166
46167   // Vector FP selects don't fit the pattern of FP math ops (because the
46168   // condition has a different type and we have to change the opcode), so deal
46169   // with those here.
46170   // FIXME: This is restricted to pre type legalization. If we loosen this we
46171   // need to convert vector bool to a scalar bool.
46172   if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46173       Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46174       Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
46175     assert(Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
46176            "Unexpected cond type for combine");
46177     // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46178     SDLoc DL(ExtElt);
46181                                Vec.getOperand(0), Index);
46182     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46183                                Vec.getOperand(1), Index);
46184     SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46185                                Vec.getOperand(2), Index);
46186     return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46187   }
46188
46189   // TODO: This switch could include FNEG and the x86-specific FP logic ops
46190   // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46191   // missed load folding and fma+fneg combining.
46192   switch (Vec.getOpcode()) {
46193   case ISD::FMA: // Begin 3 operands
46194   case ISD::FMAD:
46195   case ISD::FADD: // Begin 2 operands
46196   case ISD::FSUB:
46197   case ISD::FMUL:
46198   case ISD::FDIV:
46199   case ISD::FREM:
46200   case ISD::FCOPYSIGN:
46201   case ISD::FMINNUM:
46202   case ISD::FMAXNUM:
46203   case ISD::FMINNUM_IEEE:
46204   case ISD::FMAXNUM_IEEE:
46205   case ISD::FMAXIMUM:
46206   case ISD::FMINIMUM:
46207   case ISD::FMAXIMUMNUM:
46208   case ISD::FMINIMUMNUM:
46209   case X86ISD::FMAX:
46210   case X86ISD::FMIN:
46211   case ISD::FABS: // Begin 1 operand
46212   case ISD::FSQRT:
46213   case ISD::FRINT:
46214   case ISD::FCEIL:
46215   case ISD::FTRUNC:
46216   case ISD::FNEARBYINT:
46217   case ISD::FROUNDEVEN:
46218   case ISD::FROUND:
46219   case ISD::FFLOOR:
46220   case X86ISD::FRCP:
46221   case X86ISD::FRSQRT: {
46222     // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46223     SDLoc DL(ExtElt);
46225     for (SDValue Op : Vec->ops())
46226       ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46227     return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46228   }
46229   default:
46230     return SDValue();
46231   }
46232   llvm_unreachable("All opcodes should return within switch");
46233 }
46234
46235/// Try to convert a vector reduction sequence composed of binops and shuffles
46236/// into horizontal ops.
// (See the doc comment above.) Matches an ADD/MUL/FADD reduction rooted at
// this index-0 extract (via matchBinOpReduction) and lowers it to one of
// several faster sequences: vXi8 MUL -> widened vXi16 multiply tree,
// small vXi8 ADD -> PSADBW against zero, 0..255-valued wider zext ADD
// reductions -> truncate + PSADBW, and otherwise (F)HADD chains when
// shouldUseHorizontalOp says they are profitable.
// NOTE(review): this rendering of the file appears to have dropped the
// signature line and the line opening the DAG.getVectorShuffle call in the
// vXi8 add-reduction path; code is otherwise unchanged.
46238                                      const X86Subtarget &Subtarget) {
46239   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46240
46241   // We need at least SSE2 to anything here.
46242   if (!Subtarget.hasSSE2())
46243     return SDValue();
46244
46245   ISD::NodeType Opc;
46246   SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46247                                         {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46248   if (!Rdx)
46249     return SDValue();
46250
46251   SDValue Index = ExtElt->getOperand(1);
46252   assert(isNullConstant(Index) &&
46253          "Reduction doesn't end in an extract from index 0");
46254
46255   EVT VT = ExtElt->getValueType(0);
46256   EVT VecVT = Rdx.getValueType();
46257   if (VecVT.getScalarType() != VT)
46258     return SDValue();
46259
46260   SDLoc DL(ExtElt);
46261   unsigned NumElts = VecVT.getVectorNumElements();
46262   unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46263
46264   // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46265   auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46266     if (V.getValueType() == MVT::v4i8) {
      // With SSE41 we can zero the upper elements via a v4i32 insert into
      // an all-zero vector.
46267       if (ZeroExtend && Subtarget.hasSSE41()) {
46268         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46269                         DAG.getConstant(0, DL, MVT::v4i32),
46270                         DAG.getBitcast(MVT::i32, V),
46271                         DAG.getVectorIdxConstant(0, DL));
46272         return DAG.getBitcast(MVT::v16i8, V);
46273       }
46274       V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46275                       ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46276                                  : DAG.getUNDEF(MVT::v4i8));
46277     }
46278     return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46279                        DAG.getUNDEF(MVT::v8i8));
46280   };
46281
46282   // vXi8 mul reduction - promote to vXi16 mul reduction.
46283   if (Opc == ISD::MUL) {
46284     if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
46285       return SDValue();
46286     if (VecVT.getSizeInBits() >= 128) {
      // Unpack into lo/hi vXi16 halves, multiply them together, then keep
      // splitting+multiplying until the value fits in 128 bits.
46287       EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
46288       SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46289       SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46290       Lo = DAG.getBitcast(WideVT, Lo);
46291       Hi = DAG.getBitcast(WideVT, Hi);
46292       Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
46293       while (Rdx.getValueSizeInBits() > 128) {
46294         std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46295         Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
46296       }
46297     } else {
46298       Rdx = WidenToV16I8(Rdx, false);
46299       Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
46300       Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
46301     }
    // Final v8i16 shuffle+mul tree, then take the low byte of element 0.
46302     if (NumElts >= 8)
46303       Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46304                         DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46305                                              {4, 5, 6, 7, -1, -1, -1, -1}));
46306     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46307                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46308                                            {2, 3, -1, -1, -1, -1, -1, -1}));
46309     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46310                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46311                                            {1, -1, -1, -1, -1, -1, -1, -1}));
46312     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46313     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46314   }
46315
46316   // vXi8 add reduction - sub 128-bit vector.
46317   if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
46318     Rdx = WidenToV16I8(Rdx, true);
    // PSADBW against zero sums the byte lanes into 64-bit accumulators.
46319     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46320                       DAG.getConstant(0, DL, MVT::v16i8));
46321     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46322     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46323   }
46324
46325   // Must be a >=128-bit vector with pow2 elements.
46326   if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
46327     return SDValue();
46328
46329   // vXi8 add reduction - sum lo/hi halves then use PSADBW.
46330   if (VT == MVT::i8) {
46331     while (Rdx.getValueSizeInBits() > 128) {
46332       SDValue Lo, Hi;
46333       std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46334       VecVT = Lo.getValueType();
46335       Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
46336     }
46337     assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
46338
46340         MVT::v16i8, DL, Rdx, Rdx,
46341         {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
46342     Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
46343     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46344                       getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
46345     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46346     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46347   }
46348
46349   // See if we can use vXi8 PSADBW add reduction for larger zext types.
46350   // If the source vector values are 0-255, then we can use PSADBW to
46351   // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
46352   // TODO: See if its worth avoiding vXi16/i32 truncations?
46353   if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
46354       DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
46355       (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
46356        Subtarget.hasAVX512())) {
    // Narrow the known-byte-ranged values down to vXi8 first.
46357     if (Rdx.getValueType() == MVT::v8i16) {
46358       Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
46359                         DAG.getUNDEF(MVT::v8i16));
46360     } else {
46361       EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
46362       Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
46363       if (ByteVT.getSizeInBits() < 128)
46364         Rdx = WidenToV16I8(Rdx, true);
46365     }
46366
46367     // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46368     auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46369                             ArrayRef<SDValue> Ops) {
46370       MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46371       SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
46372       return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
46373     };
46374     MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
46375     Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
46376
46377     // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
46378     while (Rdx.getValueSizeInBits() > 128) {
46379       SDValue Lo, Hi;
46380       std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46381       VecVT = Lo.getValueType();
46382       Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
46383     }
46384     assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
46385
46386     if (NumElts > 8) {
46387       SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
46388       Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
46389     }
46390
46391     VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
46392     Rdx = DAG.getBitcast(VecVT, Rdx);
46393     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46394   }
46395
46396   // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
46397   if (!shouldUseHorizontalOp(true, DAG, Subtarget))
46398     return SDValue();
46399
46400   unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
46401
46402   // 256-bit horizontal instructions operate on 128-bit chunks rather than
46403   // across the whole vector, so we need an extract + hop preliminary stage.
46404   // This is the only step where the operands of the hop are not the same value.
46405   // TODO: We could extend this to handle 512-bit or even longer vectors.
46406   if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
46407       ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
46408     unsigned NumElts = VecVT.getVectorNumElements();
46409     SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
46410     SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
46411     Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
46412     VecVT = Rdx.getValueType();
46413   }
46414   if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
46415       !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
46416     return SDValue();
46417
46418   // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
46419   unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
46420   for (unsigned i = 0; i != ReductionSteps; ++i)
46421     Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
46422
46423   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46424 }
46425
46426/// Detect vector gather/scatter index generation and convert it from being a
46427/// bunch of shuffles and extracts into a somewhat faster sequence.
46428/// For i686, the best sequence is apparently storing the value and loading
46429/// scalars back, while for x64 we should use 64-bit extracts and shifts.
46432 const X86Subtarget &Subtarget) {
46433 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
46434 return NewOp;
46435
46436 SDValue InputVector = N->getOperand(0);
46437 SDValue EltIdx = N->getOperand(1);
46438 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
46439
46440 EVT SrcVT = InputVector.getValueType();
46441 EVT VT = N->getValueType(0);
46442 SDLoc dl(InputVector);
46443 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
46444 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46445 unsigned NumEltBits = VT.getScalarSizeInBits();
46446 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46447
46448 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
46449 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46450
46451 // Integer Constant Folding.
46452 if (CIdx && VT.isInteger()) {
46453 APInt UndefVecElts;
46454 SmallVector<APInt, 16> EltBits;
46455 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
46456 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
46457 EltBits, /*AllowWholeUndefs*/ true,
46458 /*AllowPartialUndefs*/ false)) {
46459 uint64_t Idx = CIdx->getZExtValue();
46460 if (UndefVecElts[Idx])
46461 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46462 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
46463 }
46464
46465 // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
46466 // Improves lowering of bool masks on rust which splits them into byte array.
46467 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
46468 SDValue Src = peekThroughBitcasts(InputVector);
46469 if (Src.getValueType().getScalarType() == MVT::i1 &&
46470 TLI.isTypeLegal(Src.getValueType())) {
46471 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
46472 SDValue Sub = DAG.getNode(
46473 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
46474 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
46475 return DAG.getBitcast(VT, Sub);
46476 }
46477 }
46478 }
46479
46480 if (IsPextr) {
46481 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
46482 DCI))
46483 return SDValue(N, 0);
46484
46485 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
46486 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
46487 InputVector.getOpcode() == X86ISD::PINSRW) &&
46488 InputVector.getOperand(2) == EltIdx) {
46489 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
46490 "Vector type mismatch");
46491 SDValue Scl = InputVector.getOperand(1);
46492 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
46493 return DAG.getZExtOrTrunc(Scl, dl, VT);
46494 }
46495
46496 // TODO - Remove this once we can handle the implicit zero-extension of
46497 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
46498 // combineBasicSADPattern.
46499 return SDValue();
46500 }
46501
46502 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
46503 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
46504 InputVector.getOpcode() == ISD::BITCAST &&
46505 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46506 isNullConstant(EltIdx) && InputVector.hasOneUse())
46507 return DAG.getBitcast(VT, InputVector);
46508
46509 // Detect mmx to i32 conversion through a v2i32 elt extract.
46510 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
46511 InputVector.getOpcode() == ISD::BITCAST &&
46512 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46513 isNullConstant(EltIdx) && InputVector.hasOneUse())
46514 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
46515 InputVector.getOperand(0));
46516
46517 // Check whether this extract is the root of a sum of absolute differences
46518 // pattern. This has to be done here because we really want it to happen
46519   // pre-legalization.
46520 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
46521 return SAD;
46522
46523 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
46524 return VPDPBUSD;
46525
46526 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
46527 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
46528 return Cmp;
46529
46530 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
46531 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
46532 return MinMax;
46533
46534 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
46535 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
46536 return V;
46537
46538 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
46539 return V;
46540
46541 if (CIdx)
46543 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
46544 dl, DAG, DCI))
46545 return V;
46546
46547 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
46548 // and then testing the relevant element.
46549 //
46550 // Note that we only combine extracts on the *same* result number, i.e.
46551 // t0 = merge_values a0, a1, a2, a3
46552 // i1 = extract_vector_elt t0, Constant:i64<2>
46553 // i1 = extract_vector_elt t0, Constant:i64<3>
46554 // but not
46555 // i1 = extract_vector_elt t0:1, Constant:i64<2>
46556 // since the latter would need its own MOVMSK.
46557 if (SrcVT.getScalarType() == MVT::i1) {
46558 bool IsVar = !CIdx;
46559 SmallVector<SDNode *, 16> BoolExtracts;
46560 unsigned ResNo = InputVector.getResNo();
46561 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
46562 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46563 Use->getOperand(0).getResNo() == ResNo &&
46564 Use->getValueType(0) == MVT::i1) {
46565 BoolExtracts.push_back(Use);
46566 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
46567 return true;
46568 }
46569 return false;
46570 };
46571 // TODO: Can we drop the oneuse check for constant extracts?
46572 if (all_of(InputVector->users(), IsBoolExtract) &&
46573 (IsVar || BoolExtracts.size() > 1)) {
46574 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
46575 if (SDValue BC =
46576 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
46577 for (SDNode *Use : BoolExtracts) {
46578 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
46579 // Mask = 1 << MaskIdx
46580 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
46581 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
46582 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
46583 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
46584 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
46585 DCI.CombineTo(Use, Res);
46586 }
46587 return SDValue(N, 0);
46588 }
46589 }
46590 }
46591
46592 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
46593 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
46594 SDValue TruncSrc = InputVector.getOperand(0);
46595 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
46596 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
46597 SDValue NewExt =
46598 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
46599 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
46600 }
46601 }
46602
46603 return SDValue();
46604}
46605
46606 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
46607 // This is more or less the reverse of combineBitcastvxi1.
// Strategy: broadcast the scalar iX across the vector lanes, AND each lane
// with a mask selecting that lane's source bit, SETCC the result against the
// mask to produce an all-ones/all-zeros lane, then sign-extend. For
// ZERO_EXTEND / ANY_EXTEND the sign-extended value is shifted right so only
// bit 0 remains. Returns SDValue() when the pattern or subtarget mismatches.
// NOTE(review): the declaration line carrying this function's name is not
// visible in this chunk — presumably combineToExtendBoolVectorInReg; confirm.
46609 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
46610 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
// Only the three integer extension opcodes are rewritten here.
46611 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
46612 Opcode != ISD::ANY_EXTEND)
46613 return SDValue();
// Run before operation legalization only.
46614 if (!DCI.isBeforeLegalizeOps())
46615 return SDValue();
// Requires SSE2; skipped on AVX512, which has native mask registers.
46616 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46617 return SDValue();
46618
46619 EVT SVT = VT.getScalarType();
46620 EVT InSVT = N0.getValueType().getScalarType();
46621 unsigned EltSizeInBits = SVT.getSizeInBits();
46622
46623 // Input type must be extending a bool vector (bit-casted from a scalar
46624 // integer) to legal integer types.
46625 if (!VT.isVector())
46626 return SDValue();
46627 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
46628 return SDValue();
46629 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
46630 return SDValue();
46631
46632 SDValue N00 = N0.getOperand(0);
46633 EVT SclVT = N00.getValueType();
46634 if (!SclVT.isScalarInteger())
46635 return SDValue();
46636
46637 SDValue Vec;
46638 SmallVector<int> ShuffleMask;
46639 unsigned NumElts = VT.getVectorNumElements();
// One source bit per result element, so the scalar width must equal NumElts.
46640 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
46641
46642 // Broadcast the scalar integer to the vector elements.
46643 if (NumElts > EltSizeInBits) {
46644 // If the scalar integer is greater than the vector element size, then we
46645 // must split it down into sub-sections for broadcasting. For example:
46646 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46647 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46648 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46649 unsigned Scale = NumElts / EltSizeInBits;
46650 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
// Prefer a register/load broadcast on AVX2+; otherwise insert the scalar into
// element 0 and let the shuffle below replicate it.
46651 bool UseBroadcast = Subtarget.hasInt256() &&
46652 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
46653 Vec = UseBroadcast
46654 ? DAG.getSplat(BroadcastVT, DL, N00)
46655 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46656 Vec = DAG.getBitcast(VT, Vec);
46657
// Build a shuffle that repeats each sub-section across EltSizeInBits lanes;
// without a broadcast all sub-sections come from the single element 0.
46658 for (unsigned i = 0; i != Scale; ++i) {
46659 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
46660 ShuffleMask.append(EltSizeInBits, i + Offset);
46661 }
46662 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46663 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46664 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46665 // If we have register broadcast instructions, use the scalar size as the
46666 // element type for the shuffle. Then cast to the wider element type. The
46667 // widened bits won't be used, and this might allow the use of a broadcast
46668 // load.
46669 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46670 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
46671 (NumElts * EltSizeInBits) / NumElts);
46672 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
46673 } else {
46674 // For smaller scalar integers, we can simply any-extend it to the vector
46675 // element size (we don't care about the upper bits) and broadcast it to all
46676 // elements.
46677 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
46678 }
46679
46680 // Now, mask the relevant bit in each element.
46682 for (unsigned i = 0; i != NumElts; ++i) {
46683 int BitIdx = (i % EltSizeInBits);
46684 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46685 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46686 }
46687 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46688 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46689
46690 // Compare against the bitmask and extend the result.
// (Vec & Mask) == Mask yields an all-ones lane exactly when the source bit
// for that lane was set.
46691 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
46692 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46693 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46694
46695 // For SEXT, this is now done, otherwise shift the result down for
46696 // zero-extension.
46697 if (Opcode == ISD::SIGN_EXTEND)
46698 return Vec;
46699 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46700 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46701}
46702
46703 /// If a vector select has an operand that is -1 or 0, try to simplify the
46704 /// select to a bitwise logic operation.
46705 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
/// Folds performed (Cond must be an all-bits/zero 'sign splat' mask):
///   vselect Cond, -1, 0  --> bitcast(Cond)
///   vselect Cond, -1, X  --> or Cond, X
///   vselect Cond, X, 0   --> and Cond, X
///   vselect Cond, 0, X   --> andnp Cond, X (plain and-not for i1 vectors)
/// Returns SDValue() if no fold applies.
46706 static SDValue
46709 const X86Subtarget &Subtarget) {
46710 SDValue Cond = N->getOperand(0);
46711 SDValue LHS = N->getOperand(1);
46712 SDValue RHS = N->getOperand(2);
46713 EVT VT = LHS.getValueType();
46714 EVT CondVT = Cond.getValueType();
46715 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46716
46717 if (N->getOpcode() != ISD::VSELECT)
46718 return SDValue();
46719
46720 assert(CondVT.isVector() && "Vector select expects a vector selector!");
46721
46722 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46723 // TODO: Can we assert that both operands are not zeros (because that should
46724 // get simplified at node creation time)?
46725 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46726 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46727
46728 // If both inputs are 0/undef, create a complete zero vector.
46729 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46730 if (TValIsAllZeros && FValIsAllZeros) {
46731 if (VT.isFloatingPoint())
46732 return DAG.getConstantFP(0.0, DL, VT)
46733 return DAG.getConstant(0, DL, VT);
46734 }
46735
46736 // To use the condition operand as a bitwise mask, it must have elements that
46737 // are the same size as the select elements. Ie, the condition operand must
46738 // have already been promoted from the IR select condition type <N x i1>.
46739 // Don't check if the types themselves are equal because that excludes
46740 // vector floating-point selects.
46741 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46742 return SDValue();
46743
46744 // Try to invert the condition if true value is not all 1s and false value is
46745 // not all 0s. Only do this if the condition has one use.
46746 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46747 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46748 // Check if the selector will be produced by CMPP*/PCMP*.
46749 Cond.getOpcode() == ISD::SETCC &&
46750 // Check if SETCC has already been promoted.
46751 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46752 CondVT) {
46753 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46754
46755 if (TValIsAllZeros || FValIsAllOnes) {
// Invert the compare and swap the arms so one of the simpler folds below
// (all-ones true arm / all-zeros false arm) can fire.
46756 SDValue CC = Cond.getOperand(2);
46758 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46759 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46760 NewCC);
46761 std::swap(LHS, RHS);
46762 TValIsAllOnes = FValIsAllOnes;
46763 FValIsAllZeros = TValIsAllZeros;
46764 }
46765 }
46766
46767 // Cond value must be 'sign splat' to be converted to a logical op.
46768 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46769 return SDValue();
46770
46771 // vselect Cond, 111..., 000... -> Cond
46772 if (TValIsAllOnes && FValIsAllZeros)
46773 return DAG.getBitcast(VT, Cond);
46774
46775 if (!TLI.isTypeLegal(CondVT))
46776 return SDValue();
46777
46778 // vselect Cond, 111..., X -> or Cond, X
46779 if (TValIsAllOnes) {
46780 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46781 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46782 return DAG.getBitcast(VT, Or);
46783 }
46784
46785 // vselect Cond, X, 000... -> and Cond, X
46786 if (FValIsAllZeros) {
46787 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46788 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46789 return DAG.getBitcast(VT, And);
46790 }
46791
46792 // vselect Cond, 000..., X -> andn Cond, X
46793 if (TValIsAllZeros) {
46794 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46795 SDValue AndN;
46796 // The canonical form differs for i1 vectors - x86andnp is not used
46797 if (CondVT.getScalarType() == MVT::i1)
46798 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46799 CastRHS);
46800 else
46801 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46802 return DAG.getBitcast(VT, AndN);
46803 }
46804
46805 return SDValue();
46806}
46807
46808 /// If both arms of a vector select are concatenated vectors, split the select,
46809 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46810 /// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46811 /// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46813 const X86Subtarget &Subtarget) {
46814 unsigned Opcode = N->getOpcode();
// Applies to both generic VSELECT and X86-specific BLENDV nodes.
46815 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46816 return SDValue();
46817
46818 // TODO: Split 512-bit vectors too?
46819 EVT VT = N->getValueType(0);
46820 if (!VT.is256BitVector())
46821 return SDValue();
46822
46823 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46824 SDValue Cond = N->getOperand(0);
46825 SDValue TVal = N->getOperand(1);
46826 SDValue FVal = N->getOperand(2);
// Both arms must be single-use and free to split; otherwise splitting would
// duplicate work instead of removing a wide instruction.
46827 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46828 !isFreeToSplitVector(TVal.getNode(), DAG) ||
46829 !isFreeToSplitVector(FVal.getNode(), DAG))
46830 return SDValue();
46831
// Re-emit the original select opcode on each split half (result type taken
// from the split true-arm operand).
46832 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46833 ArrayRef<SDValue> Ops) {
46834 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46835 };
46836 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
46837 /*CheckBWI*/ false);
46838}
46839
/// Fold a scalar select of two integer constants into branchless math:
///   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
/// when |TC - FC| is a power of two (a shift) or an LEA-friendly multiplier
/// (3, 5 or 9 on i32/i64). Returns SDValue() if the fold does not apply.
/// NOTE(review): the declaration line with this function's name is not
/// visible in this chunk — presumably combineSelectOfTwoConstants; confirm.
46841 const SDLoc &DL) {
46842 SDValue Cond = N->getOperand(0);
46843 SDValue LHS = N->getOperand(1);
46844 SDValue RHS = N->getOperand(2);
46845
// Both arms must be integer constants.
46846 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46847 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46848 if (!TrueC || !FalseC)
46849 return SDValue();
46850
46851 // Don't do this for crazy integer types.
46852 EVT VT = N->getValueType(0);
46853 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46854 return SDValue();
46855
46856 // We're going to use the condition bit in math or logic ops. We could allow
46857 // this with a wider condition value (post-legalization it becomes an i8),
46858 // but if nothing is creating selects that late, it doesn't matter.
46859 if (Cond.getValueType() != MVT::i1)
46860 return SDValue();
46861
46862 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46863 // 3, 5, or 9 with i32/i64, so those get transformed too.
46864 // TODO: For constants that overflow or do not differ by power-of-2 or small
46865 // multiplier, convert to 'and' + 'add'.
46866 const APInt &TrueVal = TrueC->getAPIntValue();
46867 const APInt &FalseVal = FalseC->getAPIntValue();
46868
46869 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
// Bail out here so that the dedicated SBB lowering handles this shape.
46870 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46871 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46872 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46873 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46874 return SDValue();
46875 }
46876
// Signed overflow in TC - FC would make the math fold invalid.
46877 bool OV;
46878 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46879 if (OV)
46880 return SDValue();
46881
46882 APInt AbsDiff = Diff.abs();
46883 if (AbsDiff.isPowerOf2() ||
46884 ((VT == MVT::i32 || VT == MVT::i64) &&
46885 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46886
46887 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46888 // of the condition can usually be folded into a compare predicate, but even
46889 // without that, the sequence should be cheaper than a CMOV alternative.
46890 if (TrueVal.slt(FalseVal)) {
46891 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46892 std::swap(TrueC, FalseC);
46893 }
46894
46895 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
46896 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46897
46898 // Multiply condition by the difference if non-one.
46899 if (!AbsDiff.isOne())
46900 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46901
46902 // Add the base if non-zero.
46903 if (!FalseC->isZero())
46904 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46905
46906 return R;
46907 }
46908
46909 return SDValue();
46910}
46911
46912 /// If this is a *dynamic* select (non-constant condition) and we can match
46913 /// this node with one of the variable blend instructions, restructure the
46914 /// condition so that blends can use the high (sign) bit of each element.
46915 /// This function will also call SimplifyDemandedBits on already created
46916 /// BLENDV to perform additional simplifications.
/// Returns SDValue(N, 0) after rewriting all users in place, a new BLENDV
/// node, or SDValue() when nothing was changed.
46918 const SDLoc &DL,
46920 const X86Subtarget &Subtarget) {
46921 SDValue Cond = N->getOperand(0);
46922 if ((N->getOpcode() != ISD::VSELECT &&
46923 N->getOpcode() != X86ISD::BLENDV) ||
46925 return SDValue();
46926
46927 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46928 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46929 EVT VT = N->getValueType(0);
46930
46931 // We can only handle the cases where VSELECT is directly legal on the
46932 // subtarget. We custom lower VSELECT nodes with constant conditions and
46933 // this makes it hard to see whether a dynamic VSELECT will correctly
46934 // lower, so we both check the operation's status and explicitly handle the
46935 // cases where a *dynamic* blend will fail even though a constant-condition
46936 // blend could be custom lowered.
46937 // FIXME: We should find a better way to handle this class of problems.
46938 // Potentially, we should combine constant-condition vselect nodes
46939 // pre-legalization into shuffles and not mark as many types as custom
46940 // lowered.
46942 return SDValue();
46943 // FIXME: We don't support i16-element blends currently. We could and
46944 // should support them by making *all* the bits in the condition be set
46945 // rather than just the high bit and using an i8-element blend.
46946 if (VT.getVectorElementType() == MVT::i16)
46947 return SDValue();
46948 // Dynamic blending was only available from SSE4.1 onward.
46949 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46950 return SDValue();
46951 // Byte blends are only available in AVX2
46952 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46953 return SDValue();
46954 // There are no 512-bit blend instructions that use sign bits.
46955 if (VT.is512BitVector())
46956 return SDValue();
46957
46958 // Don't optimize before the condition has been transformed to a legal type
46959 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46960 if (BitWidth < 8 || BitWidth > 64)
46961 return SDValue();
46962
// True only if every use of Cond is as the condition (operand 0) of a
// VSELECT/BLENDV — then it is safe to weaken Cond to sign-bits only.
46963 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46964 for (SDUse &Use : Cond->uses())
46965 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
46966 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
46967 Use.getOperandNo() != 0)
46968 return false;
46969
46970 return true;
46971 };
46972
46974
46975 if (OnlyUsedAsSelectCond(Cond)) {
// Variable blends only read the sign bit of each condition element, so try
// to simplify Cond under that reduced demand.
46976 KnownBits Known;
46978 !DCI.isBeforeLegalizeOps());
46979 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46980 return SDValue();
46981
46982 // If we changed the computation somewhere in the DAG, this change will
46983 // affect all users of Cond. Update all the nodes so that we do not use
46984 // the generic VSELECT anymore. Otherwise, we may perform wrong
46985 // optimizations as we messed with the actual expectation for the vector
46986 // boolean values.
46987 for (SDNode *U : Cond->users()) {
46988 if (U->getOpcode() == X86ISD::BLENDV)
46989 continue;
46990
46991 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46992 Cond, U->getOperand(1), U->getOperand(2));
46993 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46994 DCI.AddToWorklist(U);
46995 }
46996 DCI.CommitTargetLoweringOpt(TLO);
// Signal that this node itself was handled via user rewriting.
46997 return SDValue(N, 0);
46998 }
46999
47000 // Otherwise we can still at least try to simplify multiple use bits.
47002 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47003 N->getOperand(1), N->getOperand(2));
47004
47005 return SDValue();
47006}
47007
47008 // Try to match:
47009 // (or (and (M, (sub 0, X)), (pandn M, X)))
47010 // which is a special case of:
47011 // (select M, (sub 0, X), X)
47012 // Per:
47013 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47014 // We know that, if fNegate is 0 or 1:
47015 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47016 //
47017 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47018 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47019 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
47020 // This lets us transform our vselect to:
47021 // (add (xor X, M), (and M, 1))
47022 // And further to:
47023 // (sub (xor X, M), M)
// Returns the (sub (xor ...), ...) replacement bitcast to VT, or SDValue().
// Mask must already be known to be all-zeros/all-ones per element (asserted).
47025 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47026 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47027 EVT MaskVT = Mask.getValueType();
47028 assert(MaskVT.isInteger() &&
47029 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47030 "Mask must be zero/all-bits");
47031
// Both select arms must share the mask's type so XOR/SUB stay bitwise-exact.
47032 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
47033 return SDValue();
47035 return SDValue();
47036
// Matches N == (sub 0, V), i.e. N is the negation of V.
47037 auto IsNegV = [](SDNode *N, SDValue V) {
47038 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
47039 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
47040 };
47041
// Identify which arm is the plain value and which is its negation.
47042 SDValue V;
47043 if (IsNegV(Y.getNode(), X))
47044 V = X;
47045 else if (IsNegV(X.getNode(), Y))
47046 V = Y;
47047 else
47048 return SDValue();
47049
47050 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47051 SDValue SubOp2 = Mask;
47052
47053 // If the negate was on the false side of the select, then
47054 // the operands of the SUB need to be swapped. PR 27251.
47055 // This is because the pattern being matched above is
47056 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47057 // but if the pattern matched was
47058 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47059 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47060 // pattern also needs to be a negation of the replacement pattern above.
47061 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47062 // sub accomplishes the negation of the replacement pattern.
47063 if (V == Y)
47064 std::swap(SubOp1, SubOp2);
47065
47066 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47067 return DAG.getBitcast(VT, Res);
47068}
47069
// On AVX512 only the true (LHS) arm of a vselect can be folded into a masked
// instruction. If only the RHS qualifies as a mask operation, invert the
// SETCC condition and swap the arms so the foldable value becomes the LHS:
//   (vselect M, L, R) -> (vselect ~M, R, L)
// Returns the commuted select, or SDValue() if the commute does not help.
47071 const X86Subtarget &Subtarget) {
47072 if (!Subtarget.hasAVX512())
47073 return SDValue();
47074 if (N->getOpcode() != ISD::VSELECT)
47075 return SDValue();
47076
47077 SDValue Cond = N->getOperand(0);
47078 SDValue LHS = N->getOperand(1);
47079 SDValue RHS = N->getOperand(2);
47080
// Already in the profitable form — nothing to do.
47081 if (canCombineAsMaskOperation(LHS, Subtarget))
47082 return SDValue();
47083
// Commuting only helps when the RHS is the foldable operand.
47084 if (!canCombineAsMaskOperation(RHS, Subtarget))
47085 return SDValue();
47086
// Only invert a single-use SETCC; otherwise we would duplicate the compare.
47087 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
47088 return SDValue();
47089
47090 // Commute LHS and RHS to create opportunity to select mask instruction.
47091 // (vselect M, L, R) -> (vselect ~M, R, L)
47092 ISD::CondCode NewCC =
47093 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
47094 Cond.getOperand(0).getValueType());
47095 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
47096 Cond.getOperand(1), NewCC);
47097 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47098}
47099
47100/// Do target-specific dag combines on SELECT and VSELECT nodes.
47103 const X86Subtarget &Subtarget) {
47104 SDLoc DL(N);
47105 SDValue Cond = N->getOperand(0);
47106 SDValue LHS = N->getOperand(1);
47107 SDValue RHS = N->getOperand(2);
47108
47109 // Try simplification again because we use this function to optimize
47110 // BLENDV nodes that are not handled by the generic combiner.
47111 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47112 return V;
47113
47114 // When avx512 is available the lhs operand of select instruction can be
47115 // folded with mask instruction, while the rhs operand can't. Commute the
47116 // lhs and rhs of the select instruction to create the opportunity of
47117 // folding.
47118 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47119 return V;
47120
47121 EVT VT = LHS.getValueType();
47122 EVT CondVT = Cond.getValueType();
47123 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47124 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47125
47126 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47127 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47128 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47129 if (CondVT.isVector() && CondVT.isInteger() &&
47130 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47131 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47134 DL, DAG, Subtarget))
47135 return V;
47136
47137 // Convert vselects with constant condition into shuffles.
47138 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
47139 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
47142 N->getOpcode() == X86ISD::BLENDV))
47143 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
47144 }
47145
47146 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47147 // by forcing the unselected elements to zero.
47148 // TODO: Can we handle more shuffles with this?
47149 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
47150 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
47151 LHS.hasOneUse() && RHS.hasOneUse()) {
47152 MVT SimpleVT = VT.getSimpleVT();
47153 SmallVector<SDValue, 1> LHSOps, RHSOps;
47154 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
47155 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
47156 getTargetShuffleMask(LHS, true, LHSOps, LHSMask) &&
47157 getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) {
47158 int NumElts = VT.getVectorNumElements();
47159 for (int i = 0; i != NumElts; ++i) {
47160 // getConstVector sets negative shuffle mask values as undef, so ensure
47161 // we hardcode SM_SentinelZero values to zero (0x80).
47162 if (CondMask[i] < NumElts) {
47163 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
47164 RHSMask[i] = 0x80;
47165 } else {
47166 LHSMask[i] = 0x80;
47167 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
47168 }
47169 }
47170 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
47171 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
47172 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
47173 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
47174 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
47175 }
47176 }
47177
47178 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47179 // instructions match the semantics of the common C idiom x<y?x:y but not
47180 // x<=y?x:y, because of how they handle negative zero (which can be
47181 // ignored in unsafe-math mode).
47182 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
47183 if ((Cond.getOpcode() == ISD::SETCC ||
47184 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47185 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47186 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47187 (Subtarget.hasSSE2() ||
47188 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47189 bool IsStrict = Cond->isStrictFPOpcode();
47191 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47192 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47193 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47194
47195 unsigned Opcode = 0;
47196 // Check for x CC y ? x : y.
47197 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47198 switch (CC) {
47199 default: break;
47200 case ISD::SETULT:
47201 // Converting this to a min would handle NaNs incorrectly, and swapping
47202 // the operands would cause it to handle comparisons between positive
47203 // and negative zero incorrectly.
47204 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47206 !(DAG.isKnownNeverZeroFloat(LHS) ||
47208 break;
47209 std::swap(LHS, RHS);
47210 }
47211 Opcode = X86ISD::FMIN;
47212 break;
47213 case ISD::SETOLE:
47214 // Converting this to a min would handle comparisons between positive
47215 // and negative zero incorrectly.
47218 break;
47219 Opcode = X86ISD::FMIN;
47220 break;
47221 case ISD::SETULE:
47222 // Converting this to a min would handle both negative zeros and NaNs
47223 // incorrectly, but we can swap the operands to fix both.
47224 std::swap(LHS, RHS);
47225 [[fallthrough]];
47226 case ISD::SETOLT:
47227 case ISD::SETLT:
47228 case ISD::SETLE:
47229 Opcode = X86ISD::FMIN;
47230 break;
47231
47232 case ISD::SETOGE:
47233 // Converting this to a max would handle comparisons between positive
47234 // and negative zero incorrectly.
47237 break;
47238 Opcode = X86ISD::FMAX;
47239 break;
47240 case ISD::SETUGT:
47241 // Converting this to a max would handle NaNs incorrectly, and swapping
47242 // the operands would cause it to handle comparisons between positive
47243 // and negative zero incorrectly.
47244 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47246 !(DAG.isKnownNeverZeroFloat(LHS) ||
47248 break;
47249 std::swap(LHS, RHS);
47250 }
47251 Opcode = X86ISD::FMAX;
47252 break;
47253 case ISD::SETUGE:
47254 // Converting this to a max would handle both negative zeros and NaNs
47255 // incorrectly, but we can swap the operands to fix both.
47256 std::swap(LHS, RHS);
47257 [[fallthrough]];
47258 case ISD::SETOGT:
47259 case ISD::SETGT:
47260 case ISD::SETGE:
47261 Opcode = X86ISD::FMAX;
47262 break;
47263 }
47264 // Check for x CC y ? y : x -- a min/max with reversed arms.
47265 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47266 switch (CC) {
47267 default: break;
47268 case ISD::SETOGE:
47269 // Converting this to a min would handle comparisons between positive
47270 // and negative zero incorrectly, and swapping the operands would
47271 // cause it to handle NaNs incorrectly.
47273 !(DAG.isKnownNeverZeroFloat(LHS) ||
47274 DAG.isKnownNeverZeroFloat(RHS))) {
47275 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47276 break;
47277 std::swap(LHS, RHS);
47278 }
47279 Opcode = X86ISD::FMIN;
47280 break;
47281 case ISD::SETUGT:
47282 // Converting this to a min would handle NaNs incorrectly.
47283 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47284 break;
47285 Opcode = X86ISD::FMIN;
47286 break;
47287 case ISD::SETUGE:
47288 // Converting this to a min would handle both negative zeros and NaNs
47289 // incorrectly, but we can swap the operands to fix both.
47290 std::swap(LHS, RHS);
47291 [[fallthrough]];
47292 case ISD::SETOGT:
47293 case ISD::SETGT:
47294 case ISD::SETGE:
47295 Opcode = X86ISD::FMIN;
47296 break;
47297
47298 case ISD::SETULT:
47299 // Converting this to a max would handle NaNs incorrectly.
47300 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47301 break;
47302 Opcode = X86ISD::FMAX;
47303 break;
47304 case ISD::SETOLE:
47305 // Converting this to a max would handle comparisons between positive
47306 // and negative zero incorrectly, and swapping the operands would
47307 // cause it to handle NaNs incorrectly.
47309 !DAG.isKnownNeverZeroFloat(LHS) &&
47310 !DAG.isKnownNeverZeroFloat(RHS)) {
47311 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47312 break;
47313 std::swap(LHS, RHS);
47314 }
47315 Opcode = X86ISD::FMAX;
47316 break;
47317 case ISD::SETULE:
47318 // Converting this to a max would handle both negative zeros and NaNs
47319 // incorrectly, but we can swap the operands to fix both.
47320 std::swap(LHS, RHS);
47321 [[fallthrough]];
47322 case ISD::SETOLT:
47323 case ISD::SETLT:
47324 case ISD::SETLE:
47325 Opcode = X86ISD::FMAX;
47326 break;
47327 }
47328 }
47329
47330 if (Opcode) {
47331 if (IsStrict) {
47332 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47334 DL, {N->getValueType(0), MVT::Other},
47335 {Cond.getOperand(0), LHS, RHS});
47336 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47337 return Ret;
47338 }
47339 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47340 }
47341 }
47342
47343 // Some mask scalar intrinsics rely on checking if only one bit is set
47344 // and implement it in C code like this:
47345 // A[0] = (U & 1) ? A[0] : W[0];
47346 // This creates some redundant instructions that break pattern matching.
47347 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
47348 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47349 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47350 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47351 SDValue AndNode = Cond.getOperand(0);
47352 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47353 isNullConstant(Cond.getOperand(1)) &&
47354 isOneConstant(AndNode.getOperand(1))) {
47355 // LHS and RHS swapped due to
47356 // setcc outputting 1 when AND resulted in 0 and vice versa.
47357 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47358 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47359 }
47360 }
47361
47362 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47363 // lowering on KNL. In this case we convert it to
47364 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47365 // The same situation all vectors of i8 and i16 without BWI.
47366 // Make sure we extend these even before type legalization gets a chance to
47367 // split wide vectors.
47368 // Since SKX these selects have a proper lowering.
47369 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47370 CondVT.getVectorElementType() == MVT::i1 &&
47371 (VT.getVectorElementType() == MVT::i8 ||
47372 VT.getVectorElementType() == MVT::i16)) {
47373 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47374 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47375 }
47376
47377 // AVX512 - Extend select to merge with target shuffle.
47378 // select(mask, extract_subvector(shuffle(x)), y) -->
47379 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47380 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47381 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47382 CondVT.getVectorElementType() == MVT::i1) {
47383 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47384 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47385 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47386 isNullConstant(Op.getOperand(1)) &&
47387 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47388 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47389 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47390 ISD::isBuildVectorAllZeros(Alt.getNode()));
47391 };
47392
47393 bool SelectableLHS = SelectableOp(LHS, RHS);
47394 bool SelectableRHS = SelectableOp(RHS, LHS);
47395 if (SelectableLHS || SelectableRHS) {
47396 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
47397 : RHS.getOperand(0).getValueType();
47398 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
47399 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
47400 VT.getSizeInBits());
47401 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
47402 VT.getSizeInBits());
47403 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
47404 DAG.getUNDEF(SrcCondVT), Cond,
47405 DAG.getVectorIdxConstant(0, DL));
47406 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
47407 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
47408 }
47409 }
47410
47411 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
47412 return V;
47413
47414 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47415 Cond.hasOneUse()) {
47416 EVT CondVT = Cond.getValueType();
47417 SDValue Cond0 = Cond.getOperand(0);
47418 SDValue Cond1 = Cond.getOperand(1);
47419 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47420
47421 // Canonicalize min/max:
47422 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47423 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47424 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
47425 // the need for an extra compare against zero. e.g.
47426 // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
47427 // subl %esi, %edi
47428 // testl %edi, %edi
47429 // movl $0, %eax
47430 // cmovgl %edi, %eax
47431 // =>
47432 // xorl %eax, %eax
47433 // subl %esi, $edi
47434 // cmovsl %eax, %edi
47435 //
47436 // We can also canonicalize
47437 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
47438 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
47439 // This allows the use of a test instruction for the compare.
47440 if (LHS == Cond0 && RHS == Cond1) {
47441 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
47444 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47445 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47446 }
47447 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
47448 ISD::CondCode NewCC = ISD::SETUGE;
47449 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47450 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47451 }
47452 }
47453
47454 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
47455 // fold eq + gt/lt nested selects into ge/le selects
47456 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
47457 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47458 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
47459 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47460 // .. etc ..
47461 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
47462 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
47463 SDValue InnerSetCC = RHS.getOperand(0);
47464 ISD::CondCode InnerCC =
47465 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
47466 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
47467 Cond0 == InnerSetCC.getOperand(0) &&
47468 Cond1 == InnerSetCC.getOperand(1)) {
47469 ISD::CondCode NewCC;
47470 switch (CC == ISD::SETEQ ? InnerCC : CC) {
47471 // clang-format off
47472 case ISD::SETGT: NewCC = ISD::SETGE; break;
47473 case ISD::SETLT: NewCC = ISD::SETLE; break;
47474 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
47475 case ISD::SETULT: NewCC = ISD::SETULE; break;
47476 default: NewCC = ISD::SETCC_INVALID; break;
47477 // clang-format on
47478 }
47479 if (NewCC != ISD::SETCC_INVALID) {
47480 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
47481 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
47482 }
47483 }
47484 }
47485 }
47486
47487 // Check if the first operand is all zeros and Cond type is vXi1.
47488 // If this an avx512 target we can improve the use of zero masking by
47489 // swapping the operands and inverting the condition.
47490 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
47491 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
47492 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
47493 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
47494 // Invert the cond to not(cond) : xor(op,allones)=not(op)
47495 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
47496 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
47497 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
47498 }
47499
47500 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
47501 // get split by legalization.
47502 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
47503 CondVT.getVectorElementType() == MVT::i1 &&
47504 TLI.isTypeLegal(VT.getScalarType())) {
47505 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
47507 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
47508 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
47509 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
47510 }
47511 }
47512
47513 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
47514 // with out-of-bounds clamping.
47515
47516 // Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle
47517 // shift amounts exceeding the element bitwidth. VSHLV/VSRLV clamps the amount
47518 // to bitwidth-1 for unsigned shifts, effectively performing a maximum left
47519 // shift of bitwidth-1 positions. and returns zero for unsigned right shifts
47520 // exceeding bitwidth-1.
47521 if (N->getOpcode() == ISD::VSELECT) {
47522 using namespace llvm::SDPatternMatch;
47523 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
47524 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
47525 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
47526 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
47528 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
47530 m_SpecificCondCode(ISD::SETULT)))) {
47531 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
47532 : X86ISD::VSHLV,
47533 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
47534 }
47535 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
47536 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
47537 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
47538 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
47540 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
47542 m_SpecificCondCode(ISD::SETUGE)))) {
47543 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
47544 : X86ISD::VSHLV,
47545 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
47546 }
47547 }
47548
47549 // Early exit check
47550 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
47551 return SDValue();
47552
47553 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DL, DCI, Subtarget))
47554 return V;
47555
47556 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
47557 return V;
47558
47559 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
47560 return V;
47561
47562 // select(~Cond, X, Y) -> select(Cond, Y, X)
47563 if (CondVT.getScalarType() != MVT::i1) {
47564 if (SDValue CondNot = IsNOT(Cond, DAG))
47565 return DAG.getNode(N->getOpcode(), DL, VT,
47566 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
47567
47568 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
47569 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
47570 Cond.getOperand(0).getOpcode() == ISD::AND &&
47571 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
47572 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
47573 Cond.getScalarValueSizeInBits(),
47574 /*AllowUndefs=*/true) &&
47575 Cond.hasOneUse()) {
47576 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
47577 Cond.getOperand(0).getOperand(1));
47578 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47579 }
47580
47581 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
47582 // signbit.
47583 if (Cond.getOpcode() == X86ISD::PCMPGT &&
47584 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
47585 Cond.hasOneUse()) {
47586 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
47587 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
47588 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47589 }
47590 }
47591
47592 // Try to optimize vXi1 selects if both operands are either all constants or
47593 // bitcasts from scalar integer type. In that case we can convert the operands
47594 // to integer and use an integer select which will be converted to a CMOV.
47595 // We need to take a little bit of care to avoid creating an i64 type after
47596 // type legalization.
47597 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
47598 VT.getVectorElementType() == MVT::i1 &&
47599 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
47601 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
47602 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
47603 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
47604
47605 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
47606 LHS.getOperand(0).getValueType() == IntVT)) &&
47607 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
47608 RHS.getOperand(0).getValueType() == IntVT))) {
47609 if (LHSIsConst)
47611 else
47612 LHS = LHS.getOperand(0);
47613
47614 if (RHSIsConst)
47616 else
47617 RHS = RHS.getOperand(0);
47618
47619 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
47620 return DAG.getBitcast(VT, Select);
47621 }
47622 }
47623 }
47624
47625 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
47626 // single bits, then invert the predicate and swap the select operands.
47627 // This can lower using a vector shift bit-hack rather than mask and compare.
47628 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
47629 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
47630 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
47631 Cond.getOperand(0).getOpcode() == ISD::AND &&
47632 isNullOrNullSplat(Cond.getOperand(1)) &&
47633 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
47634 Cond.getOperand(0).getValueType() == VT) {
47635 // The 'and' mask must be composed of power-of-2 constants.
47636 SDValue And = Cond.getOperand(0);
47637 auto *C = isConstOrConstSplat(And.getOperand(1));
47638 if (C && C->getAPIntValue().isPowerOf2()) {
47639 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
47640 SDValue NotCond =
47641 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
47642 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
47643 }
47644
47645 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
47646 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
47647 // 16-bit lacks a proper blendv.
47648 unsigned EltBitWidth = VT.getScalarSizeInBits();
47649 bool CanShiftBlend =
47650 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
47651 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
47652 (Subtarget.hasXOP()));
47653 if (CanShiftBlend &&
47654 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
47655 return C->getAPIntValue().isPowerOf2();
47656 })) {
47657 // Create a left-shift constant to get the mask bits over to the sign-bit.
47658 SDValue Mask = And.getOperand(1);
47659 SmallVector<int, 32> ShlVals;
47660 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
47661 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
47662 ShlVals.push_back(EltBitWidth - 1 -
47663 MaskVal->getAPIntValue().exactLogBase2());
47664 }
47665 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
47666 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
47667 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
47668 SDValue NewCond =
47669 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
47670 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
47671 }
47672 }
47673
47674 return SDValue();
47675}
47676
47677/// Combine:
47678/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
47679/// to:
47680/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
47681/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
47682/// Note that this is only legal for some op/cc combinations.
// NOTE(review): the signature line of this function is truncated in this
// extract; from the body it takes the CMP-like node (Cmp), a condition code
// updated in place (CC), the DAG, and the subtarget — confirm against the
// full source.
47684 SelectionDAG &DAG,
47685 const X86Subtarget &Subtarget) {
47686 // This combine only operates on CMP-like nodes.
// X86ISD::SUB counts as CMP-like only when its arithmetic result (value 0)
// is unused, i.e. it exists solely to produce EFLAGS.
47687 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47688 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47689 return SDValue();
47690
47691 // Can't replace the cmp if it has more uses than the one we're looking at.
47692 // FIXME: We would like to be able to handle this, but would need to make sure
47693 // all uses were updated.
47694 if (!Cmp.hasOneUse())
47695 return SDValue();
47696
47697 // This only applies to variations of the common case:
47698 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
47699 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
47700 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
47701 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
47702 // Using the proper condcodes (see below), overflow is checked for.
47703
47704 // FIXME: We can generalize both constraints:
47705 // - XOR/OR/AND (if they were made to survive AtomicExpand)
47706 // - LHS != 1
47707 // if the result is compared.
47708
47709 SDValue CmpLHS = Cmp.getOperand(0);
47710 SDValue CmpRHS = Cmp.getOperand(1);
47711 EVT CmpVT = CmpLHS.getValueType();
47712
// The atomic op must feed only this compare; otherwise we cannot discard its
// loaded value below.
47713 if (!CmpLHS.hasOneUse())
47714 return SDValue();
47715
// Only atomic add/sub survive AtomicExpand in a form we can fold here.
47716 unsigned Opc = CmpLHS.getOpcode();
47717 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
47718 return SDValue();
47719
// Operand 2 of an ATOMIC_LOAD_* node is the value operand (0 is the chain,
// 1 the pointer); it must be a constant addend.
47720 SDValue OpRHS = CmpLHS.getOperand(2);
47721 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
47722 if (!OpRHSC)
47723 return SDValue();
47724
// Canonicalize to an "add Addend" view: a subtract of C is an add of -C.
47725 APInt Addend = OpRHSC->getAPIntValue();
47726 if (Opc == ISD::ATOMIC_LOAD_SUB)
47727 Addend = -Addend;
47728
47729 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
47730 if (!CmpRHSC)
47731 return SDValue();
47732
47733 APInt Comparison = CmpRHSC->getAPIntValue();
47734 APInt NegAddend = -Addend;
47735
47736 // See if we can adjust the CC to make the comparison match the negated
47737 // addend.
// Nudge the comparison constant by +/-1 while compensating with the adjacent
// condition code (A<->AE, LE<->L); the isMax/isMin guards reject the wrap
// cases where the +/-1 adjustment would overflow and change semantics.
47738 if (Comparison != NegAddend) {
47739 APInt IncComparison = Comparison + 1;
47740 if (IncComparison == NegAddend) {
47741 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
47742 Comparison = IncComparison;
47743 CC = X86::COND_AE;
47744 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
47745 Comparison = IncComparison;
47746 CC = X86::COND_L;
47747 }
47748 }
47749 APInt DecComparison = Comparison - 1;
47750 if (DecComparison == NegAddend) {
47751 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
47752 Comparison = DecComparison;
47753 CC = X86::COND_A;
47754 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
47755 Comparison = DecComparison;
47756 CC = X86::COND_LE;
47757 }
47758 }
47759 }
47760
47761 // If the addend is the negation of the comparison value, then we can do
47762 // a full comparison by emitting the atomic arithmetic as a locked sub.
47763 if (Comparison == NegAddend) {
47764 // The CC is fine, but we need to rewrite the LHS of the comparison as an
47765 // atomic sub.
47766 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
47767 auto AtomicSub = DAG.getAtomic(
47768 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
47769 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
47770 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
47771 AN->getMemOperand());
47772 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
// The loaded value (value 0) had only this compare as a user (checked
// above), so it may become UNDEF; the chain (value 1) is re-routed to the
// new LOCKed operation.
47773 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47774 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47775 return LockOp;
47776 }
47777
47778 // We can handle comparisons with zero in a number of cases by manipulating
47779 // the CC used.
47780 if (!Comparison.isZero())
47781 return SDValue();
47782
// These remaps realize the four icmp variations listed in the header
// comment: the compare-with-zero of the *new* value is recovered from the
// flags of the +/-1 atomic update by shifting the condition code by one.
47783 if (CC == X86::COND_S && Addend == 1)
47784 CC = X86::COND_LE;
47785 else if (CC == X86::COND_NS && Addend == 1)
47786 CC = X86::COND_G;
47787 else if (CC == X86::COND_G && Addend == -1)
47788 CC = X86::COND_GE;
47789 else if (CC == X86::COND_LE && Addend == -1)
47790 CC = X86::COND_L;
47791 else
47792 return SDValue();
47793
// Re-emit the original atomic RMW as a LOCK-prefixed arithmetic op and make
// its EFLAGS the replacement for the compare.
47794 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47795 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47796 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47797 return LockOp;
47798}
47799
47800// Check whether we're just testing the signbit, and whether we can simplify
47801// this by tracking where the signbit came from.
// NOTE(review): this extract drops the function's signature line; the body
// uses a condition code CC, the compare node Cmp, and the DAG — confirm the
// exact parameter list against the full source.
47803 SelectionDAG &DAG) {
// Only sign-flag tests (S / NS) are sign-bit-only consumers of the compare.
47804 if (CC != X86::COND_S && CC != X86::COND_NS)
47805 return SDValue();
47806
47807 if (!Cmp.hasOneUse())
47808 return SDValue();
47809
47810 SDValue Src;
47811 if (Cmp.getOpcode() == X86ISD::CMP) {
47812 // CMP(X,0) -> signbit test
47813 if (!isNullConstant(Cmp.getOperand(1)))
47814 return SDValue();
47815 Src = Cmp.getOperand(0);
47816 // Peek through a SRA node as we just need the signbit.
47817 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
47818 // TODO: Use SimplifyDemandedBits instead of just SRA?
// An arithmetic shift right preserves the sign bit, so the pre-shift value
// has the same sign — test that instead.
47819 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
47820 return SDValue();
47821 Src = Src.getOperand(0);
47822 } else if (Cmp.getOpcode() == X86ISD::OR) {
47823 // OR(X,Y) -> see if only one operand contributes to the signbit.
47824 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
47825 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
47826 Src = Cmp.getOperand(1);
47827 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
47828 Src = Cmp.getOperand(0);
47829 else
47830 return SDValue();
47831 } else {
47832 return SDValue();
47833 }
47834
47835 // Replace with a TEST on the MSB.
47836 SDLoc DL(Cmp);
47837 MVT SrcVT = Src.getSimpleValueType();
47838 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
47839
47840 // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
47841 // peek through and adjust the TEST bit.
// A left shift by K moves pre-shift bit (MSB-K) into the sign position, so
// shift the test mask right by the same (known-valid) amount.
47842 if (Src.getOpcode() == ISD::SHL) {
47843 if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) {
47844 Src = Src.getOperand(0);
47845 BitMask.lshrInPlace(*ShiftAmt);
47846 }
47847 }
47848
// Emit AND-with-mask + CMP-against-0 as the TEST replacement.
// NOTE(review): a line between the AND and the return is missing from this
// extract — presumably the CC is remapped here (the sign test becomes a
// zero/non-zero test of the masked bit); verify against the full source.
47849 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
47850 DAG.getConstant(BitMask, DL, SrcVT));
47852 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
47853 DAG.getConstant(0, DL, SrcVT));
47854}
47855
47856// Check whether a boolean test is testing a boolean value generated by
47857// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47858// code.
47859//
47860// Simplify the following patterns:
47861// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47862// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47863// to (Op EFLAGS Cond)
47864//
47865// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47866// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47867// to (Op EFLAGS !Cond)
47868//
47869// where Op could be BRCOND or CMOV.
47870//
// NOTE(review): the signature line is missing from this extract; the body
// reads a compare node Cmp and a condition-code reference CC that is updated
// in place — confirm against the full source.
47872 // This combine only operates on CMP-like nodes.
47873 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47874 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47875 return SDValue();
47876
47877 // Quit if not used as a boolean value.
47878 if (CC != X86::COND_E && CC != X86::COND_NE)
47879 return SDValue();
47880
47881 // Check CMP operands. One of them should be 0 or 1 and the other should be
47882 // an SetCC or extended from it.
47883 SDValue Op1 = Cmp.getOperand(0);
47884 SDValue Op2 = Cmp.getOperand(1);
47885
47886 SDValue SetCC;
47887 const ConstantSDNode* C = nullptr;
// COND_E against 0 means "boolean is false", so the extracted condition must
// be inverted; comparing against 1 (below) flips this again.
47888 bool needOppositeCond = (CC == X86::COND_E);
47889 bool checkAgainstTrue = false; // Is it a comparison against 1?
47890
47891 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47892 SetCC = Op2;
47893 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47894 SetCC = Op1;
47895 else // Quit if all operands are not constants.
47896 return SDValue();
47897
47898 if (C->getZExtValue() == 1) {
47899 needOppositeCond = !needOppositeCond;
47900 checkAgainstTrue = true;
47901 } else if (C->getZExtValue() != 0)
47902 // Quit if the constant is neither 0 or 1.
47903 return SDValue();
47904
47905 bool truncatedToBoolWithAnd = false;
47906 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
// These wrappers don't change the boolean value; remember whether an
// 'and x, 1' was seen since that canonicalizes SETCC_CARRY's ~0 to 1.
47907 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47908 SetCC.getOpcode() == ISD::TRUNCATE ||
47909 SetCC.getOpcode() == ISD::AND) {
47910 if (SetCC.getOpcode() == ISD::AND) {
47911 int OpIdx = -1;
47912 if (isOneConstant(SetCC.getOperand(0)))
47913 OpIdx = 1;
47914 if (isOneConstant(SetCC.getOperand(1)))
47915 OpIdx = 0;
47916 if (OpIdx < 0)
47917 break;
47918 SetCC = SetCC.getOperand(OpIdx);
47919 truncatedToBoolWithAnd = true;
47920 } else
47921 SetCC = SetCC.getOperand(0);
47922 }
47923
// NOTE(review): several lines inside this switch (the SETCC_CARRY case
// label, an assert head, and the CC-assignment lines before each return)
// are missing from this extract; the surviving comments and returns show
// the intent but the exact statements must be checked in the full source.
47924 switch (SetCC.getOpcode()) {
47926 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47927 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47928 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47929 // truncated to i1 using 'and'.
47930 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47931 break;
47933 "Invalid use of SETCC_CARRY!");
47934 [[fallthrough]];
47935 case X86ISD::SETCC:
47936 // Set the condition code or opposite one if necessary.
47938 if (needOppositeCond)
// Operand 1 of X86ISD::SETCC is the EFLAGS value it tests.
47940 return SetCC.getOperand(1);
47941 case X86ISD::CMOV: {
47942 // Check whether false/true value has canonical one, i.e. 0 or 1.
47943 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47944 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47945 // Quit if true value is not a constant.
47946 if (!TVal)
47947 return SDValue();
47948 // Quit if false value is not a constant.
47949 if (!FVal) {
47950 SDValue Op = SetCC.getOperand(0);
47951 // Skip 'zext' or 'trunc' node.
47952 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47953 Op.getOpcode() == ISD::TRUNCATE)
47954 Op = Op.getOperand(0);
47955 // A special case for rdrand/rdseed, where 0 is set if false cond is
47956 // found.
47957 if ((Op.getOpcode() != X86ISD::RDRAND &&
47958 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47959 return SDValue();
47960 }
47961 // Quit if false value is not the constant 0 or 1.
47962 bool FValIsFalse = true;
47963 if (FVal && FVal->getZExtValue() != 0) {
47964 if (FVal->getZExtValue() != 1)
47965 return SDValue();
47966 // If FVal is 1, opposite cond is needed.
47967 needOppositeCond = !needOppositeCond;
47968 FValIsFalse = false;
47969 }
47970 // Quit if TVal is not the constant opposite of FVal.
47971 if (FValIsFalse && TVal->getZExtValue() != 1)
47972 return SDValue();
47973 if (!FValIsFalse && TVal->getZExtValue() != 0)
47974 return SDValue();
47976 if (needOppositeCond)
// Operand 3 of X86ISD::CMOV is its EFLAGS input.
47978 return SetCC.getOperand(3);
47979 }
47980 }
47981
47982 return SDValue();
47983}
47984
47985/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47986/// Match:
47987/// (X86or (X86setcc) (X86setcc))
47988/// (X86cmp (and (X86setcc) (X86setcc)), 0)
/// On success the two condition codes (CC0, CC1), the shared EFLAGS value
/// (Flags), and whether the combiner was AND or OR (isAnd) are returned via
/// the reference out-parameters.
// NOTE(review): the signature's first line is truncated in this extract;
// the visible tail shows parameters (..., X86::CondCode &CC1, SDValue
// &Flags, bool &isAnd) — confirm the leading parameters in the full source.
47990 X86::CondCode &CC1, SDValue &Flags,
47991 bool &isAnd) {
// Peel a wrapping (X86cmp X, 0) to expose the AND/OR underneath.
47992 if (Cond->getOpcode() == X86ISD::CMP) {
47993 if (!isNullConstant(Cond->getOperand(1)))
47994 return false;
47995
47996 Cond = Cond->getOperand(0);
47997 }
47998
47999 isAnd = false;
48000
48001 SDValue SetCC0, SetCC1;
48002 switch (Cond->getOpcode()) {
48003 default: return false;
48004 case ISD::AND:
48005 case X86ISD::AND:
48006 isAnd = true;
48007 [[fallthrough]];
48008 case ISD::OR:
48009 case X86ISD::OR:
48010 SetCC0 = Cond->getOperand(0);
48011 SetCC1 = Cond->getOperand(1);
48012 break;
48013 };
48014
48015 // Make sure we have SETCC nodes, using the same flags value.
// Operand 0 of X86ISD::SETCC is the condition-code constant, operand 1 the
// EFLAGS input; both SETCCs must read the same EFLAGS.
48016 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48017 SetCC1.getOpcode() != X86ISD::SETCC ||
48018 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48019 return false;
48020
48021 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48022 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48023 Flags = SetCC0->getOperand(1);
48024 return true;
48025}
48026
48027// When legalizing carry, we create carries via add X, -1
48028// If that comes from an actual carry, via setcc, we use the
48029// carry directly.
// NOTE(review): the signature line is missing from this extract; the body
// takes the EFLAGS-producing node (EFLAGS) and the DAG, and returns a
// replacement flags value or SDValue() — confirm against the full source.
48031 if (EFLAGS.getOpcode() == X86ISD::ADD) {
// Pattern: (add Carry, -1) sets CF exactly when Carry's low bit is 1.
48032 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48033 bool FoundAndLSB = false;
48034 SDValue Carry = EFLAGS.getOperand(0);
// Look through trunc/zext and 'and x, 1' wrappers that preserve bit 0,
// remembering whether an AND was seen (used for the BT fallback below).
48035 while (Carry.getOpcode() == ISD::TRUNCATE ||
48036 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48037 (Carry.getOpcode() == ISD::AND &&
48038 isOneConstant(Carry.getOperand(1)))) {
48039 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48040 Carry = Carry.getOperand(0);
48041 }
48042 if (Carry.getOpcode() == X86ISD::SETCC ||
48043 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48044 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48045 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48046 SDValue CarryOp1 = Carry.getOperand(1);
// setb (COND_B) reads CF directly — reuse the original flags value.
48047 if (CarryCC == X86::COND_B)
48048 return CarryOp1;
48049 if (CarryCC == X86::COND_A) {
48050 // Try to convert COND_A into COND_B in an attempt to facilitate
48051 // materializing "setb reg".
48052 //
48053 // Do not flip "e > c", where "c" is a constant, because Cmp
48054 // instruction cannot take an immediate as its first operand.
48055 //
// Commute the SUB's operands so "a > b" becomes "b < a", turning the
// COND_A read into a CF (COND_B) read of the commuted flags.
48056 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48057 CarryOp1.getNode()->hasOneUse() &&
48058 CarryOp1.getValueType().isInteger() &&
48059 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48060 SDValue SubCommute =
48061 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48062 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48063 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48064 }
48065 }
48066 // If this is a check of the z flag of an add with 1, switch to the
48067 // C flag.
48068 if (CarryCC == X86::COND_E &&
48069 CarryOp1.getOpcode() == X86ISD::ADD &&
48070 isOneConstant(CarryOp1.getOperand(1)))
48071 return CarryOp1;
48072 } else if (FoundAndLSB) {
// No SETCC source, but the carry was masked down to one bit: emit a BT
// on that bit (peeking through an SRL to pick the tested bit index).
48073 SDLoc DL(Carry);
48074 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48075 if (Carry.getOpcode() == ISD::SRL) {
48076 BitNo = Carry.getOperand(1);
48077 Carry = Carry.getOperand(0);
48078 }
48079 return getBT(Carry, BitNo, DL, DAG);
48080 }
48081 }
48082 }
48083
48084 return SDValue();
48085}
48086
48087/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48088/// to avoid the inversion.
48090 SelectionDAG &DAG,
48091 const X86Subtarget &Subtarget) {
48092 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48093 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48094 EFLAGS.getOpcode() != X86ISD::TESTP)
48095 return SDValue();
48096
48097 // PTEST/TESTP sets EFLAGS as:
48098 // TESTZ: ZF = (Op0 & Op1) == 0
48099 // TESTC: CF = (~Op0 & Op1) == 0
48100 // TESTNZC: ZF == 0 && CF == 0
48101 MVT VT = EFLAGS.getSimpleValueType();
48102 SDValue Op0 = EFLAGS.getOperand(0);
48103 SDValue Op1 = EFLAGS.getOperand(1);
48104 MVT OpVT = Op0.getSimpleValueType();
48105 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48106
48107 // TEST*(~X,Y) == TEST*(X,Y)
48108 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48109 X86::CondCode InvCC;
48110 switch (CC) {
48111 case X86::COND_B:
48112 // testc -> testz.
48113 InvCC = X86::COND_E;
48114 break;
48115 case X86::COND_AE:
48116 // !testc -> !testz.
48117 InvCC = X86::COND_NE;
48118 break;
48119 case X86::COND_E:
48120 // testz -> testc.
48121 InvCC = X86::COND_B;
48122 break;
48123 case X86::COND_NE:
48124 // !testz -> !testc.
48125 InvCC = X86::COND_AE;
48126 break;
48127 case X86::COND_A:
48128 case X86::COND_BE:
48129 // testnzc -> testnzc (no change).
48130 InvCC = CC;
48131 break;
48132 default:
48133 InvCC = X86::COND_INVALID;
48134 break;
48135 }
48136
48137 if (InvCC != X86::COND_INVALID) {
48138 CC = InvCC;
48139 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48140 DAG.getBitcast(OpVT, NotOp0), Op1);
48141 }
48142 }
48143
48144 if (CC == X86::COND_B || CC == X86::COND_AE) {
48145 // TESTC(X,~X) == TESTC(X,-1)
48146 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48147 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48148 SDLoc DL(EFLAGS);
48149 return DAG.getNode(
48150 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48151 DAG.getBitcast(OpVT,
48152 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48153 }
48154 }
48155 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48156 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48158 SDValue BC0 = peekThroughBitcasts(Op0);
48159 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48161 SDLoc DL(EFLAGS);
48163 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48164 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48165 }
48166 }
48167 }
48168
48169 if (CC == X86::COND_E || CC == X86::COND_NE) {
48170 // TESTZ(X,~Y) == TESTC(Y,X)
48171 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48173 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48174 DAG.getBitcast(OpVT, NotOp1), Op0);
48175 }
48176
48177 if (Op0 == Op1) {
48178 SDValue BC = peekThroughBitcasts(Op0);
48179 EVT BCVT = BC.getValueType();
48180
48181 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48182 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48183 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48184 DAG.getBitcast(OpVT, BC.getOperand(0)),
48185 DAG.getBitcast(OpVT, BC.getOperand(1)));
48186 }
48187
48188 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48189 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48191 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48192 DAG.getBitcast(OpVT, BC.getOperand(0)),
48193 DAG.getBitcast(OpVT, BC.getOperand(1)));
48194 }
48195
48196 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48197 // to more efficiently extract the sign bits and compare that.
48198 // TODO: Handle TESTC with comparison inversion.
48199 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48200 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48201 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48202 unsigned EltBits = BCVT.getScalarSizeInBits();
48203 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48204 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48205 APInt SignMask = APInt::getSignMask(EltBits);
48206 if (SDValue Res =
48207 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48208 // For vXi16 cases we need to use pmovmksb and extract every other
48209 // sign bit.
48210 SDLoc DL(EFLAGS);
48211 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48212 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48213 MVT FloatVT =
48214 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48215 Res = DAG.getBitcast(FloatVT, Res);
48216 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48217 } else if (EltBits == 16) {
48218 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48219 Res = DAG.getBitcast(MovmskVT, Res);
48220 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48221 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48222 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48223 } else {
48224 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48225 }
48226 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48227 DAG.getConstant(0, DL, MVT::i32));
48228 }
48229 }
48230 }
48231 }
48232
48233 // TESTZ(-1,X) == TESTZ(X,X)
48235 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48236
48237 // TESTZ(X,-1) == TESTZ(X,X)
48239 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48240
48241 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48242 // TODO: Add COND_NE handling?
48243 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48244 SDValue Src0 = peekThroughBitcasts(Op0);
48245 SDValue Src1 = peekThroughBitcasts(Op1);
48246 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48248 peekThroughBitcasts(Src0.getOperand(1)), true);
48250 peekThroughBitcasts(Src1.getOperand(1)), true);
48251 if (Src0 && Src1) {
48252 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48253 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48254 DAG.getBitcast(OpVT2, Src0),
48255 DAG.getBitcast(OpVT2, Src1));
48256 }
48257 }
48258 }
48259 }
48260
48261 return SDValue();
48262}
48263
48264// Attempt to simplify the MOVMSK input based on the comparison type.
// Folds an E/NE comparison of MOVMSK(V) against 0 ("any_of": is any mask bit
// set?) or against an all-ones element mask ("all_of": are all mask bits
// set?) into cheaper equivalents. May update \p CC in place (see the TESTP
// fold at the bottom). Returns the replacement EFLAGS-producing node, or
// SDValue() if no simplification applies.
// NOTE(review): the declaration line naming this function is not visible in
// this listing - presumably combineSetCCMOVMSK(SDValue EFLAGS,
// X86::CondCode &CC, ...); confirm against the full source.
48266                                  SelectionDAG &DAG,
48267                                  const X86Subtarget &Subtarget) {
48268  // Handle eq/ne against zero (any_of).
48269  // Handle eq/ne against -1 (all_of).
48270  if (!(CC == X86::COND_E || CC == X86::COND_NE))
48271    return SDValue();
48272  if (EFLAGS.getValueType() != MVT::i32)
48273    return SDValue();
48274  unsigned CmpOpcode = EFLAGS.getOpcode();
48275  if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48276    return SDValue();
  // Only constant comparands are handled; the constant tells us whether this
  // is the any_of (0) or all_of (mask) pattern.
48277  auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48278  if (!CmpConstant)
48279    return SDValue();
48280  const APInt &CmpVal = CmpConstant->getAPIntValue();
48281
48282  SDValue CmpOp = EFLAGS.getOperand(0);
48283  unsigned CmpBits = CmpOp.getValueSizeInBits();
48284  assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48285
48286  // Peek through any truncate.
48287  if (CmpOp.getOpcode() == ISD::TRUNCATE)
48288    CmpOp = CmpOp.getOperand(0);
48289
48290  // Bail if we don't find a MOVMSK.
48291  if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48292    return SDValue();
48293
48294  SDValue Vec = CmpOp.getOperand(0);
48295  MVT VecVT = Vec.getSimpleValueType();
48296  assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48297         "Unexpected MOVMSK operand");
48298  unsigned NumElts = VecVT.getVectorNumElements();
48299  unsigned NumEltBits = VecVT.getScalarSizeInBits();
48300
  // IsAllOf additionally requires NumElts <= CmpBits so a truncate we peeked
  // through cannot have discarded any of the mask bits being compared.
48301  bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48302  bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48303                 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48304  if (!IsAnyOf && !IsAllOf)
48305    return SDValue();
48306
48307  // TODO: Check more combining cases for me.
48308  // Here we check the cmp use number to decide do combining or not.
48309  // Currently we only get 2 tests about combining "MOVMSK(CONCAT(..))"
48310  // and "MOVMSK(PCMPEQ(..))" are fit to use this constraint.
48311  bool IsOneUse = CmpOp.getNode()->hasOneUse();
48312
48313  // See if we can peek through to a vector with a wider element type, if the
48314  // signbits extend down to all the sub-elements as well.
48315  // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48316  // potential SimplifyDemandedBits/Elts cases.
48317  // If we looked through a truncate that discard bits, we can't do this
48318  // transform.
48319  // FIXME: We could do this transform for truncates that discarded bits by
48320  // inserting an AND mask between the new MOVMSK and the CMP.
48321  if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48322    SDValue BC = peekThroughBitcasts(Vec);
48323    MVT BCVT = BC.getSimpleValueType();
48324    unsigned BCNumElts = BCVT.getVectorNumElements();
48325    unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48326    if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48327        BCNumEltBits > NumEltBits &&
48328        DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48329      SDLoc DL(EFLAGS);
      // all_of compares against the (narrower) BCNumElts-wide mask; any_of
      // still compares against zero.
48330      APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48331      return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48332                         DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48333                         DAG.getConstant(CmpMask, DL, MVT::i32));
48334    }
48335  }
48336
48337  // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48338  // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48339  // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48340  // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48341  if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
    // NOTE(review): the declaration of 'Ops' (the vector collectConcatOps
    // fills in) appears to be elided from this listing; confirm against the
    // full source.
48343    if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48344        Ops.size() == 2) {
48345      SDLoc DL(EFLAGS);
48346      EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48347      APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
      // any_of: a bit survives the OR iff it was set in either half.
      // all_of: a bit survives the AND iff it was set in both halves.
48348      SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48349                              DAG.getBitcast(SubVT, Ops[0]),
48350                              DAG.getBitcast(SubVT, Ops[1]));
48351      V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48352      return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48353                         DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48354                         DAG.getConstant(CmpMask, DL, MVT::i32));
48355    }
48356  }
48357
48358  // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48359  // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48360  // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48361  // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48362  if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48363    MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48364    SDValue BC = peekThroughBitcasts(Vec);
48365    // Ensure MOVMSK was testing every signbit of BC.
48366    if (BC.getValueType().getVectorNumElements() <= NumElts) {
48367      if (BC.getOpcode() == X86ISD::PCMPEQ) {
        // X == Y elementwise iff XOR(X,Y) is all-zero, which PTESTZ checks.
48368        SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48369                                BC.getOperand(0), BC.getOperand(1));
48370        V = DAG.getBitcast(TestVT, V);
48371        return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48372      }
48373      // Check for 256-bit split vector cases.
48374      if (BC.getOpcode() == ISD::AND &&
48375          BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48376          BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48377        SDValue LHS = BC.getOperand(0);
48378        SDValue RHS = BC.getOperand(1);
48379        LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48380                          LHS.getOperand(0), LHS.getOperand(1));
48381        RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48382                          RHS.getOperand(0), RHS.getOperand(1));
48383        LHS = DAG.getBitcast(TestVT, LHS);
48384        RHS = DAG.getBitcast(TestVT, RHS);
        // Both XORs must be all-zero, i.e. their OR must be all-zero.
48385        SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48386        return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48387      }
48388    }
48389  }
48390
48391  // See if we can avoid a PACKSS by calling MOVMSK on the sources.
48392  // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
48393  // sign bits prior to the comparison with zero unless we know that
48394  // the vXi16 splats the sign bit down to the lower i8 half.
48395  // TODO: Handle all_of patterns.
48396  if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
48397    SDValue VecOp0 = Vec.getOperand(0);
48398    SDValue VecOp1 = Vec.getOperand(1);
    // > 8 sign bits in a 16-bit lane means the sign bit is already splatted
    // into the low i8 half, so no 0xAAAA.. masking is needed below.
48399    bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
48400    bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
48401    // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48402    if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
48403      SDLoc DL(EFLAGS);
48404      SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
48405      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48406      Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
48407      if (!SignExt0) {
        // Keep only the high-i8-of-each-i16 mask bits (0xAAAA pattern).
48408        Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
48409                             DAG.getConstant(0xAAAA, DL, MVT::i16));
48410      }
48411      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48412                         DAG.getConstant(0, DL, MVT::i16));
48413    }
48414    // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
48415    // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48416    if (CmpBits >= 16 && Subtarget.hasInt256() &&
48417        (IsAnyOf || (SignExt0 && SignExt1))) {
48418      if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
48419        SDLoc DL(EFLAGS);
48420        SDValue Result = peekThroughBitcasts(Src);
        // all_of of a PCMPEQ source: reuse the PTEST fold from above.
48421        if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
48422            Result.getValueType().getVectorNumElements() <= NumElts) {
48423          SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
48424                                  Result.getOperand(0), Result.getOperand(1));
48425          V = DAG.getBitcast(MVT::v4i64, V);
48426          return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48427        }
48428        Result = DAG.getBitcast(MVT::v32i8, Result);
48429        Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48430        unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
48431        if (!SignExt0 || !SignExt1) {
48432          assert(IsAnyOf &&
48433                 "Only perform v16i16 signmasks for any_of patterns");
48434          Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
48435                               DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48436        }
48437        return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48438                           DAG.getConstant(CmpMask, DL, MVT::i32));
48439      }
48440    }
48441  }
48442
48443  // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
48444  // Since we peek through a bitcast, we need to be careful if the base vector
48445  // type has smaller elements than the MOVMSK type.  In that case, even if
48446  // all the elements are demanded by the shuffle mask, only the "high"
48447  // elements which have highbits that align with highbits in the MOVMSK vec
48448  // elements are actually demanded. A simplification of spurious operations
48449  // on the "low" elements take place during other simplifications.
48450  //
48451  // For example:
48452  // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))) even though all the elements are
48453  // demanded, because we are swapping around the result can change.
48454  //
48455  // To address this, we check that we can scale the shuffle mask to MOVMSK
48456  // element width (this will ensure "high" elements match). Its slightly overly
48457  // conservative, but fine for an edge case fold.
48458  SmallVector<int, 32> ShuffleMask;
48459  SmallVector<SDValue, 2> ShuffleInputs;
48460  if (NumElts <= CmpBits &&
48461      getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
48462                             ShuffleMask, DAG) &&
48463      ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
48464      ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
48465      canScaleShuffleElements(ShuffleMask, NumElts)) {
48466    SDLoc DL(EFLAGS);
48467    SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
48468    Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48469    Result =
48470        DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
48471    return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
48472  }
48473
48474  // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
48475  // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
48476  // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
48477  // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
48478  // iff every element is referenced.
48479  if (NumElts <= CmpBits && Subtarget.hasAVX() &&
48480      !Subtarget.preferMovmskOverVTest() && IsOneUse &&
48481      (NumEltBits == 32 || NumEltBits == 64)) {
48482    SDLoc DL(EFLAGS);
48483    MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
48484    MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
48485    MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
48486    SDValue LHS = Vec;
48487    SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
    // any_of keeps testing ZF; all_of switches to CF (TESTP sets CF when the
    // sign bits of ANDN(LHS,RHS) are all zero), hence the E/NE -> B/AE remap.
48488    CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48489    return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
48490                       DAG.getBitcast(FloatVT, LHS),
48491                       DAG.getBitcast(FloatVT, RHS));
48492  }
48493
48494  return SDValue();
48495}
48496
48497/// Optimize an EFLAGS definition used according to the condition code \p CC
48498/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
48499/// uses of chain values.
/// Dispatcher: tries each EFLAGS simplification in turn and returns the first
/// replacement found; otherwise falls through to the atomic-arith combine.
/// \p CC may be updated in place by the callees that take it by reference.
/// NOTE(review): the declaration line naming this function is not visible in
/// this listing - presumably combineSetCCEFLAGS(SDValue EFLAGS,
/// X86::CondCode &CC, ...); confirm against the full source.
48501                                  SelectionDAG &DAG,
48502                                  const X86Subtarget &Subtarget) {
  // COND_B of an ADD carry can be folded through to the ADD's carry output.
48503  if (CC == X86::COND_B)
48504    if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
48505      return Flags;
48506
48507  if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
48508    return R;
48509
48510  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
48511    return R;
48512
48513  if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
48514    return R;
48515
48516  if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
48517    return R;
48518
48519  return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
48520}
48521
48522/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
/// Peephole combines on CMOV: simplify the flag producer, fold selects of
/// integer constants into setcc+shift/add/LEA sequences, rewrite
/// constant-vs-compared-register selects, and merge and/or-of-setcc chains
/// into double CMOVs. Returns the replacement node or SDValue().
/// NOTE(review): the declaration lines naming this function and its first
/// parameters (SDNode *N, SelectionDAG &DAG, DCI) are not visible in this
/// listing; confirm against the full source.
48525                           const X86Subtarget &Subtarget) {
48526  SDLoc DL(N);
48527  EVT VT = N->getValueType(0);
  // Operand order is (false, true, cc, flags) - the opposite of ISD::SELECT.
48528  SDValue FalseOp = N->getOperand(0);
48529  SDValue TrueOp = N->getOperand(1);
48530  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
48531  SDValue Cond = N->getOperand(3);
48532
48533  // cmov X, X, ?, ? --> X
48534  if (TrueOp == FalseOp)
48535    return TrueOp;
48536
48537  // Try to simplify the EFLAGS and condition code operands.
48538  // We can't always do this as FCMOV only supports a subset of X86 cond.
48539  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
48540    if (!(FalseOp.getValueType() == MVT::f80 ||
48541          (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
48542          (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
48543        !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
48544      SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
48545                       Flags};
48546      return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
48547    }
48548  }
48549
48550  // If this is a select between two integer constants, try to do some
48551  // optimizations. Note that the operands are ordered the opposite of SELECT
48552  // operands.
48553  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
48554    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
48555      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
48556      // larger than FalseC (the false value).
48557      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        // NOTE(review): a statement inverting CC (presumably
        // CC = X86::GetOppositeBranchCondition(CC);) appears to be elided
        // from this listing; the swap below is only correct with it.
48559        std::swap(TrueC, FalseC);
48560        std::swap(TrueOp, FalseOp);
48561      }
48562
48563      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
48564      // This is efficient for any integer data type (including i8/i16) and
48565      // shift amount.
48566      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
48567        Cond = getSETCC(CC, Cond, DL, DAG);
48568
48569        // Zero extend the condition if needed.
48570        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
48571
48572        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
48573        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
48574                           DAG.getConstant(ShAmt, DL, MVT::i8));
48575        return Cond;
48576      }
48577
48578      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
48579      // for any integer data type, including i8/i16.
48580      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
48581        Cond = getSETCC(CC, Cond, DL, DAG);
48582
48583        // Zero extend the condition if needed.
        // NOTE(review): the first line of this getNode call (presumably
        // Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,) appears to be elided
        // from this listing.
48585                           FalseC->getValueType(0), Cond);
48586        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48587                           SDValue(FalseC, 0));
48588        return Cond;
48589      }
48590
48591      // Optimize cases that will turn into an LEA instruction. This requires
48592      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
48593      if (VT == MVT::i32 || VT == MVT::i64) {
48594        APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
48595        assert(Diff.getBitWidth() == VT.getSizeInBits() &&
48596               "Implicit constant truncation");
48597
48598        bool isFastMultiplier = false;
48599        if (Diff.ult(10)) {
48600          switch (Diff.getZExtValue()) {
48601          default: break;
48602          case 1: // result = add base, cond
48603          case 2: // result = lea base( , cond*2)
48604          case 3: // result = lea base(cond, cond*2)
48605          case 4: // result = lea base( , cond*4)
48606          case 5: // result = lea base(cond, cond*4)
48607          case 8: // result = lea base( , cond*8)
48608          case 9: // result = lea base(cond, cond*8)
48609            isFastMultiplier = true;
48610            break;
48611          }
48612        }
48613
48614        if (isFastMultiplier) {
48615          Cond = getSETCC(CC, Cond, DL ,DAG);
48616          // Zero extend the condition if needed.
48617          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
48618                             Cond);
48619          // Scale the condition by the difference.
48620          if (Diff != 1)
48621            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
48622                               DAG.getConstant(Diff, DL, Cond.getValueType()));
48623
48624          // Add the base if non-zero.
48625          if (FalseC->getAPIntValue() != 0)
48626            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48627                               SDValue(FalseC, 0));
48628          return Cond;
48629        }
48630      }
48631    }
48632  }
48633
48634  // Handle these cases:
48635  //   (select (x != c), e, c) -> select (x != c), e, x),
48636  //   (select (x == c), c, e) -> select (x == c), x, e)
48637  // where the c is an integer constant, and the "select" is the combination
48638  // of CMOV and CMP.
48639  //
48640  // The rationale for this change is that the conditional-move from a constant
48641  // needs two instructions, however, conditional-move from a register needs
48642  // only one instruction.
48643  //
48644  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
48645  //  some instruction-combining opportunities. This opt needs to be
48646  //  postponed as late as possible.
48647  //
48648  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
48649    // the DCI.xxxx conditions are provided to postpone the optimization as
48650    // late as possible.
48651
48652    ConstantSDNode *CmpAgainst = nullptr;
48653    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
48654        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
48655        !isa<ConstantSDNode>(Cond.getOperand(0))) {
48656
48657      if (CC == X86::COND_NE &&
48658          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
        // NOTE(review): a statement flipping CC to COND_E appears to be
        // elided from this listing; the swap below canonicalizes the NE case
        // into the COND_E form handled next.
48660        std::swap(TrueOp, FalseOp);
48661      }
48662
48663      if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
        // Replace the constant true-value by the register it was compared to.
48664        SDValue Ops[] = {FalseOp, Cond.getOperand(0),
48665                         DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
48666        return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
48667      }
48668    }
48669  }
48670
48671  // Transform:
48672  //
48673  //   (cmov 1 T (uge T 2))
48674  //
48675  // to:
48676  //
48677  //   (adc T 0 (sub T 1))
48678  if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
48679      Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
48680    SDValue Cond0 = Cond.getOperand(0);
48681    if (Cond0.getOpcode() == ISD::TRUNCATE)
48682      Cond0 = Cond0.getOperand(0);
48683    auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
48684    if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
48685      EVT CondVT = Cond->getValueType(0);
48686      // Subtract 1 and generate a carry.
48687      SDValue NewSub =
48688          DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
48689                      DAG.getConstant(1, DL, CondVT));
48690      SDValue EFLAGS(NewSub.getNode(), 1);
48691      return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
48692                         DAG.getConstant(0, DL, VT), EFLAGS);
48693    }
48694  }
48695
48696  // Fold and/or of setcc's to double CMOV:
48697  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
48698  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
48699  //
48700  // This combine lets us generate:
48701  //   cmovcc1 (jcc1 if we don't have CMOV)
48702  //   cmovcc2 (same)
48703  // instead of:
48704  //   setcc1
48705  //   setcc2
48706  //   and/or
48707  //   cmovne (jne if we don't have CMOV)
48708  // When we can't use the CMOV instruction, it might increase branch
48709  // mispredicts.
48710  // When we can use CMOV, or when there is no mispredict, this improves
48711  // throughput and reduces register pressure.
48712  //
48713  if (CC == X86::COND_NE) {
48714    SDValue Flags;
48715    X86::CondCode CC0, CC1;
48716    bool isAndSetCC;
48717    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
48718      if (isAndSetCC) {
48719        std::swap(FalseOp, TrueOp);
        // NOTE(review): statements inverting CC0 and CC1 (presumably via
        // X86::GetOppositeBranchCondition) appear to be elided from this
        // listing; the AND form requires the negated conditions.
48722      }
48723
48724      SDValue LOps[] = {FalseOp, TrueOp,
48725                        DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
48726      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
48727      SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
48728                       Flags};
48729      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
48730      return CMOV;
48731    }
48732  }
48733
48734  // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
48735  //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
48736  // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
48737  //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
48738  if ((CC == X86::COND_NE || CC == X86::COND_E) &&
48739      Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
48740    SDValue Add = TrueOp;
48741    SDValue Const = FalseOp;
48742    // Canonicalize the condition code for easier matching and output.
48743    if (CC == X86::COND_E)
48744      std::swap(Add, Const);
48745
48746    // We might have replaced the constant in the cmov with the LHS of the
48747    // compare. If so change it to the RHS of the compare.
48748    if (Const == Cond.getOperand(0))
48749      Const = Cond.getOperand(1);
48750
48751    // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
48752    if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
48753        Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
48754        (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
48755         Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
48756        Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
48757      // This should constant fold.
48758      SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
48759      SDValue CMov =
48760          DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
48761                      DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
48762      return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
48763    }
48764  }
48765
48766  return SDValue();
48767}
48768
48769/// Different mul shrinking modes.
48771
// Decide whether a vXi32 multiply can be narrowed to i16 lanes, and if so
// which ShrinkMode applies, based on the known sign bits / positivity of
// both operands. Writes the chosen mode to \p Mode and returns true; returns
// false (leaving \p Mode untouched) when no narrowing is valid.
// Threshold logic: with 32-bit lanes, >= 25 sign bits leaves <= 7 magnitude
// bits (fits signed i8); >= 24 sign bits on non-negative values fits
// unsigned i8; >= 17 fits signed i16; >= 16 non-negative fits unsigned i16.
// NOTE(review): the declaration line naming this function is not visible in
// this listing - presumably canReduceVMulWidth(SDNode *N, SelectionDAG &DAG,
// ShrinkMode &Mode); confirm against the full source.
48773  EVT VT = N->getOperand(0).getValueType();
48774  if (VT.getScalarSizeInBits() != 32)
48775    return false;
48776
48777  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
48778  unsigned SignBits[2] = {1, 1};
48779  bool IsPositive[2] = {false, false};
48780  for (unsigned i = 0; i < 2; i++) {
48781    SDValue Opd = N->getOperand(i);
48782
48783    SignBits[i] = DAG.ComputeNumSignBits(Opd);
48784    IsPositive[i] = DAG.SignBitIsZero(Opd);
48785  }
48786
48787  bool AllPositive = IsPositive[0] && IsPositive[1];
  // The narrower of the two operands constrains the achievable mode.
48788  unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
48789  // When ranges are from -128 ~ 127, use MULS8 mode.
48790  if (MinSignBits >= 25)
48791    Mode = ShrinkMode::MULS8;
48792  // When ranges are from 0 ~ 255, use MULU8 mode.
48793  else if (AllPositive && MinSignBits >= 24)
48794    Mode = ShrinkMode::MULU8;
48795  // When ranges are from -32768 ~ 32767, use MULS16 mode.
48796  else if (MinSignBits >= 17)
48797    Mode = ShrinkMode::MULS16;
48798  // When ranges are from 0 ~ 65535, use MULU16 mode.
48799  else if (AllPositive && MinSignBits >= 16)
48800    Mode = ShrinkMode::MULU16;
48801  else
48802    return false;
48803  return true;
48804}
48805
48806/// When the operands of vector mul are extended from smaller size values,
48807/// like i8 and i16, the type of mul may be shrinked to generate more
48808/// efficient code. Two typical patterns are handled:
48809/// Pattern1:
48810///     %2 = sext/zext <N x i8> %1 to <N x i32>
48811///     %4 = sext/zext <N x i8> %3 to <N x i32>
48812//      or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48813///     %5 = mul <N x i32> %2, %4
48814///
48815/// Pattern2:
48816///     %2 = zext/sext <N x i16> %1 to <N x i32>
48817///     %4 = zext/sext <N x i16> %3 to <N x i32>
48818///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48819///     %5 = mul <N x i32> %2, %4
48820///
48821/// There are four mul shrinking modes:
48822/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
48823/// -128 to 128, and the scalar value range of %4 is also -128 to 128,
48824/// generate pmullw+sext32 for it (MULS8 mode).
48825/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
48826/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
48827/// generate pmullw+zext32 for it (MULU8 mode).
48828/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
48829/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
48830/// generate pmullw+pmulhw for it (MULS16 mode).
48831/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
48832/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
48833/// generate pmullw+pmulhuw for it (MULU16 mode).
/// Returns the narrowed multiply (re-widened to the original type), or
/// SDValue() when narrowing is illegal or unprofitable.
/// NOTE(review): the declaration line naming this function is not visible in
/// this listing - presumably reduceVMULWidth(SDNode *N, const SDLoc &DL,
/// SelectionDAG &DAG, ...); confirm against the full source.
48835                               const X86Subtarget &Subtarget) {
48836  // Check for legality
48837  // pmullw/pmulhw are not supported by SSE.
48838  if (!Subtarget.hasSSE2())
48839    return SDValue();
48840
48841  // Check for profitability
48842  // pmulld is supported since SSE41. It is better to use pmulld
48843  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
48844  // the expansion.
48845  bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
48846  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
48847    return SDValue();
48848
48849  ShrinkMode Mode;
48850  if (!canReduceVMulWidth(N, DAG, Mode))
48851    return SDValue();
48852
48853  SDValue N0 = N->getOperand(0);
48854  SDValue N1 = N->getOperand(1);
48855  EVT VT = N->getOperand(0).getValueType();
48856  unsigned NumElts = VT.getVectorNumElements();
  // The unpack-based repack below needs an even element count.
48857  if ((NumElts % 2) != 0)
48858    return SDValue();
48859
48860  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
48861
48862  // Shrink the operands of mul.
48863  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
48864  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
48865
48866  // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
48867  // lower part is needed.
48868  SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48869  if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
    // NOTE(review): the middle line of this ternary (presumably
    // ": ISD::SIGN_EXTEND,") appears to be elided from this listing.
48870    return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48872                       DL, VT, MulLo);
48873
48874  EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48875  // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48876  // the higher part is also needed.
48877  SDValue MulHi =
48878      DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48879                  ReducedVT, NewN0, NewN1);
48880
48881  // Repack the lower part and higher part result of mul into a wider
48882  // result.
48883  // Generate shuffle functioning as punpcklwd.
48884  SmallVector<int, 16> ShuffleMask(NumElts);
48885  for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48886    ShuffleMask[2 * i] = i;
48887    ShuffleMask[2 * i + 1] = i + NumElts;
48888  }
48889  SDValue ResLo =
48890      DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48891  ResLo = DAG.getBitcast(ResVT, ResLo);
48892  // Generate shuffle functioning as punpckhwd.
48893  for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48894    ShuffleMask[2 * i] = i + NumElts / 2;
48895    ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48896  }
48897  SDValue ResHi =
48898      DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48899  ResHi = DAG.getBitcast(ResVT, ResHi);
48900  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48901}
48902
// Lower a multiply by selected "awkward" constants into short sequences of
// MUL_IMM (LEA-friendly multiplies by 3/5/9), shifts and adds/subs, which
// beat a general-purpose imul on these amounts. Returns the replacement DAG
// node, or SDValue() if MulAmt is not one of the handled constants.
// NOTE(review): the declaration line naming this function is not visible in
// this listing - presumably combineMulSpecial(uint64_t MulAmt, SDNode *N,
// SelectionDAG &DAG, ...); confirm against the full source.
48904                               EVT VT, const SDLoc &DL) {
48905
  // (x * Mult) << Shift, then +/- x.
48906  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48907    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48908                                 DAG.getConstant(Mult, DL, VT));
48909    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48910                         DAG.getConstant(Shift, DL, MVT::i8));
48911    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48912                         N->getOperand(0));
48913    return Result;
48914  };
48915
  // (x * Mul1) * Mul2, then +/- x.
48916  auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48917    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48918                                 DAG.getConstant(Mul1, DL, VT));
48919    Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48920                         DAG.getConstant(Mul2, DL, VT));
48921    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48922                         N->getOperand(0));
48923    return Result;
48924  };
48925
48926  switch (MulAmt) {
48927  default:
48928    break;
48929  case 11:
48930    // mul x, 11 => add ((shl (mul x, 5), 1), x)
48931    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48932  case 21:
48933    // mul x, 21 => add ((shl (mul x, 5), 2), x)
48934    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48935  case 41:
48936    // mul x, 41 => add ((shl (mul x, 5), 3), x)
48937    return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48938  case 22:
48939    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48940    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48941                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48942  case 19:
48943    // mul x, 19 => add ((shl (mul x, 9), 1), x)
48944    return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48945  case 37:
48946    // mul x, 37 => add ((shl (mul x, 9), 2), x)
48947    return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48948  case 73:
48949    // mul x, 73 => add ((shl (mul x, 9), 3), x)
48950    return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48951  case 13:
48952    // mul x, 13 => add ((shl (mul x, 3), 2), x)
48953    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48954  case 23:
48955    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48956    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48957  case 26:
48958    // mul x, 26 => add ((mul (mul x, 5), 5), x)
48959    return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48960  case 28:
48961    // mul x, 28 => add ((mul (mul x, 9), 3), x)
48962    return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48963  case 29:
48964    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48965    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48966                       combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48967  }
48968
48969  // Another trick. If this is a power 2 + 2/4/8, we can use a shift followed
48970  // by a single LEA.
48971  // First check if this a sum of two power of 2s because that's easy. Then
48972  // count how many zeros are up to the first bit.
48973  // TODO: We can do this even without LEA at a cost of two shifts and an add.
  // MulAmt & (MulAmt - 1) clears the lowest set bit; a power-of-2 remainder
  // means MulAmt had exactly two set bits.
48974  if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48975    unsigned ScaleShift = llvm::countr_zero(MulAmt);
    // ScaleShift 1..3 corresponds to the LEA scales 2/4/8.
48976    if (ScaleShift >= 1 && ScaleShift < 4) {
48977      unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48978      SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48979                                   DAG.getConstant(ShiftAmt, DL, MVT::i8));
48980      SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48981                                   DAG.getConstant(ScaleShift, DL, MVT::i8));
48982      return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48983    }
48984  }
48985
48986  return SDValue();
48987}
48988
// If the upper 17 bits of either element are zero and the other element has
// only zero/sign bits then we can use PMADDWD, which is always at least as
// quick as PMULLD, except on KNL.
// NOTE(review): the opening `static SDValue combineMulToPMADDWD(...)` line is
// missing from this excerpt (extraction artifact); the continuation parameters
// below belong to that signature.
                                   SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  // PMADDWD only exists with SSE2.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Some subtargets implement PMADDWD slowly; don't pessimize them.
  if (Subtarget.isPMADDWDSlow())
    return SDValue();

  EVT VT = N->getValueType(0);

  // Only support vXi32 vectors.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
    return SDValue();

  // Make sure the type is legal or can split/widen to a legal type.
  // With AVX512 but without BWI, we would need to split v32i16.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 1 || !isPowerOf2_32(NumElts))
    return SDValue();

  // With AVX512 but without BWI, we would need to split v32i16.
  if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // If we are zero/sign extending two steps without SSE4.1, it's better to
  // reduce the vmul width instead.
  if (!Subtarget.hasSSE41() &&
      (((N0.getOpcode() == ISD::ZERO_EXTEND &&
         N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
        (N1.getOpcode() == ISD::ZERO_EXTEND &&
         N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
       ((N0.getOpcode() == ISD::SIGN_EXTEND &&
         N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
        (N1.getOpcode() == ISD::SIGN_EXTEND &&
         N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
    return SDValue();

  // If we are sign extending a wide vector without SSE4.1, it's better to
  // reduce the vmul width instead.
  if (!Subtarget.hasSSE41() &&
      (N0.getOpcode() == ISD::SIGN_EXTEND &&
       N0.getOperand(0).getValueSizeInBits() > 128) &&
      (N1.getOpcode() == ISD::SIGN_EXTEND &&
       N1.getOperand(0).getValueSizeInBits() > 128))
    return SDValue();

  // Sign bits must extend down to the lowest i16.
  if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
      DAG.ComputeMaxSignificantBits(N0) > 16)
    return SDValue();

  // At least one of the elements must be zero in the upper 17 bits, or can be
  // safely made zero without altering the final result.
  auto GetZeroableOp = [&](SDValue Op) {
    APInt Mask17 = APInt::getHighBitsSet(32, 17);
    if (DAG.MaskedValueIsZero(Op, Mask17))
      return Op;
    // Mask off upper 16-bits of sign-extended constants.
    // NOTE(review): the guarding `if` line for this return (a constant
    // build-vector check) is missing from this excerpt.
      return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
    if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
      SDValue Src = Op.getOperand(0);
      // Convert sext(vXi16) to zext(vXi16) — upper bits become zero instead
      // of sign bits, which is what PMADDWD requires here.
      if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
        return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
      // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
      // which will expand the extension.
      if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
        EVT ExtVT = VT.changeVectorElementType(MVT::i16);
        Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
        return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
      }
    }
    // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
    if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
        N->isOnlyUserOf(Op.getNode())) {
      SDValue Src = Op.getOperand(0);
      if (Src.getScalarValueSizeInBits() == 16)
        return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
    }
    // Convert VSRAI(Op, 16) to VSRLI(Op, 16) — shifting in zeros instead of
    // sign bits leaves the upper 16 bits zero as required.
    if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
        N->isOnlyUserOf(Op.getNode())) {
      return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
                         Op.getOperand(1));
    }
    return SDValue();
  };
  // At least one operand must be zeroable for the transform to be valid.
  SDValue ZeroN0 = GetZeroableOp(N0);
  SDValue ZeroN1 = GetZeroableOp(N1);
  if (!ZeroN0 && !ZeroN1)
    return SDValue();
  N0 = ZeroN0 ? ZeroN0 : N0;
  N1 = ZeroN1 ? ZeroN1 : N1;

  // Use SplitOpsAndApply to handle AVX splitting.
  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
    MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
    MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
                       DAG.getBitcast(OpVT, Ops[0]),
                       DAG.getBitcast(OpVT, Ops[1]));
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
}
49102
                                  const X86Subtarget &Subtarget) {
  // NOTE(review): the `static SDValue combineMulToPMULDQ(...)` signature line
  // is missing from this excerpt (extraction artifact).
  // PMULDQ/PMULUDQ require SSE2 at minimum (PMULDQ itself needs SSE4.1,
  // checked below).
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT VT = N->getValueType(0);

  // Only support vXi64 vectors.
  // NOTE(review): one condition line of this `if` (a power-of-2
  // element-count check) is missing from this excerpt.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
      VT.getVectorNumElements() < 2 ||
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // MULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32-bits. We can lower with this if the sign bits stretch that far.
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
      DAG.ComputeNumSignBits(N1) > 32) {
    auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                            ArrayRef<SDValue> Ops) {
      return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
    };
    // SplitOpsAndApply handles splitting wide vectors for AVX targets.
    return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
                            /*CheckBWI*/ false);
  }

  // If the upper bits are zero we can use a single pmuludq.
  APInt Mask = APInt::getHighBitsSet(64, 32);
  if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
    auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                             ArrayRef<SDValue> Ops) {
      return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
    };
    return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
                            /*CheckBWI*/ false);
  }

  return SDValue();
}
49144
                          const X86Subtarget &Subtarget) {
  // NOTE(review): the `static SDValue combineMul(...)` signature lines are
  // missing from this excerpt (extraction artifact).
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Prefer specialized vector multiply lowerings first.
  if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
    return V;

  if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
    return V;

  if (DCI.isBeforeLegalize() && VT.isVector())
    return reduceVMULWidth(N, DL, DAG, Subtarget);

  // The mul-by-constant decompositions below apply to i32/i64 scalars and
  // simple integer vectors only.
  if (VT != MVT::i64 && VT != MVT::i32 &&
      (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
    return SDValue();

  KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
  if (!Known1.isConstant())
    return SDValue();

  const APInt &C = Known1.getConstant();
  // mul x, 0 -> 0.
  if (C.isZero())
    return DAG.getConstant(0, DL, VT);

  // mul x, -1 -> neg x.
  if (C.isAllOnes())
    return DAG.getNegative(N->getOperand(0), DL, VT);

  // Powers of two are handled as shifts elsewhere.
  if (isPowerOf2_64(C.getZExtValue()))
    return SDValue();

  // Optimize a single multiply with constant into two operations in order to
  // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
  // NOTE(review): the guarding `if` condition for this early-out is missing
  // from this excerpt.
    return SDValue();

  // An imul is usually smaller than the alternative sequence.
  // NOTE(review): the guarding `if` condition for this early-out is missing
  // from this excerpt.
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  int64_t SignMulAmt = C.getSExtValue();
  assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
  uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;

  SDValue NewMul = SDValue();
  if (VT == MVT::i64 || VT == MVT::i32) {
    // 3/5/9 map directly to a single LEA (MUL_IMM keeps it from being
    // re-combined).
    if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(AbsMulAmt, DL, VT));
      if (SignMulAmt < 0)
        NewMul = DAG.getNegative(NewMul, DL, VT);

      return NewMul;
    }

    // Try to factor the amount as (3|5|9) * MulAmt2 so the first factor can
    // be an LEA.
    uint64_t MulAmt1 = 0;
    uint64_t MulAmt2 = 0;
    if ((AbsMulAmt % 9) == 0) {
      MulAmt1 = 9;
      MulAmt2 = AbsMulAmt / 9;
    } else if ((AbsMulAmt % 5) == 0) {
      MulAmt1 = 5;
      MulAmt2 = AbsMulAmt / 5;
    } else if ((AbsMulAmt % 3) == 0) {
      MulAmt1 = 3;
      MulAmt2 = AbsMulAmt / 3;
    }

    // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
    if (MulAmt2 &&
        (isPowerOf2_64(MulAmt2) ||
         (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {

      if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
                                      N->user_begin()->getOpcode() == ISD::ADD))
        // If second multiplier is pow2, issue it first. We want the multiply
        // by 3, 5, or 9 to be folded into the addressing mode unless the lone
        // use is an add. Only do this for positive multiply amounts since the
        // negate would prevent it from being used as an address mode anyway.
        std::swap(MulAmt1, MulAmt2);

      // Emit the first factor: a shift for a power of 2, otherwise an LEA.
      if (isPowerOf2_64(MulAmt1))
        NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                             DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
      else
        NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                             DAG.getConstant(MulAmt1, DL, VT));

      // Emit the second factor on top of the first.
      if (isPowerOf2_64(MulAmt2))
        NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                             DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
      else
        NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                             DAG.getConstant(MulAmt2, DL, VT));

      // Negate the result.
      if (SignMulAmt < 0)
        NewMul = DAG.getNegative(NewMul, DL, VT);
    } else if (!Subtarget.slowLEA())
      NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
  }
  if (!NewMul) {
    EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
    if (isPowerOf2_64(AbsMulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      NewMul = DAG.getNode(
          ISD::ADD, DL, VT, N->getOperand(0),
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
      if (SignMulAmt < 0)
        NewMul = DAG.getNegative(NewMul, DL, VT);
    } else if (isPowerOf2_64(AbsMulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      NewMul =
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
      // To negate, reverse the operands of the subtract.
      if (SignMulAmt < 0)
        NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
      else
        NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
    } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
               (!VT.isVector() || Subtarget.fastImmVectorShift())) {
      // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
      NewMul =
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
      NewMul = DAG.getNode(
          ISD::ADD, DL, VT, NewMul,
          DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
    } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
               (!VT.isVector() || Subtarget.fastImmVectorShift())) {
      // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
      NewMul =
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
      NewMul = DAG.getNode(
          ISD::SUB, DL, VT, NewMul,
          DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
    } else if (SignMulAmt >= 0 && VT.isVector() &&
               Subtarget.fastImmVectorShift()) {
      // General vector case: express AbsMulAmt as (2^a +/- 2^b) using the
      // lowest set bit as one term.
      uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
      uint64_t ShiftAmt1;
      std::optional<unsigned> Opc;
      if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
        ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
        Opc = ISD::ADD;
      } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
        ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
        Opc = ISD::SUB;
      }

      if (Opc) {
        SDValue Shift1 =
            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                        DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
        SDValue Shift2 =
            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                        DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
        NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
      }
    }
  }

  return NewMul;
}
49316
// Try to form a MULHU or MULHS node by looking for
// (srl (mul ext, ext), 16)
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
// NOTE(review): the opening `static SDValue combineShiftToPMULH(...)` line is
// missing from this excerpt (extraction artifact).
                                   const SDLoc &DL,
                                   const X86Subtarget &Subtarget) {
  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
         "SRL or SRA node is required here!");

  if (!Subtarget.hasSSE2())
    return SDValue();

  // The operation feeding into the shift must be a multiply.
  SDValue ShiftOperand = N->getOperand(0);
  if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
    return SDValue();

  // Input type should be at least vXi32.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
    return SDValue();

  // Need a shift by 16.
  APInt ShiftAmt;
  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
      ShiftAmt != 16)
    return SDValue();

  SDValue LHS = ShiftOperand.getOperand(0);
  SDValue RHS = ShiftOperand.getOperand(1);

  // Both multiply operands must use the same extension kind.
  unsigned ExtOpc = LHS.getOpcode();
  if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
      RHS.getOpcode() != ExtOpc)
    return SDValue();

  // Peek through the extends.
  LHS = LHS.getOperand(0);
  RHS = RHS.getOperand(0);

  // Ensure the input types match.
  EVT MulVT = LHS.getValueType();
  if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
    return SDValue();

  // sext*sext srl 16 -> MULHS; zext*zext srl 16 -> MULHU.
  unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
  SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);

  // Re-extend the i16 high-half result to the original element width,
  // matching the signedness of the original shift.
  ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  return DAG.getNode(ExtOpc, DL, VT, Mulh);
}
49372
                                const X86Subtarget &Subtarget) {
  // NOTE(review): the `static SDValue combineShiftLeft(...)` signature line is
  // missing from this excerpt (extraction artifact).
  using namespace llvm::SDPatternMatch;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDLoc DL(N);

  // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
  // with out-of-bounds clamping.
  if (N0.getOpcode() == ISD::VSELECT &&
      supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
    SDValue Cond = N0.getOperand(0);
    SDValue N00 = N0.getOperand(1);
    SDValue N01 = N0.getOperand(2);
    // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
    // NOTE(review): the first condition line of this `if` (a check that the
    // unselected operand is all-zeros) is missing from this excerpt.
        sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
                               m_SpecificCondCode(ISD::SETULT)))) {
      return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
    }
    // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
    // NOTE(review): the first condition line of this `if` is missing from this
    // excerpt.
        sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
                               m_SpecificCondCode(ISD::SETUGE)))) {
      return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
    }
  }

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zero's or all ones.
  if (VT.isInteger() && !VT.isVector() &&
      N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    APInt Mask = N0.getConstantOperandAPInt(1);
    Mask <<= N1C->getAPIntValue();
    bool MaskOK = false;
    // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
    // of the underlying setcc_c operation if the setcc_c was zero extended.
    // Consider the following example:
    //   zext(setcc_c)                 -> i32 0x0000FFFF
    //   c1                            -> i32 0x0000FFFF
    //   c2                            -> i32 0x00000001
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
      // NOTE(review): a continuation line of this condition is missing from
      // this excerpt.
      MaskOK = true;
    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
                N00.getOpcode() == ISD::ANY_EXTEND) &&
      // NOTE(review): a continuation line of this condition is missing from
      // this excerpt.
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    }
    if (MaskOK && Mask != 0)
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
  }

  return SDValue();
}
49440
                                           const X86Subtarget &Subtarget) {
  // NOTE(review): the `static SDValue combineShiftRightArithmetic(...)`
  // signature line is missing from this excerpt (extraction artifact).
  using namespace llvm::SDPatternMatch;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Size = VT.getSizeInBits();
  SDLoc DL(N);

  if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
    return V;

  // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
  // VSRAV clamps out-of-range amounts itself, so the umin is redundant.
  if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
    SDValue ShrAmtVal;
    // NOTE(review): the closing matcher line of this `if` (matching the
    // bw-1 constant) is missing from this excerpt.
    if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
      return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
  }

  // fold (SRA (SHL X, ShlConst), SraConst)
  // into (SHL (sext_in_reg X), ShlConst - SraConst)
  //   or (sext_in_reg X)
  //   or (SRA (sext_in_reg X), SraConst - ShlConst)
  // depending on relation between SraConst and ShlConst.
  // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
  // us to do the sext_in_reg from corresponding bit.

  // sexts in X86 are MOVs. The MOVs have the same code size
  // as above SHIFTs (only SHIFT on 1 has lower code size).
  // However the MOVs have 2 advantages to a SHIFT:
  // 1. MOVs can write to a register that differs from source
  // 2. MOVs accept memory operands
  // NOTE(review): one continuation line of this condition is missing from
  // this excerpt.
  if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  APInt ShlConst = N01->getAsAPIntVal();
  APInt SraConst = N1->getAsAPIntVal();
  EVT CVT = N1.getValueType();

  // Both shift amounts must have the same type for the rebuilt shifts.
  if (CVT != N01.getValueType())
    return SDValue();
  if (SraConst.isNegative())
    return SDValue();

  for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
    unsigned ShiftSize = SVT.getSizeInBits();
    // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
    if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
      continue;
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    // Pick the residual shift based on how the two amounts compare.
    if (SraConst.eq(ShlConst))
      return NN;
    if (SraConst.ult(ShlConst))
      return DAG.getNode(ISD::SHL, DL, VT, NN,
                         DAG.getConstant(ShlConst - SraConst, DL, CVT));
    return DAG.getNode(ISD::SRA, DL, VT, NN,
                       DAG.getConstant(SraConst - ShlConst, DL, CVT));
  }
  return SDValue();
}
49508
                                        const X86Subtarget &Subtarget) {
  // NOTE(review): the `static SDValue combineShiftRightLogical(...)` signature
  // lines are missing from this excerpt (extraction artifact).
  using namespace llvm::SDPatternMatch;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDLoc DL(N);

  if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
    return V;

  // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
  // with out-of-bounds clamping.
  if (N0.getOpcode() == ISD::VSELECT &&
      supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
    SDValue Cond = N0.getOperand(0);
    SDValue N00 = N0.getOperand(1);
    SDValue N01 = N0.getOperand(2);
    // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
    // NOTE(review): the first condition line of this `if` (a check that the
    // unselected operand is all-zeros) is missing from this excerpt.
        sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
                               m_SpecificCondCode(ISD::SETULT)))) {
      return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
    }
    // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
    // NOTE(review): the first condition line of this `if` is missing from this
    // excerpt.
        sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
                               m_SpecificCondCode(ISD::SETUGE)))) {
      return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
    }
  }

  // Only do this on the last DAG combine as it can interfere with other
  // combines.
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
  // TODO: This is a generic DAG combine that became an x86-only combine to
  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
  // and-not ('andn').
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();

  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!ShiftC || !AndC)
    return SDValue();

  // If we can shrink the constant mask below 8-bits or 32-bits, then this
  // transform should reduce code size. It may also enable secondary transforms
  // from improved known-bits analysis or instruction selection.
  APInt MaskVal = AndC->getAPIntValue();

  // If this can be matched by a zero extend, don't optimize.
  if (MaskVal.isMask()) {
    unsigned TO = MaskVal.countr_one();
    if (TO >= 8 && isPowerOf2_32(TO))
      return SDValue();
  }

  APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
  unsigned OldMaskSize = MaskVal.getSignificantBits();
  unsigned NewMaskSize = NewMaskVal.getSignificantBits();
  if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
      (OldMaskSize > 32 && NewMaskSize <= 32)) {
    // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
    SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
    SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
  }
  return SDValue();
}
49584
                                         const X86Subtarget &Subtarget) {
  // NOTE(review): the `static SDValue combineHorizOpWithShuffle(...)`
  // signature line is missing from this excerpt (extraction artifact).
  unsigned Opcode = N->getOpcode();
  assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT SrcVT = N0.getValueType();

  // Only look through one-use bitcasts so we don't duplicate work for other
  // users of the operands.
  SDValue BC0 =
      N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
  SDValue BC1 =
      N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;

  // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
  // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
  // truncation trees that help us avoid lane crossing shuffles.
  // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
  // TODO: We don't handle vXf64 shuffles yet.
  if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
    if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
      // NOTE(review): the declaration line for ShuffleOps is missing from
      // this excerpt.
      SmallVector<int> ShuffleMask, ScaledMask;
      SDValue Vec = peekThroughBitcasts(BCSrc);
      if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
        // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
        // shuffle to a v4X64 width - we can probably relax this in the future.
        if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
            ShuffleOps[0].getValueType().is256BitVector() &&
            scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
          SDValue Lo, Hi;
          MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
          std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
          Lo = DAG.getBitcast(SrcVT, Lo);
          Hi = DAG.getBitcast(SrcVT, Hi);
          SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
          Res = DAG.getBitcast(ShufVT, Res);
          Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
          return DAG.getBitcast(VT, Res);
        }
      }
    }
  }

  // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
  if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
    // If either/both ops are a shuffle that can scale to v2x64,
    // then see if we can perform this as a v4x32 post shuffle.
    SmallVector<SDValue> Ops0, Ops1;
    SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
    bool IsShuf0 =
        getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
        scaleShuffleElements(Mask0, 2, ScaledMask0) &&
        all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
    bool IsShuf1 =
        getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
        scaleShuffleElements(Mask1, 2, ScaledMask1) &&
        all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
    if (IsShuf0 || IsShuf1) {
      // Treat a non-shuffle operand as the identity shuffle of itself so both
      // sides can be processed uniformly.
      if (!IsShuf0) {
        Ops0.assign({BC0});
        ScaledMask0.assign({0, 1});
      }
      if (!IsShuf1) {
        Ops1.assign({BC1});
        ScaledMask1.assign({0, 1});
      }

      SDValue LHS, RHS;
      int PostShuffle[4] = {-1, -1, -1, -1};
      // Map each v2x64 mask element onto (LHS|RHS, half-index), assigning LHS
      // and RHS greedily; fails if more than two distinct sources are needed.
      auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
        if (M < 0)
          return true;
        Idx = M % 2;
        SDValue Src = Ops[M / 2];
        if (!LHS || LHS == Src) {
          LHS = Src;
          return true;
        }
        if (!RHS || RHS == Src) {
          Idx += 2;
          RHS = Src;
          return true;
        }
        return false;
      };
      if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
          FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
          FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
          FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
        LHS = DAG.getBitcast(SrcVT, LHS);
        RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
        MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
        SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
        Res = DAG.getBitcast(ShufVT, Res);
        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
        return DAG.getBitcast(VT, Res);
      }
    }
  }

  // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
  if (VT.is256BitVector() && Subtarget.hasInt256()) {
    SmallVector<int> Mask0, Mask1;
    SmallVector<SDValue> Ops0, Ops1;
    SmallVector<int, 2> ScaledMask0, ScaledMask1;
    if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
        getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
        !Ops0.empty() && !Ops1.empty() &&
        all_of(Ops0,
               [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
        all_of(Ops1,
               [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
        scaleShuffleElements(Mask0, 2, ScaledMask0) &&
        scaleShuffleElements(Mask1, 2, ScaledMask1)) {
      SDValue Op00 = peekThroughBitcasts(Ops0.front());
      SDValue Op10 = peekThroughBitcasts(Ops1.front());
      SDValue Op01 = peekThroughBitcasts(Ops0.back());
      SDValue Op11 = peekThroughBitcasts(Ops1.back());
      // Canonicalize the RHS operand order to match the LHS.
      // NOTE(review): one statement inside this block (commuting the second
      // mask) is missing from this excerpt.
      if ((Op00 == Op11) && (Op01 == Op10)) {
        std::swap(Op10, Op11);
      }
      if ((Op00 == Op10) && (Op01 == Op11)) {
        const int Map[4] = {0, 2, 1, 3};
        SmallVector<int, 4> ShuffleMask(
            {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
             Map[ScaledMask1[1]]});
        MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
        SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
                                  DAG.getBitcast(SrcVT, Op01));
        Res = DAG.getBitcast(ShufVT, Res);
        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
        return DAG.getBitcast(VT, Res);
      }
    }
  }

  return SDValue();
}
49728
49731 const X86Subtarget &Subtarget) {
49732 unsigned Opcode = N->getOpcode();
49733 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
49734 "Unexpected pack opcode");
49735
49736 EVT VT = N->getValueType(0);
49737 SDValue N0 = N->getOperand(0);
49738 SDValue N1 = N->getOperand(1);
49739 unsigned NumDstElts = VT.getVectorNumElements();
49740 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
49741 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
49742 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
49743 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
49744 "Unexpected PACKSS/PACKUS input type");
49745
49746 bool IsSigned = (X86ISD::PACKSS == Opcode);
49747
49748 // Constant Folding.
49749 APInt UndefElts0, UndefElts1;
49750 SmallVector<APInt, 32> EltBits0, EltBits1;
49751 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
49752 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
49753 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
49754 /*AllowWholeUndefs*/ true,
49755 /*AllowPartialUndefs*/ true) &&
49756 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
49757 /*AllowWholeUndefs*/ true,
49758 /*AllowPartialUndefs*/ true)) {
49759 unsigned NumLanes = VT.getSizeInBits() / 128;
49760 unsigned NumSrcElts = NumDstElts / 2;
49761 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
49762 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
49763
49764 APInt Undefs(NumDstElts, 0);
49765 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
49766 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
49767 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
49768 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
49769 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
49770 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
49771
49772 if (UndefElts[SrcIdx]) {
49773 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
49774 continue;
49775 }
49776
49777 APInt &Val = EltBits[SrcIdx];
49778 if (IsSigned) {
49779 // PACKSS: Truncate signed value with signed saturation.
49780 // Source values less than dst minint are saturated to minint.
49781 // Source values greater than dst maxint are saturated to maxint.
49782 Val = Val.truncSSat(DstBitsPerElt);
49783 } else {
49784 // PACKUS: Truncate signed value with unsigned saturation.
49785 // Source values less than zero are saturated to zero.
49786 // Source values greater than dst maxuint are saturated to maxuint.
49787 // NOTE: This is different from APInt::truncUSat.
49788 if (Val.isIntN(DstBitsPerElt))
49789 Val = Val.trunc(DstBitsPerElt);
49790 else if (Val.isNegative())
49791 Val = APInt::getZero(DstBitsPerElt);
49792 else
49793 Val = APInt::getAllOnes(DstBitsPerElt);
49794 }
49795 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
49796 }
49797 }
49798
49799 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
49800 }
49801
49802 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
49803 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49804 return V;
49805
49806 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
49807 // Currently limit this to allsignbits cases only.
49808 if (IsSigned &&
49809 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
49810 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
49811 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
49812 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
49813 if (Not0 && Not1) {
49814 SDLoc DL(N);
49815 MVT SrcVT = N0.getSimpleValueType();
49816 SDValue Pack =
49817 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
49818 DAG.getBitcast(SrcVT, Not1));
49819 return DAG.getNOT(DL, Pack, VT);
49820 }
49821 }
49822
49823 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
49824 // truncate to create a larger truncate.
49825 if (Subtarget.hasAVX512() &&
49826 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
49827 N0.getOperand(0).getValueType() == MVT::v8i32) {
49828 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
49829 (!IsSigned &&
49830 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
49831 if (Subtarget.hasVLX())
49832 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
49833
49834 // Widen input to v16i32 so we can truncate that.
49835 SDLoc dl(N);
49836 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
49837 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
49838 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
49839 }
49840 }
49841
49842 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
49843 if (VT.is128BitVector()) {
49844 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49845 SDValue Src0, Src1;
49846 if (N0.getOpcode() == ExtOpc &&
49848 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49849 Src0 = N0.getOperand(0);
49850 }
49851 if (N1.getOpcode() == ExtOpc &&
49853 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49854 Src1 = N1.getOperand(0);
49855 }
49856 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
49857 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
49858 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
49859 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
49860 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
49861 }
49862
49863 // Try again with pack(*_extend_vector_inreg, undef).
49864 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
49866 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
49867 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
49868 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
49869 DAG);
49870 }
49871
49872 // Attempt to combine as shuffle.
49873 SDValue Op(N, 0);
49874 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49875 return Res;
49876
49877 return SDValue();
49878}
49879
// Combine X86 horizontal add/sub nodes (HADD/HSUB/FHADD/FHSUB).
49882 const X86Subtarget &Subtarget) {
49883 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
49884 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
49885 "Unexpected horizontal add/sub opcode");
49886
// When horizontal ops are not considered fast on this subtarget, try to
// reduce the number of horizontal ops by merging two single-source inner
// HOPs into one dual-source HOP.
49887 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
49888 MVT VT = N->getSimpleValueType(0);
49889 SDValue LHS = N->getOperand(0);
49890 SDValue RHS = N->getOperand(1);
49891
49892 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
// Both inner ops must be the same HOP kind, have this node as their only
// user, and each must effectively use a single (or undef) source operand.
49893 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
49894 LHS.getOpcode() == RHS.getOpcode() &&
49895 LHS.getValueType() == RHS.getValueType() &&
49896 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
49897 SDValue LHS0 = LHS.getOperand(0);
49898 SDValue LHS1 = LHS.getOperand(1);
49899 SDValue RHS0 = RHS.getOperand(0);
49900 SDValue RHS1 = RHS.getOperand(1);
49901 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
49902 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
49903 SDLoc DL(N);
49904 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
49905 LHS0.isUndef() ? LHS1 : LHS0,
49906 RHS0.isUndef() ? RHS1 : RHS0);
// Broadcast the low/high 64-bit halves of the merged HOP to form the two
// operands the outer HOP expects.
49907 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
49908 Res = DAG.getBitcast(ShufVT, Res);
49909 SDValue NewLHS =
49910 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49911 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
49912 SDValue NewRHS =
49913 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49914 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
49915 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
49916 DAG.getBitcast(VT, NewRHS));
49917 }
49918 }
49919 }
49920
49921 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49922 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49923 return V;
49924
49925 return SDValue();
49926}
49927
// Combine X86 vector shift nodes whose per-vector shift amount comes from
// the low 64-bits of a vector operand (X86ISD::VSHL/VSRL/VSRA).
49930 const X86Subtarget &Subtarget) {
49931 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49932 X86ISD::VSRL == N->getOpcode()) &&
49933 "Unexpected shift opcode");
49934 EVT VT = N->getValueType(0);
49935 SDValue N0 = N->getOperand(0);
49936 SDValue N1 = N->getOperand(1);
49937
49938 // Shift zero -> zero.
49940 return DAG.getConstant(0, SDLoc(N), VT);
49941
49942 // Detect constant shift amounts.
// Only the bottom 64-bits of the amount operand matter, so a constant there
// lets us switch to the cheaper shift-by-immediate opcode.
49943 APInt UndefElts;
49944 SmallVector<APInt, 32> EltBits;
49945 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
49946 /*AllowWholeUndefs*/ true,
49947 /*AllowPartialUndefs*/ false)) {
49948 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49949 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
49950 EltBits[0].getZExtValue(), DAG);
49951 }
49952
49953 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49954 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
49955 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
49956 return SDValue(N, 0);
49957
49958 return SDValue();
49959}
49960
// Combine X86 vector shift-by-immediate nodes (X86ISD::VSHLI/VSRLI/VSRAI).
49963 const X86Subtarget &Subtarget) {
49964 unsigned Opcode = N->getOpcode();
49965 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
49966 X86ISD::VSRLI == Opcode) &&
49967 "Unexpected shift opcode");
49968 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49969 EVT VT = N->getValueType(0);
49970 SDValue N0 = N->getOperand(0);
49971 SDValue N1 = N->getOperand(1);
49972 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49973 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49974 "Unexpected value type");
49975 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49976
49977 // (shift undef, X) -> 0
49978 if (N0.isUndef())
49979 return DAG.getConstant(0, SDLoc(N), VT);
49980
49981 // Out of range logical bit shifts are guaranteed to be zero.
49982 // Out of range arithmetic bit shifts splat the sign bit.
49983 unsigned ShiftVal = N->getConstantOperandVal(1);
49984 if (ShiftVal >= NumBitsPerElt) {
49985 if (LogicalShift)
49986 return DAG.getConstant(0, SDLoc(N), VT);
49987 ShiftVal = NumBitsPerElt - 1;
49988 }
49989
49990 // (shift X, 0) -> X
49991 if (!ShiftVal)
49992 return N0;
49993
49994 // (shift 0, C) -> 0
49996 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49997 // result are all zeros, not undef.
49998 return DAG.getConstant(0, SDLoc(N), VT);
49999
50000 // (VSRAI -1, C) -> -1
50001 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50002 // N0 is all ones or undef. We guarantee that the bits shifted into the
50003 // result are all ones, not undef.
50004 return DAG.getAllOnesConstant(SDLoc(N), VT);
50005
// Fold two stacked shifts of X by Amt0 and Amt1 into one shift by their sum,
// applying the same out-of-range clamping rules as above.
50006 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50007 unsigned NewShiftVal = Amt0 + Amt1;
50008 if (NewShiftVal >= NumBitsPerElt) {
50009 // Out of range logical bit shifts are guaranteed to be zero.
50010 // Out of range arithmetic bit shifts splat the sign bit.
50011 if (LogicalShift)
50012 return DAG.getConstant(0, SDLoc(N), VT);
50013 NewShiftVal = NumBitsPerElt - 1;
50014 }
50015 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50016 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50017 };
50018
50019 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50020 if (Opcode == N0.getOpcode())
50021 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50022
50023 // (shl (add X, X), C) -> (shl X, (C + 1))
50024 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50025 N0.getOperand(0) == N0.getOperand(1))
50026 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50027
50028 // We can decode 'whole byte' logical bit shifts as shuffles.
50029 if (LogicalShift && (ShiftVal % 8) == 0) {
50030 SDValue Op(N, 0);
50031 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50032 return Res;
50033 }
50034
50035 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50036 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50037 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50038 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50039 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50040 N0.getOpcode() == X86ISD::PSHUFD &&
50041 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50042 N0->hasOneUse()) {
50044 if (BC.getOpcode() == X86ISD::VSHLI &&
50045 BC.getScalarValueSizeInBits() == 64 &&
50046 BC.getConstantOperandVal(1) == 63) {
50047 SDLoc DL(N);
50048 SDValue Src = BC.getOperand(0);
50049 Src = DAG.getBitcast(VT, Src);
50050 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50051 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50052 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50053 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50054 return Src;
50055 }
50056 }
50057
// Constant-fold the shift of constant vector V, folding undef elements to 0
// so every bit of the result is defined.
50058 auto TryConstantFold = [&](SDValue V) {
50059 APInt UndefElts;
50060 SmallVector<APInt, 32> EltBits;
50061 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50062 /*AllowWholeUndefs*/ true,
50063 /*AllowPartialUndefs*/ true))
50064 return SDValue();
50065 assert(EltBits.size() == VT.getVectorNumElements() &&
50066 "Unexpected shift value type");
50067 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50068 // created an undef input due to no input bits being demanded, but user
50069 // still expects 0 in other bits.
50070 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50071 APInt &Elt = EltBits[i];
50072 if (UndefElts[i])
50073 Elt = 0;
50074 else if (X86ISD::VSHLI == Opcode)
50075 Elt <<= ShiftVal;
50076 else if (X86ISD::VSRAI == Opcode)
50077 Elt.ashrInPlace(ShiftVal);
50078 else
50079 Elt.lshrInPlace(ShiftVal);
50080 }
50081 // Reset undef elements since they were zeroed above.
50082 UndefElts = 0;
50083 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50084 };
50085
50086 // Constant Folding.
50087 if (N->isOnlyUserOf(N0.getNode())) {
50088 if (SDValue C = TryConstantFold(N0))
50089 return C;
50090
50091 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50092 // Don't break NOT patterns.
50094 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50095 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50097 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50098 SDLoc DL(N);
50099 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50100 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50101 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50102 }
50103 }
50104 }
50105
50106 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50107 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50108 DCI))
50109 return SDValue(N, 0);
50110
50111 return SDValue();
50112}
50113
// Combine scalar-into-vector insertions: X86ISD::PINSRB/PINSRW and generic
// ISD::INSERT_VECTOR_ELT nodes.
50116 const X86Subtarget &Subtarget) {
50117 EVT VT = N->getValueType(0);
50118 unsigned Opcode = N->getOpcode();
50119 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50120 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50121 Opcode == ISD::INSERT_VECTOR_ELT) &&
50122 "Unexpected vector insertion");
50123
50124 SDValue Vec = N->getOperand(0);
50125 SDValue Scl = N->getOperand(1);
50126 SDValue Idx = N->getOperand(2);
50127
50128 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50129 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50130 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50131
// For the X86-specific insertions, try to simplify based on which bits of
// the inserted element are actually demanded.
50132 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50133 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50134 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50135 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50136 APInt::getAllOnes(NumBitsPerElt), DCI))
50137 return SDValue(N, 0);
50138 }
50139
50140 // Attempt to combine insertion patterns to a shuffle.
50141 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50142 SDValue Op(N, 0);
50143 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50144 return Res;
50145 }
50146
50147 return SDValue();
50148}
50149
50150/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50151/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50152/// OR -> CMPNEQSS.
50155 const X86Subtarget &Subtarget) {
50156 unsigned opcode;
50157
50158 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50159 // we're requiring SSE2 for both.
50160 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50161 SDValue N0 = N->getOperand(0);
50162 SDValue N1 = N->getOperand(1);
50163 SDValue CMP0 = N0.getOperand(1);
50164 SDValue CMP1 = N1.getOperand(1);
50165 SDLoc DL(N);
50166
50167 // The SETCCs should both refer to the same CMP.
50168 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50169 return SDValue();
50170
50171 SDValue CMP00 = CMP0->getOperand(0);
50172 SDValue CMP01 = CMP0->getOperand(1);
50173 EVT VT = CMP00.getValueType();
50174
50175 if (VT == MVT::f32 || VT == MVT::f64 ||
50176 (VT == MVT::f16 && Subtarget.hasFP16())) {
50177 bool ExpectingFlags = false;
50178 // Check for any users that want flags:
50179 for (const SDNode *U : N->users()) {
50180 if (ExpectingFlags)
50181 break;
50182
50183 switch (U->getOpcode()) {
50184 default:
50185 case ISD::BR_CC:
50186 case ISD::BRCOND:
50187 case ISD::SELECT:
50188 ExpectingFlags = true;
50189 break;
50190 case ISD::CopyToReg:
50191 case ISD::SIGN_EXTEND:
50192 case ISD::ZERO_EXTEND:
50193 case ISD::ANY_EXTEND:
50194 break;
50195 }
50196 }
50197
50198 if (!ExpectingFlags) {
50199 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50200 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50201
// Canonicalize so that cc0 holds the E/NE condition.
50202 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50203 X86::CondCode tmp = cc0;
50204 cc0 = cc1;
50205 cc1 = tmp;
50206 }
50207
50208 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50209 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50210 // FIXME: need symbolic constants for these magic numbers.
50211 // See X86ATTInstPrinter.cpp:printSSECC().
50212 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50213 if (Subtarget.hasAVX512()) {
50214 SDValue FSetCC =
50215 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50216 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50217 // Need to fill with zeros to ensure the bitcast will produce zeroes
50218 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50219 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50220 DAG.getConstant(0, DL, MVT::v16i1),
50221 FSetCC, DAG.getVectorIdxConstant(0, DL));
50222 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50223 N->getSimpleValueType(0));
50224 }
50225 SDValue OnesOrZeroesF =
50226 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50227 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50228
50229 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50230 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50231
50232 if (is64BitFP && !Subtarget.is64Bit()) {
50233 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50234 // 64-bit integer, since that's not a legal type. Since
50235 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50236 // bits, but can do this little dance to extract the lowest 32 bits
50237 // and work with those going forward.
50238 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50239 MVT::v2f64, OnesOrZeroesF);
50240 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50241 OnesOrZeroesF =
50242 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50243 DAG.getVectorIdxConstant(0, DL));
50244 IntVT = MVT::i32;
50245 }
50246
// Mask the all-ones/all-zeroes FSETCC result down to bit 0 and truncate to
// the i8 boolean type.
50247 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50248 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50249 DAG.getConstant(1, DL, IntVT));
50250 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50251 ANDed);
50252 return OneBitOfTruth;
50253 }
50254 }
50255 }
50256 }
50257 return SDValue();
50258}
50259
50260/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50262 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50263
// ANDNP only exists for 128/256/512-bit vector types.
50264 MVT VT = N->getSimpleValueType(0);
50265 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50266 return SDValue();
50267
50268 SDValue X, Y;
50269 SDValue N0 = N->getOperand(0);
50270 SDValue N1 = N->getOperand(1);
50271
// AND is commutative: look for an inverted operand on either side. X ends
// up holding the operand with the NOT peeled off.
50272 if (SDValue Not = IsNOT(N0, DAG)) {
50273 X = Not;
50274 Y = N1;
50275 } else if (SDValue Not = IsNOT(N1, DAG)) {
50276 X = Not;
50277 Y = N0;
50278 } else
50279 return SDValue();
50280
50281 X = DAG.getBitcast(VT, X);
50282 Y = DAG.getBitcast(VT, Y);
50283 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
50284}
50285
50286/// Try to fold:
50287/// and (vector_shuffle<Z,...,Z>
50288/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50289/// ->
50290/// andnp (vector_shuffle<Z,...,Z>
50291/// (insert_vector_elt undef, X, Z), undef), Y
50293 const X86Subtarget &Subtarget) {
50294 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50295
50296 EVT VT = N->getValueType(0);
50297 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50298 // value and require extra moves.
50299 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50300 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50301 return SDValue();
50302
// Match a splat-shuffle of an inserted (xor X, -1) scalar and return the
// same splat rebuilt with the NOT stripped from the inserted element.
50303 auto GetNot = [&DAG](SDValue V) {
50304 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50305 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50306 // end-users are ISD::AND including cases
50307 // (and(extract_vector_element(SVN), Y)).
50308 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50309 !SVN->getOperand(1).isUndef()) {
50310 return SDValue();
50311 }
50312 SDValue IVEN = SVN->getOperand(0);
50313 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50314 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50315 return SDValue();
// The shuffle must splat exactly the inserted lane.
50316 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50317 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50318 return SDValue();
50319 SDValue Src = IVEN.getOperand(1);
50320 if (SDValue Not = IsNOT(Src, DAG)) {
50321 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50322 SDValue NotIVEN =
50324 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50325 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50326 SVN->getOperand(1), SVN->getMask());
50327 }
50328 return SDValue();
50329 };
50330
50331 SDValue X, Y;
50332 SDValue N0 = N->getOperand(0);
50333 SDValue N1 = N->getOperand(1);
50334 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50335
// AND is commutative: accept the NOT-splat on either side.
50336 if (SDValue Not = GetNot(N0)) {
50337 X = Not;
50338 Y = N1;
50339 } else if (SDValue Not = GetNot(N1)) {
50340 X = Not;
50341 Y = N0;
50342 } else
50343 return SDValue();
50344
50345 X = DAG.getBitcast(VT, X);
50346 Y = DAG.getBitcast(VT, Y);
50347 SDLoc DL(N);
50348
50349 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50350 // AVX2.
50351 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50353 SDValue LoX, HiX;
50354 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50355 SDValue LoY, HiY;
50356 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50357 EVT SplitVT = LoX.getValueType();
50358 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50359 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50360 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50361 }
50362
50363 if (TLI.isTypeLegal(VT))
50364 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50365
50366 return SDValue();
50367}
50368
50369// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50370// logical operations, like in the example below.
50371// or (and (truncate x, truncate y)),
50372// (xor (truncate z, build_vector (constants)))
50373// Given a target type \p VT, we generate
50374// or (and x, y), (xor z, zext(build_vector (constants)))
50375// given x, y and z are of type \p VT. We can do so, if operands are either
50376// truncates from VT types, the second operand is a vector of constants or can
50377// be recursively promoted.
50379 SelectionDAG &DAG, unsigned Depth) {
50380 // Limit recursion to avoid excessive compile times.
50382 return SDValue();
50383
50384 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
50385 return SDValue();
50386
50387 SDValue N0 = N.getOperand(0);
50388 SDValue N1 = N.getOperand(1);
50389
// The logic op must be supported (natively or via promotion) at the wide
// type for widening to pay off.
50390 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50391 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
50392 return SDValue();
50393
50394 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
50395 N0 = NN0;
50396 else {
50397 // The left side has to be a trunc.
50398 if (N0.getOpcode() != ISD::TRUNCATE)
50399 return SDValue();
50400
50401 // The type of the truncated inputs.
50402 if (N0.getOperand(0).getValueType() != VT)
50403 return SDValue();
50404
50405 N0 = N0.getOperand(0);
50406 }
50407
50408 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
50409 N1 = NN1;
50410 else {
50411 // The right side has to be a 'trunc' or a (foldable) constant.
50412 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
50413 N1.getOperand(0).getValueType() == VT;
50414 if (RHSTrunc)
50415 N1 = N1.getOperand(0);
50416 else if (SDValue Cst =
50418 N1 = Cst;
50419 else
50420 return SDValue();
50421 }
50422
// Rebuild the logic op directly on the wide operands.
50423 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
50424}
50425
50426// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
50427// register. In most cases we actually compare or select YMM-sized registers
50428// and mixing the two types creates horrible code. This method optimizes
50429// some of the transition sequences.
50430// Even with AVX-512 this is still useful for removing casts around logical
50431// operations on vXi1 mask types.
50433 SelectionDAG &DAG,
50434 const X86Subtarget &Subtarget) {
50435 EVT VT = N.getValueType();
50436 assert(VT.isVector() && "Expected vector type");
50437 assert((N.getOpcode() == ISD::ANY_EXTEND ||
50438 N.getOpcode() == ISD::ZERO_EXTEND ||
50439 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
50440
50441 SDValue Narrow = N.getOperand(0);
50442 EVT NarrowVT = Narrow.getValueType();
50443
50444 // Generate the wide operation.
50445 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
50446 if (!Op)
50447 return SDValue();
// Re-establish the extension semantics on the widened result: any_extend
// needs no fixup, zero_extend re-masks the narrow bits, sign_extend
// re-signs them in-register.
50448 switch (N.getOpcode()) {
50449 default: llvm_unreachable("Unexpected opcode");
50450 case ISD::ANY_EXTEND:
50451 return Op;
50452 case ISD::ZERO_EXTEND:
50453 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
50454 case ISD::SIGN_EXTEND:
50455 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
50456 Op, DAG.getValueType(NarrowVT));
50457 }
50458}
50459
50460static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
50461 unsigned FPOpcode;
50462 switch (Opcode) {
50463 // clang-format off
50464 default: llvm_unreachable("Unexpected input node for FP logic conversion");
50465 case ISD::AND: FPOpcode = X86ISD::FAND; break;
50466 case ISD::OR: FPOpcode = X86ISD::FOR; break;
50467 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
50468 // clang-format on
50469 }
50470 return FPOpcode;
50471}
50472
50473/// If both input operands of a logic op are being cast from floating-point
50474/// types or FP compares, try to convert this into a floating-point logic node
50475/// to avoid unnecessary moves from SSE to integer registers.
50476static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
50477 SDValue N0, SDValue N1,
50478 SelectionDAG &DAG,
50480 const X86Subtarget &Subtarget) {
50481 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50482 "Unexpected bit opcode");
50483
// Both operands must be bitcasts, or both must be scalar FP setccs.
50484 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
50485 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
50486 return SDValue();
50487
50488 SDValue N00 = N0.getOperand(0);
50489 SDValue N10 = N1.getOperand(0);
50490 EVT N00Type = N00.getValueType();
50491 EVT N10Type = N10.getValueType();
50492
50493 // Ensure that both types are the same and are legal scalar fp types.
50494 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
50495 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
50496 (Subtarget.hasFP16() && N00Type == MVT::f16)))
50497 return SDValue();
50498
// Case 1: bitcasted FP values - perform the logic op in the FP domain and
// bitcast the result back.
50499 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
50500 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
50501 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
50502 return DAG.getBitcast(VT, FPLogic);
50503 }
50504
50505 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
50506 !N1.hasOneUse())
50507 return SDValue();
50508
50509 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
50510 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
50511
50512 // The vector ISA for FP predicates is incomplete before AVX, so converting
50513 // COMIS* to CMPS* may not be a win before AVX.
50514 if (!Subtarget.hasAVX() &&
50515 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
50516 return SDValue();
50517
50518 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
50519 // and vector logic:
50520 // logic (setcc N00, N01), (setcc N10, N11) -->
50521 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
50522 unsigned NumElts = 128 / N00Type.getSizeInBits();
50523 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
50524 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
50525 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
50526 SDValue N01 = N0.getOperand(1);
50527 SDValue N11 = N1.getOperand(1);
50528 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
50529 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
50530 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
50531 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
50532 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
50533 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
50534 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
50535 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
50536}
50537
50538// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
50539// to reduce XMM->GPR traffic.
50540static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
50541 SDValue N1, SelectionDAG &DAG) {
50542 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50543 "Unexpected bit opcode");
50544
50545 // Both operands must be single use MOVMSK.
50546 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
50547 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
50548 return SDValue();
50549
50550 SDValue Vec0 = N0.getOperand(0);
50551 SDValue Vec1 = N1.getOperand(0);
50552 EVT VecVT0 = Vec0.getValueType();
50553 EVT VecVT1 = Vec1.getValueType();
50554
50555 // Both MOVMSK operands must be from vectors of the same size and same element
50556 // size, but its OK for a fp/int diff.
50557 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
50558 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
50559 return SDValue();
50560
50561 unsigned VecOpc =
50562 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
50563 SDValue Result =
50564 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
50565 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
50566}
50567
50568// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
50569// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
50570// handles in InstCombine.
50571static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
50572 SDValue N0, SDValue N1,
50573 SelectionDAG &DAG) {
50574 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50575 "Unexpected bit opcode");
50576
50577 // Both operands must be single use.
50578 if (!N0.hasOneUse() || !N1.hasOneUse())
50579 return SDValue();
50580
50581 // Search for matching shifts.
50584
50585 unsigned BCOpc = BC0.getOpcode();
50586 EVT BCVT = BC0.getValueType();
50587 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
50588 return SDValue();
50589
50590 switch (BCOpc) {
50591 case X86ISD::VSHLI:
50592 case X86ISD::VSRLI:
50593 case X86ISD::VSRAI: {
// The immediate shift amounts must match so the shift can be hoisted above
// the bitop.
50594 if (BC0.getOperand(1) != BC1.getOperand(1))
50595 return SDValue();
50596 SDValue BitOp =
50597 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
50598 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
50599 return DAG.getBitcast(VT, Shift);
50600 }
50601 }
50602
50603 return SDValue();
50604}
50605
50606// Attempt to fold:
50607// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
50608// TODO: Handle PACKUS handling.
50609static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
50610 SDValue N0, SDValue N1, SelectionDAG &DAG) {
50611 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50612 "Unexpected bit opcode");
50613
50614 // Both operands must be single use.
50615 if (!N0.hasOneUse() || !N1.hasOneUse())
50616 return SDValue();
50617
50618 // Search for matching packs.
50621
50622 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
50623 return SDValue();
50624
50625 MVT DstVT = N0.getSimpleValueType();
50626 if (DstVT != N1.getSimpleValueType())
50627 return SDValue();
50628
50629 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
50630 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
50631
50632 // Limit to allsignbits packing.
// When every source element is a sign-splat, PACKSS saturation is lossless,
// so the bitop can be performed on the wider source elements instead.
50633 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
50634 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
50635 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
50636 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
50637 return SDValue();
50638
50639 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
50640 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
50641 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
50642}
50643
50644/// If this is a zero/all-bits result that is bitwise-anded with a low bits
50645/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
50646/// with a shift-right to eliminate loading the vector constant mask value.
50648 const X86Subtarget &Subtarget) {
50649 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
50650 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
50651 EVT VT = Op0.getValueType();
50652 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
50653 return SDValue();
50654
50655 // Try to convert an "is positive" signbit masking operation into arithmetic
50656 // shift and "andn". This saves a materialization of a -1 vector constant.
50657 // The "is negative" variant should be handled more generally because it only
50658 // requires "and" rather than "andn":
50659 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
50660 //
50661 // This is limited to the original type to avoid producing even more bitcasts.
50662 // If the bitcasts can't be eliminated, then it is unlikely that this fold
50663 // will be profitable.
50664 if (N->getValueType(0) == VT &&
50665 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
50666 SDValue X, Y;
// AND is commutative: accept the pcmpgt(X, -1) on either side.
50667 if (Op1.getOpcode() == X86ISD::PCMPGT &&
50668 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
50669 X = Op1.getOperand(0);
50670 Y = Op0;
50671 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
50672 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
50673 X = Op0.getOperand(0);
50674 Y = Op1;
50675 }
50676 if (X && Y) {
50677 SDLoc DL(N);
50678 SDValue Sra =
50680 VT.getScalarSizeInBits() - 1, DAG);
50681 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
50682 }
50683 }
50684
// Otherwise match an AND with a splatted low-bits mask constant.
50685 APInt SplatVal;
50686 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
50687 return SDValue();
50688
50689 // Don't prevent creation of ANDN.
50690 if (isBitwiseNot(Op0))
50691 return SDValue();
50692
50693 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
50694 return SDValue();
50695
// Op0 must be all-ones or all-zeros per element (a sign-splat) for a plain
// logical shift-right to reproduce the masked value.
50696 unsigned EltBitWidth = VT.getScalarSizeInBits();
50697 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
50698 return SDValue();
50699
50700 SDLoc DL(N);
50701 unsigned ShiftVal = SplatVal.countr_one();
50702 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
50703 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
50704 return DAG.getBitcast(N->getValueType(0), Shift);
50705}
50706
50707// Get the index node from the lowered DAG of a GEP IR instruction with one
50708// indexing dimension.
// NOTE(review): original line 50709 (the signature, presumably "static SDValue
// getIndexFromUnindexedLoad(LoadSDNode *Ld) {") is missing from this listing -
// confirm against the source. Per the preceding comment, this extracts the
// index node from the lowered address of a single-dimension GEP, i.e. it
// matches base-pointer = add(shl(Index, Scale), Base) and returns Index.
50710  if (Ld->isIndexed())
50711    return SDValue();
50712
50713  SDValue Base = Ld->getBasePtr();
50714  if (Base.getOpcode() != ISD::ADD)
50715    return SDValue();
50716
  // The scaled index is expected as operand 0 of the ADD, in SHL form.
50717  SDValue ShiftedIndex = Base.getOperand(0);
50718  if (ShiftedIndex.getOpcode() != ISD::SHL)
50719    return SDValue();
50720
50721  return ShiftedIndex.getOperand(0);
50722}
50723
50724static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
50725 return Subtarget.hasBMI2() &&
50726 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
50727}
50728
50729/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
50730/// This undoes the inverse fold performed in InstCombine
// NOTE(review): original line 50731 (the signature of this helper, which per
// the preceding comment folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)))
// is missing from this listing - confirm against the source.
50732
50733  using namespace llvm::SDPatternMatch;
50734  MVT VT = N->getSimpleValueType(0);
50735  SDLoc DL(N);
50736  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Only profitable when an and-not instruction (ANDN/ANDNP) is available.
50737  if (!TLI.hasAndNot(SDValue(N, 0)))
50738    return SDValue();
50739
50740  SDValue X, Y, Z;
50741  if (sd_match(N, m_And(m_Value(X),
50742                        m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
50743    // Don't fold if Y or Z are constants to prevent infinite loops.
  // NOTE(review): original lines 50744-50745 are missing from this listing
  // (presumably the constant checks guarding the fold) - confirm.
50746    return DAG.getNode(
50747        ISD::AND, DL, VT, X,
50748        DAG.getNOT(
50749            DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
50750  }
50751
50752  return SDValue();
50753}
50754
50755// This function recognizes cases where X86 bzhi instruction can replace and
50756// 'and-load' sequence.
50757// In case of loading integer value from an array of constants which is defined
50758// as follows:
50759//
50760// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
50761//
50762// then applying a bitwise and on the result with another input.
50763// It's equivalent to performing bzhi (zero high bits) on the input, with the
50764// same index of the load.
// NOTE(review): original line 50765 (the first line of the signature,
// presumably "static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG
// &DAG,") is missing from this listing - confirm against the source.
50766                                 const X86Subtarget &Subtarget) {
50767  MVT VT = Node->getSimpleValueType(0);
50768  SDLoc dl(Node);
50769
50770  // Check if subtarget has BZHI instruction for the node's type
50771  if (!hasBZHI(Subtarget, VT))
50772    return SDValue();
50773
50774  // Try matching the pattern for both operands.
50775  for (unsigned i = 0; i < 2; i++) {
50776    // continue if the operand is not a load instruction
50777    auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
50778    if (!Ld)
50779      continue;
  // The load must carry IR-level memory-operand info so we can inspect the
  // GEP / global it came from.
50780    const Value *MemOp = Ld->getMemOperand()->getValue();
50781    if (!MemOp)
50782      continue;
50783    // Get the Node which indexes into the array.
  // NOTE(review): original line 50784 is missing from this listing (presumably
  // "SDValue Index = getIndexFromUnindexedLoad(Ld);") - confirm.
50785    if (!Index)
50786      continue;
50787
50788    if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
50789      if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
50790        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
50791          Constant *Init = GV->getInitializer();
50792          Type *Ty = Init->getType();
  // Reject anything that is not a small constant integer array whose element
  // width matches VT.
  // NOTE(review): original lines 50795 and 50798 are missing from this
  // condition in the listing - confirm the exact bounds checks in the source.
50793          if (!isa<ConstantDataArray>(Init) ||
50794              !Ty->getArrayElementType()->isIntegerTy() ||
50796                  VT.getSizeInBits() ||
50797              Ty->getArrayNumElements() >
50799            continue;
50800
50801          // Check if the array's constant elements are suitable to our case.
  // Element j must equal the low-bits mask 2^j - 1, i.e. {0x0,0x1,0x3,0x7,...}.
50802          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
50803          bool ConstantsMatch = true;
50804          for (uint64_t j = 0; j < ArrayElementCount; j++) {
50805            auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
50806            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
50807              ConstantsMatch = false;
50808              break;
50809            }
50810          }
50811          if (!ConstantsMatch)
50812            continue;
50813
50814          // Do the transformation (For 32-bit type):
50815          // -> (and (load arr[idx]), inp)
50816          // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
50817          // that will be replaced with one bzhi instruction.
50818          SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
50819          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
50820
  // Shift amount is (bitwidth - idx), truncated to i8 for the shift node.
50821          Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
50822          SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
50823          Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
50824
50825          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
50826          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
50827          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
50828        }
50829      }
50830    }
50831  }
50832  return SDValue();
50833}
50834
50835// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
50836// Where C is a mask containing the same number of bits as the setcc and
50837// where the setcc will freely 0 upper bits of k-register. We can replace the
50838// undef in the concat with 0s and remove the AND. This mainly helps with
50839// v2i1/v4i1 setcc being casted to scalar.
// NOTE(review): original line 50840 (the first line of the signature,
// presumably "static SDValue combineScalarAndWithMaskSetcc(SDNode *N,
// SelectionDAG &DAG,") is missing from this listing - confirm.
50841                                              const X86Subtarget &Subtarget) {
50842  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
50843
50844  EVT VT = N->getValueType(0);
50845
50846  // Make sure this is an AND with constant. We will check the value of the
50847  // constant later.
50848  auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
50849  if (!C1)
50850    return SDValue();
50851
50852  // This is implied by the ConstantSDNode.
50853  assert(!VT.isVector() && "Expected scalar VT!");
50854
50855  SDValue Src = N->getOperand(0);
50856  if (!Src.hasOneUse())
50857    return SDValue();
50858
50859  // (Optionally) peek through any_extend().
50860  if (Src.getOpcode() == ISD::ANY_EXTEND) {
50861    if (!Src.getOperand(0).hasOneUse())
50862      return SDValue();
50863    Src = Src.getOperand(0);
50864  }
50865
  // Expect a bitcast from a legal vXi1 vector (a k-register mask).
50866  if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
50867    return SDValue();
50868
50869  Src = Src.getOperand(0);
50870  EVT SrcVT = Src.getValueType();
50871
50872  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50873  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
50874      !TLI.isTypeLegal(SrcVT))
50875    return SDValue();
50876
50877  if (Src.getOpcode() != ISD::CONCAT_VECTORS)
50878    return SDValue();
50879
50880  // We only care about the first subvector of the concat, we expect the
50881  // other subvectors to be ignored due to the AND if we make the change.
50882  SDValue SubVec = Src.getOperand(0);
50883  EVT SubVecVT = SubVec.getValueType();
50884
50885  // The RHS of the AND should be a mask with as many bits as SubVec.
50886  if (!TLI.isTypeLegal(SubVecVT) ||
50887      !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
50888    return SDValue();
50889
50890  // First subvector should be a setcc with a legal result type or a
50891  // AND containing at least one setcc with a legal result type.
50892  auto IsLegalSetCC = [&](SDValue V) {
50893    if (V.getOpcode() != ISD::SETCC)
50894      return false;
50895    EVT SetccVT = V.getOperand(0).getValueType();
  // Without VLX only 512-bit compares produce k-registers; without BWI only
  // 32/64-bit elements do.
50896    if (!TLI.isTypeLegal(SetccVT) ||
50897        !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
50898      return false;
50899    if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
50900      return false;
50901    return true;
50902  };
50903  if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
50904                                 (IsLegalSetCC(SubVec.getOperand(0)) ||
50905                                  IsLegalSetCC(SubVec.getOperand(1))))))
50906    return SDValue();
50907
50908  // We passed all the checks. Rebuild the concat_vectors with zeroes
50909  // and cast it back to VT.
50910  SDLoc dl(N);
50911  SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
50912                              DAG.getConstant(0, dl, SubVecVT));
50913  Ops[0] = SubVec;
50914  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
50915                               Ops);
50916  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
50917  return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
50918}
50919
50920static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
50921 SDValue OpMustEq, SDValue Op, unsigned Depth) {
50922 // We don't want to go crazy with the recursion here. This isn't a super
50923 // important optimization.
50924 static constexpr unsigned kMaxDepth = 2;
50925
50926 // Only do this re-ordering if op has one use.
50927 if (!Op.hasOneUse())
50928 return SDValue();
50929
50930 SDLoc DL(Op);
50931 // If we hit another assosiative op, recurse further.
50932 if (Op.getOpcode() == Opc) {
50933 // Done recursing.
50934 if (Depth++ >= kMaxDepth)
50935 return SDValue();
50936
50937 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50938 if (SDValue R =
50939 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
50940 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
50941 Op.getOperand(1 - OpIdx));
50942
50943 } else if (Op.getOpcode() == ISD::SUB) {
50944 if (Opc == ISD::AND) {
50945 // BLSI: (and x, (sub 0, x))
50946 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
50947 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50948 }
50949 // Opc must be ISD::AND or ISD::XOR
50950 // BLSR: (and x, (sub x, 1))
50951 // BLSMSK: (xor x, (sub x, 1))
50952 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50953 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50954
50955 } else if (Op.getOpcode() == ISD::ADD) {
50956 // Opc must be ISD::AND or ISD::XOR
50957 // BLSR: (and x, (add x, -1))
50958 // BLSMSK: (xor x, (add x, -1))
50959 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50960 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50961 }
50962 return SDValue();
50963}
50964
// NOTE(review): original line 50965 (the first line of the signature,
// presumably "static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,")
// is missing from this listing - confirm against the source.
// Attempts to rewrite an AND/XOR into a BMI BLSI/BLSR/BLSMSK-friendly form by
// matching either operand against the other via getBMIMatchingOp.
50966                                  const X86Subtarget &Subtarget) {
50967  EVT VT = N->getValueType(0);
50968  // Make sure this node is a candidate for BMI instructions.
50969  if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
50970      (VT != MVT::i32 && VT != MVT::i64))
50971    return SDValue();
50972
50973  assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
50974
50975  // Try and match LHS and RHS.
50976  for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50977    if (SDValue OpMatch =
50978            getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
50979                             N->getOperand(1 - OpIdx), 0))
50980      return OpMatch;
50981  return SDValue();
50982}
50983
// NOTE(review): original lines 50984 and 50986 (the signature's first line and
// the DAGCombinerInfo parameter line of this CMP/SUB-of-SETCC flags fold) are
// missing from this listing - confirm the exact signature against the source.
50985                                          SelectionDAG &DAG,
50987                                          const X86Subtarget &ST) {
50988  // cmp(setcc(cc, X), 0)
50989  // brcond ne
50990  // ->
50991  //    X
50992  //    brcond cc
50993
50994  // sub(setcc(cc, X), 1)
50995  // brcond ne
50996  // ->
50997  //    X
50998  //    brcond ~cc
50999  //
51000  // if only flag has users
51001
51002  SDValue SetCC = N->getOperand(0);
51003
51004  if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51005    return SDValue();
51006
51007  // Check the only user of flag is `brcond ne`.
51008  SDNode *BrCond = *Flag->user_begin();
51009  if (BrCond->getOpcode() != X86ISD::BRCOND)
51010    return SDValue();
51011  unsigned CondNo = 2;
51012  if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
  // NOTE(review): original line 51013 is missing here (presumably the
  // X86::COND_NE comparand) - confirm.
51014    return SDValue();
51015
51016  SDValue X = SetCC.getOperand(1);
51017  // sub has two results while X only have one. DAG combine assumes the value
51018  // type matches.
51019  if (N->getOpcode() == X86ISD::SUB)
51020    X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51021
51022  SDValue CCN = SetCC.getOperand(0);
  // NOTE(review): original lines 51023 and 51025 are missing here (presumably
  // extracting the CondCode and computing its opposite) - confirm.
51024      static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51026  // Update CC for the consumer of the flag.
51027  // The old CC is `ne`. Hence, when comparing the result with 0, we are
51028  // checking if the second condition evaluates to true. When comparing the
51029  // result with 1, we are checking uf the second condition evaluates to false.
51030  SmallVector<SDValue> Ops(BrCond->op_values());
51031  if (isNullConstant(N->getOperand(1)))
51032    Ops[CondNo] = CCN;
51033  else if (isOneConstant(N->getOperand(1)))
51034    Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51035  else
51036    llvm_unreachable("expect constant 0 or 1");
51037
51038  SDValue NewBrCond =
51039      DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51040  // Avoid self-assign error b/c CC1 can be `e/ne`.
51041  if (BrCond != NewBrCond.getNode())
51042    DCI.CombineTo(BrCond, NewBrCond);
51043  return X;
51044}
51045
// NOTE(review): original lines 51046-51047 (the signature, presumably "static
// SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG," plus the
// DCI parameter line) are missing from this listing - confirm.
51048                                         const X86Subtarget &ST) {
51049  // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51050  // ->
51051  // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51052
51053  // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51054  // ->
51055  // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51056  //
51057  // where cflags is determined by cc1.
51058
  // APX conditional compare/test is required for this transform.
51059  if (!ST.hasCCMP())
51060    return SDValue();
51061
51062  SDValue SetCC0 = N->getOperand(0);
51063  SDValue SetCC1 = N->getOperand(1);
51064  if (SetCC0.getOpcode() != X86ISD::SETCC ||
51065      SetCC1.getOpcode() != X86ISD::SETCC)
51066    return SDValue();
51067
  // Pick CCMP when the flag producer is a SUB, CTEST when it is a CMP with 0.
51068  auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51069    SDValue Op = V.getOperand(1);
51070    unsigned Opc = Op.getOpcode();
51071    if (Opc == X86ISD::SUB)
51072      return X86ISD::CCMP;
51073    if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51074      return X86ISD::CTEST;
51075    return 0U;
51076  };
51077
51078  unsigned NewOpc = 0;
51079
51080  // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51081  // appear on the right.
51082  if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51083    std::swap(SetCC0, SetCC1);
51084    if (!(NewOpc = GetCombineToOpc(SetCC1)))
51085      return SDValue();
51086  }
51087
51088  X86::CondCode CC0 =
51089      static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51090  // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51091  if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51092    return SDValue();
51093
51094  bool IsOR = N->getOpcode() == ISD::OR;
51095
51096  // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51097  // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51098  // operator is OR. Similar for CC1.
51099  SDValue SrcCC =
  // NOTE(review): original line 51100 is missing here (presumably the IsOR
  // branch building the inverted-CC0 target constant) - confirm.
51101                                       SDLoc(SetCC0.getOperand(0)), MVT::i8)
51102            : SetCC0.getOperand(0);
51103  SDValue CC1N = SetCC1.getOperand(0);
51104  X86::CondCode CC1 =
51105      static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
  // NOTE(review): original line 51106 is missing here (presumably computing
  // OppositeCC1 via GetOppositeBranchCondition) - confirm.
51107  X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51108  SDLoc DL(N);
51109  SDValue CFlags = DAG.getTargetConstant(
51110      X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51111  SDValue Sub = SetCC1.getOperand(1);
51112
51113  // Replace any uses of the old flag produced by SUB/CMP with the new one
51114  // produced by CCMP/CTEST.
51115  SDValue CCMP = (NewOpc == X86ISD::CCMP)
51116                     ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51117                                   {Sub.getOperand(0), Sub.getOperand(1),
51118                                    CFlags, SrcCC, SetCC0.getOperand(1)})
51119                     : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51120                                   {Sub.getOperand(0), Sub.getOperand(0),
51121                                    CFlags, SrcCC, SetCC0.getOperand(1)});
51122
51123  return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51124}
51125
// NOTE(review): original lines 51126-51127 (the signature of the main ISD::AND
// combine, presumably "static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// TargetLowering::DAGCombinerInfo &DCI,") are missing from this listing -
// confirm against the source. Several interior lines are also missing (noted
// inline below); keep that in mind when reading the control flow.
51128                          const X86Subtarget &Subtarget) {
51129  SDValue N0 = N->getOperand(0);
51130  SDValue N1 = N->getOperand(1);
51131  EVT VT = N->getValueType(0);
51132  SDLoc dl(N);
51133  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51134
51135  // If this is SSE1 only convert to FAND to avoid scalarization.
51136  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51137    return DAG.getBitcast(MVT::v4i32,
51138                          DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51139                                      DAG.getBitcast(MVT::v4f32, N0),
51140                                      DAG.getBitcast(MVT::v4f32, N1)));
51141  }
51142
51143  // Use a 32-bit and+zext if upper bits known zero.
51144  if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51145    APInt HiMask = APInt::getHighBitsSet(64, 32);
51146    if (DAG.MaskedValueIsZero(N1, HiMask) ||
51147        DAG.MaskedValueIsZero(N0, HiMask)) {
51148      SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51149      SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51150      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51151                         DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51152    }
51153  }
51154
51155  // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51156  // TODO: Support multiple SrcOps.
51157  if (VT == MVT::i1) {
  // NOTE(review): original line 51158 is missing here (presumably the SrcOps
  // SmallVector declaration) - confirm.
51159    SmallVector<APInt, 2> SrcPartials;
51160    if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51161        SrcOps.size() == 1) {
51162      unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51163      EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51164      SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51165      if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51166        Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51167      if (Mask) {
51168        assert(SrcPartials[0].getBitWidth() == NumElts &&
51169               "Unexpected partial reduction mask");
51170        SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51171        Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51172        return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51173      }
51174    }
51175  }
51176
51177  // InstCombine converts:
51178  //    `(-x << C0) & C1`
51179  // to
51180  //    `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51181  // This saves an IR instruction but on x86 the neg/shift version is preferable
51182  // so undo the transform.
51183
51184  if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51185    // TODO: We don't actually need a splat for this, we just need the checks to
51186    // hold for each element.
51187    ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51188                                              /*AllowTruncation*/ false);
51189    ConstantSDNode *N01C =
51190        isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51191                            /*AllowTruncation*/ false);
51192    if (N1C && N01C) {
51193      const APInt &MulC = N01C->getAPIntValue();
51194      const APInt &AndC = N1C->getAPIntValue();
  // MulCLowBit isolates the lowest set bit of MulC; that is the shift amount
  // the original `(-x << C0)` used.
51195      APInt MulCLowBit = MulC & (-MulC);
51196      if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51197          (MulCLowBit + MulC).isPowerOf2()) {
51198        SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51199        int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51200        assert(MulCLowBitLog != -1 &&
51201               "Isolated lowbit is somehow not a power of 2!");
51202        SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51203                                    DAG.getConstant(MulCLowBitLog, dl, VT));
51204        return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51205      }
51206    }
51207  }
51208
  // Delegate to the dedicated sub-combines in turn; first hit wins.
51209  if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51210    return SetCC;
51211
51212  if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51213    return V;
51214
51215  if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51216    return R;
51217
51218  if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51219    return R;
51220
51221  if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51222    return R;
51223
51224  if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51225                                                 DAG, DCI, Subtarget))
51226    return FPLogic;
51227
51228  if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51229    return R;
51230
51231  if (DCI.isBeforeLegalizeOps())
51232    return SDValue();
51233
51234  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51235    return R;
51236
51237  if (SDValue R = combineAndNotIntoANDNP(N, DAG))
51238    return R;
51239
51240  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
51241    return ShiftRight;
51242
51243  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51244    return R;
51245
  // NOTE(review): original line 51246 is missing here (presumably the guard
  // "if (SDValue R = ...)" for the following return) - the dangling
  // "return R;" below is an extraction artifact, not unconditional code.
51247    return R;
51248
51249  // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51250  // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51251  // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51252  if (VT.isVector() && getTargetConstantFromNode(N1)) {
51253    unsigned Opc0 = N0.getOpcode();
51254    if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
  // NOTE(review): original line 51255 is missing from this condition - confirm.
51256        DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51257        N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51258      SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51259      return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51260    }
51261  }
51262
51263  // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51264  // avoids slow variable shift (moving shift amount to ECX etc.)
51265  if (isOneConstant(N1) && N0->hasOneUse()) {
51266    SDValue Src = N0;
  // Peek through zext/trunc chains to find the underlying shift.
51267    while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51268            Src.getOpcode() == ISD::TRUNCATE) &&
51269           Src.getOperand(0)->hasOneUse())
51270      Src = Src.getOperand(0);
51271    bool ContainsNOT = false;
51272    X86::CondCode X86CC = X86::COND_B;
51273    // Peek through AND(NOT(SRL(X,Y)),1).
51274    if (isBitwiseNot(Src)) {
51275      Src = Src.getOperand(0);
51276      X86CC = X86::COND_AE;
51277      ContainsNOT = true;
51278    }
51279    if (Src.getOpcode() == ISD::SRL &&
51280        !isa<ConstantSDNode>(Src.getOperand(1))) {
51281      SDValue BitNo = Src.getOperand(1);
51282      Src = Src.getOperand(0);
51283      // Peek through AND(SRL(NOT(X),Y),1).
51284      if (isBitwiseNot(Src)) {
51285        Src = Src.getOperand(0);
  // A second NOT toggles the condition back.
51286        X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51287        ContainsNOT = true;
51288      }
51289      // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51290      if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51291        if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51292          return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51293    }
51294  }
51295
51296  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51297    // Attempt to recursively combine a bitmask AND with shuffles.
51298    SDValue Op(N, 0);
51299    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51300      return Res;
51301
51302    // If either operand is a constant mask, then only the elements that aren't
51303    // zero are actually demanded by the other operand.
51304    auto GetDemandedMasks = [&](SDValue Op) {
51305      APInt UndefElts;
51306      SmallVector<APInt> EltBits;
51307      int NumElts = VT.getVectorNumElements();
51308      int EltSizeInBits = VT.getScalarSizeInBits();
  // Default to everything demanded; refine below if Op is a constant mask.
51309      APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51310      APInt DemandedElts = APInt::getAllOnes(NumElts);
51311      if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51312                                        EltBits)) {
51313        DemandedBits.clearAllBits();
51314        DemandedElts.clearAllBits();
51315        for (int I = 0; I != NumElts; ++I) {
51316          if (UndefElts[I]) {
51317            // We can't assume an undef src element gives an undef dst - the
51318            // other src might be zero.
51319            DemandedBits.setAllBits();
51320            DemandedElts.setBit(I);
51321          } else if (!EltBits[I].isZero()) {
51322            DemandedBits |= EltBits[I];
51323            DemandedElts.setBit(I);
51324          }
51325        }
51326      }
51327      return std::make_pair(DemandedBits, DemandedElts);
51328    };
51329    APInt Bits0, Elts0;
51330    APInt Bits1, Elts1;
51331    std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51332    std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51333
51334    if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51335        TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51336        TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51337        TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
51338      if (N->getOpcode() != ISD::DELETED_NODE)
51339        DCI.AddToWorklist(N);
51340      return SDValue(N, 0);
51341    }
51342
51343    SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
51344    SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
51345    if (NewN0 || NewN1)
51346      return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
51347                         NewN1 ? NewN1 : N1);
51348  }
51349
51350  // Attempt to combine a scalar bitmask AND with an extracted shuffle.
51351  if ((VT.getScalarSizeInBits() % 8) == 0 &&
  // NOTE(review): original line 51352 is missing from this condition
  // (presumably the EXTRACT_VECTOR_ELT opcode check on N0) - confirm.
51353      isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
51354    SDValue BitMask = N1;
51355    SDValue SrcVec = N0.getOperand(0);
51356    EVT SrcVecVT = SrcVec.getValueType();
51357
51358    // Check that the constant bitmask masks whole bytes.
51359    APInt UndefElts;
51360    SmallVector<APInt, 64> EltBits;
51361    if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
51362        getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
51363        llvm::all_of(EltBits, [](const APInt &M) {
51364          return M.isZero() || M.isAllOnes();
51365        })) {
51366      unsigned NumElts = SrcVecVT.getVectorNumElements();
51367      unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
51368      unsigned Idx = N0.getConstantOperandVal(1);
51369
51370      // Create a root shuffle mask from the byte mask and the extracted index.
51371      SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
51372      for (unsigned i = 0; i != Scale; ++i) {
51373        if (UndefElts[i])
51374          continue;
51375        int VecIdx = Scale * Idx + i;
51376        ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
51377      }
51378
  // NOTE(review): original lines 51379 and 51381 are missing here (presumably
  // the combineX86ShufflesRecursively call and one of its arguments) - confirm.
51380            {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
51382            /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
51383            /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
51384        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
51385                           N0.getOperand(1));
51386    }
51387  }
51388
51389  if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
51390    return R;
51391
51392  return SDValue();
51393}
51394
51395// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
// NOTE(review): original line 51396 (the first line of the signature of this
// OR(AND(X,C),AND(Y,~C)) canonicalization, per the comment on the preceding
// line) is missing from this listing - confirm against the source.
51397                                     SelectionDAG &DAG,
51398                                     const X86Subtarget &Subtarget) {
51399  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51400
51401  MVT VT = N->getSimpleValueType(0);
51402  unsigned EltSizeInBits = VT.getScalarSizeInBits();
51403  if (!VT.isVector() || (EltSizeInBits % 8) != 0)
51404    return SDValue();
51405
51406  SDValue N0 = peekThroughBitcasts(N->getOperand(0));
51407  SDValue N1 = peekThroughBitcasts(N->getOperand(1));
51408  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
51409    return SDValue();
51410
51411  // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
51412  // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
51413  if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
51414        !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
51415    return SDValue();
51416
51417  // Attempt to extract constant byte masks.
51418  APInt UndefElts0, UndefElts1;
51419  SmallVector<APInt, 32> EltBits0, EltBits1;
51420  if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
51421                                     /*AllowWholeUndefs*/ false,
51422                                     /*AllowPartialUndefs*/ false))
51423    return SDValue();
51424  if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
51425                                     /*AllowWholeUndefs*/ false,
51426                                     /*AllowPartialUndefs*/ false))
51427    return SDValue();
51428
  // The two masks must be exact bitwise complements of each other.
51429  for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
51430    // TODO - add UNDEF elts support.
51431    if (UndefElts0[i] || UndefElts1[i])
51432      return SDValue();
51433    if (EltBits0[i] != ~EltBits1[i])
51434      return SDValue();
51435  }
51436
51437  if (useVPTERNLOG(Subtarget, VT)) {
51438    // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
51439    // VPTERNLOG is only available as vXi32/64-bit types.
51440    MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
51441    MVT OpVT =
51442        MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
51443    SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
51444    SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
51445    SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
51446    SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
51447    SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
51448                                DAG, Subtarget);
51449    return DAG.getBitcast(VT, Res);
51450  }
51451
  // Otherwise rewrite the second AND as ANDNP reusing the first mask, which
  // exposes the OR(AND,ANDNP) blend pattern to later combines.
51452  SDValue X = N->getOperand(0);
51453  SDValue Y =
51454      DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
51455                  DAG.getBitcast(VT, N1.getOperand(0)));
51456  return DAG.getNode(ISD::OR, DL, VT, X, Y);
51457}
51458
51459// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
51460static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
51461 if (N->getOpcode() != ISD::OR)
51462 return false;
51463
51464 SDValue N0 = N->getOperand(0);
51465 SDValue N1 = N->getOperand(1);
51466
51467 // Canonicalize AND to LHS.
51468 if (N1.getOpcode() == ISD::AND)
51469 std::swap(N0, N1);
51470
51471 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
51472 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
51473 return false;
51474
51475 Mask = N1.getOperand(0);
51476 X = N1.getOperand(1);
51477
51478 // Check to see if the mask appeared in both the AND and ANDNP.
51479 if (N0.getOperand(0) == Mask)
51480 Y = N0.getOperand(1);
51481 else if (N0.getOperand(1) == Mask)
51482 Y = N0.getOperand(0);
51483 else
51484 return false;
51485
51486 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
51487 // ANDNP combine allows other combines to happen that prevent matching.
51488 return true;
51489}
51490
51491// Try to fold:
51492// (or (and (m, y), (pandn m, x)))
51493// into:
51494// (vselect m, x, y)
51495// As a special case, try to fold:
51496// (or (and (m, (sub 0, x)), (pandn m, x)))
51497// into:
51498// (sub (xor X, M), M)
// NOTE(review): original line 51499 (the first line of the signature of this
// OR(AND,ANDNP) -> blend/conditional-negate fold, per the comment block
// immediately above) is missing from this listing - confirm.
51500                                             SelectionDAG &DAG,
51501                                             const X86Subtarget &Subtarget) {
51502  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51503
51504  EVT VT = N->getValueType(0);
  // 128-bit needs SSE2; 256-bit needs AVX2 integer ops.
51505  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
51506        (VT.is256BitVector() && Subtarget.hasInt256())))
51507    return SDValue();
51508
51509  SDValue X, Y, Mask;
51510  if (!matchLogicBlend(N, X, Y, Mask))
51511    return SDValue();
51512
51513  // Validate that X, Y, and Mask are bitcasts, and see through them.
51514  Mask = peekThroughBitcasts(Mask);
  // NOTE(review): original lines 51515-51516 are missing from this listing
  // (presumably peekThroughBitcasts on X and Y) - confirm.
51517
51518  EVT MaskVT = Mask.getValueType();
51519  unsigned EltBits = MaskVT.getScalarSizeInBits();
51520
51521  // TODO: Attempt to handle floating point cases as well?
  // The mask must be all-sign-bits per element (all-zeros/all-ones lanes).
51522  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
51523    return SDValue();
51524
51525  // Attempt to combine to conditional negate: (sub (xor X, M), M)
51526  if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
51527                                                           DAG, Subtarget))
51528    return Res;
51529
51530  // PBLENDVB is only available on SSE 4.1.
51531  if (!Subtarget.hasSSE41())
51532    return SDValue();
51533
51534  // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
51535  if (Subtarget.hasVLX())
51536    return SDValue();
51537
51538  MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
51539
  // PBLENDVB operates on bytes, so bitcast everything to vXi8 and back.
51540  X = DAG.getBitcast(BlendVT, X);
51541  Y = DAG.getBitcast(BlendVT, Y);
51542  Mask = DAG.getBitcast(BlendVT, Mask);
51543  Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
51544  return DAG.getBitcast(VT, Mask);
51545}
51546
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
  SDValue Cmp = Op.getOperand(1);
  EVT VT = Cmp.getOperand(0).getValueType();
  unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
  // ctlz(x) equals bitsize(x) only when x == 0, so bit log2(bitsize) of the
  // count is set exactly when x is zero; shifting it down yields the boolean.
  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
  // The result of the shift is true or false, and on X86, the 32-bit
  // encoding of shr and lzcnt is more desirable.
  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
                            DAG.getConstant(Log2b, dl, MVT::i8));
  return Scc;
}
51566
// Try to transform:
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
// Will also attempt to match more generic cases, eg:
//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
                                           const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
    return SDValue();

  // A chain link must be a single-use OR node.
  auto isORCandidate = [](SDValue N) {
    return (N->getOpcode() == ISD::OR && N->hasOneUse());
  };

  // Check the zero extend is extending to 32-bit or more. The code generated by
  // srl(ctlz) for 16-bit or less variants of the pattern would require extra
  // instructions to clear the upper bits.
  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
      !isORCandidate(N->getOperand(0)))
    return SDValue();

  // Check the node matches: setcc(eq, cmp 0)
  auto isSetCCCandidate = [](SDValue N) {
    return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
           X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
           N->getOperand(1).getOpcode() == X86ISD::CMP &&
           isNullConstant(N->getOperand(1).getOperand(1)) &&
           N->getOperand(1).getValueType().bitsGE(MVT::i32);
  };

  SDNode *OR = N->getOperand(0).getNode();
  SDValue LHS = OR->getOperand(0);
  SDValue RHS = OR->getOperand(1);

  // Save nodes matching or(or, setcc(eq, cmp 0)).
  // Walk down the OR chain, remembering each intermediate OR so the chain can
  // be rebuilt bottom-up once the setcc leaves have been converted.
  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
          (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
    ORNodes.push_back(OR);
    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
  }

  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
      !isORCandidate(SDValue(OR, 0)))
    return SDValue();

  // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
  // to
  // or(srl(ctlz),srl(ctlz)).
  // The dag combiner can then fold it into:
  // srl(or(ctlz, ctlz)).
  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
  SDValue Ret, NewRHS;
  if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);

  if (!Ret)
    return SDValue();

  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
  while (!ORNodes.empty()) {
    OR = ORNodes.pop_back_val();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
    if (RHS->getOpcode() == ISD::OR)
      std::swap(LHS, RHS);
    NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
    if (!NewRHS)
      return SDValue();
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
  }

  // Widen the i32 result chain back to the original (>= i32) result type.
  return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
}
51648
                                 SDValue And1_L, SDValue And1_R,
                                 const SDLoc &DL, SelectionDAG &DAG) {
  // And0_L must be a single-use bitwise 'not'; its operand is the mask.
  if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
    return SDValue();
  SDValue NotOp = And0_L->getOperand(0);
  // Commute the second AND if necessary so the mask lines up with And1_L.
  if (NotOp == And1_R)
    std::swap(And1_R, And1_L);
  if (NotOp != And1_L)
    return SDValue();

  // (~(NotOp) & And0_R) | (NotOp & And1_R)
  // --> ((And1_R ^ And0_R) & NotOp) ^ And0_R
  EVT VT = And1_L->getValueType(0);
  // And0_R appears twice in the rewritten expression but only once in the
  // original; freeze it so both uses observe the same value.
  SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
  return Xor1;
}
51669
/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
/// equivalent `((x ^ y) & m) ^ y)` pattern.
/// This is typically a better representation for targets without a fused
/// "and-not" operation. This function is intended to be called from a
/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
  // Note that masked-merge variants using XOR or ADD expressions are
  // normalized to OR by InstCombine so we only check for OR.
  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
  // Both OR operands must be single-use ANDs, otherwise the rewrite would
  // duplicate work rather than replace it.
  SDValue N0 = Node->getOperand(0);
  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
    return SDValue();
  SDValue N1 = Node->getOperand(1);
  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
    return SDValue();

  SDLoc DL(Node);
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  SDValue N10 = N1->getOperand(0);
  SDValue N11 = N1->getOperand(1);
  // Try every placement of the inverted mask among the four AND operands.
  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
    return Result;
  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
    return Result;
  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
    return Result;
  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
    return Result;
  return SDValue();
}
51701
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
                                         SDValue X, SDValue Y,
                                         SelectionDAG &DAG,
                                         bool ZeroSecondOpOnly = false) {
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // Look through a one-use zext.
  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
    Y = Y.getOperand(0);

  // Extract the condition code and the EFLAGS-producing node from Y.
  SDValue EFLAGS;
  if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
    CC = (X86::CondCode)Y.getConstantOperandVal(0);
    EFLAGS = Y.getOperand(1);
  } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
             Y.hasOneUse()) {
    // (AND x, 1) bit extraction: lower to a BT whose carry flag is the bit.
    EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
  }

  if (!EFLAGS)
    return SDValue();

  // If X is -1 or 0, then we have an opportunity to avoid constants required in
  // the general case below.
  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
  if (ConstantX && !ZeroSecondOpOnly) {
    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
        (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
      // This is a complicated way to get -1 or 0 from the carry flag:
      // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
                         EFLAGS);
    }

    if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
        (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
      if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
          EFLAGS.getValueType().isInteger() &&
          !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
        // Swap the operands of a SUB, and we have the same pattern as above.
        // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
        //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
        SDValue NewSub = DAG.getNode(
            X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
            EFLAGS.getOperand(1), EFLAGS.getOperand(0));
        SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
        return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                           DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
                           NewEFLAGS);
      }
    }
  }

  if (CC == X86::COND_B) {
    // X + SETB Z --> adc X, 0
    // X - SETB Z --> sbb X, 0
    return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
                       DAG.getVTList(VT, MVT::i32), X,
                       DAG.getConstant(0, DL, VT), EFLAGS);
  }

  // Callers requesting only the zero-second-operand forms stop here.
  if (ZeroSecondOpOnly)
    return SDValue();

  if (CC == X86::COND_A) {
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
    // cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub =
          DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
                      EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
      return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
                         DAG.getVTList(VT, MVT::i32), X,
                         DAG.getConstant(0, DL, VT), NewEFLAGS);
    }
  }

  if (CC == X86::COND_AE) {
    // X + SETAE --> sbb X, -1
    // X - SETAE --> adc X, -1
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
                       DAG.getVTList(VT, MVT::i32), X,
                       DAG.getAllOnesConstant(DL, VT), EFLAGS);
  }

  if (CC == X86::COND_BE) {
    // X + SETBE --> sbb X, -1
    // X - SETBE --> adc X, -1
    // Try to convert COND_BE into COND_AE in an attempt to facilitate
    // materializing "setae reg".
    //
    // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
    // cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub =
          DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
                      EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
      return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
                         DAG.getVTList(VT, MVT::i32), X,
                         DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
    }
  }

  // Everything below handles only equality tests against zero.
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
      !X86::isZeroNode(EFLAGS.getOperand(1)) ||
      !EFLAGS.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue Z = EFLAGS.getOperand(0);
  EVT ZVT = Z.getValueType();

  // If X is -1 or 0, then we have an opportunity to avoid constants required in
  // the general case below.
  if (ConstantX) {
    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
    // fake operands:
    //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
    // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
    if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
        (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
      SDValue Zero = DAG.getConstant(0, DL, ZVT);
      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
                         SDValue(Neg.getNode(), 1));
    }

    // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
    // with fake operands:
    //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
    // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
    if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
        (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
      SDValue One = DAG.getConstant(1, DL, ZVT);
      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
      SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
                         Cmp1.getValue(1));
    }
  }

  // (cmp Z, 1) sets the carry flag if Z is 0.
  SDValue One = DAG.getConstant(1, DL, ZVT);
  SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
  SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);

  // Add the flags type for ADC/SBB nodes.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
  if (CC == X86::COND_NE)
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
                       DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));

  // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
  // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
                     DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
}
51886
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
                                          SelectionDAG &DAG) {
  bool IsSub = N->getOpcode() == ISD::SUB;
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // Try the operands as-is first.
  if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
    return ADCOrSBB;

  // Commute and try again (negate the result for subtracts).
  // X - Y == -(Y - X), so a match on the commuted form must be negated.
  if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
    if (IsSub)
      ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
    return ADCOrSBB;
  }

  return SDValue();
}
51909
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
                                     SDValue N0, SDValue N1,
                                     SelectionDAG &DAG) {
  assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");

  // Delegate to combineAddOrSubToADCOrSBB if we have:
  //
  //    (xor/or (zero_extend (setcc)) imm)
  //
  // where imm is odd if and only if we have xor, in which case the XOR/OR are
  // equivalent to a SUB/ADD, respectively.
  if (N0.getOpcode() == ISD::ZERO_EXTEND &&
      N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
    if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
      bool IsSub = Opc == ISD::XOR;
      bool N1COdd = N1C->getZExtValue() & 1;
      // zext(setcc) is 0 or 1, so only bit 0 interacts with the immediate:
      // XOR with an odd imm flips that bit (a subtract), OR with an even imm
      // leaves it alone (an add).
      if (IsSub ? N1COdd : !N1COdd)
        if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
          return R;
    }
  }

  // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
  if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
      N0.getOperand(0).getOpcode() == ISD::AND &&
          VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
    return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
                       N0.getOperand(0).getOperand(1));
  }

  return SDValue();
}
51945
                         const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If this is SSE1 only convert to FOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(MVT::v4i32,
                          DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
                                      DAG.getBitcast(MVT::v4f32, N0),
                                      DAG.getBitcast(MVT::v4f32, N1)));
  }

  // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
  // TODO: Support multiple SrcOps.
  if (VT == MVT::i1) {
    SmallVector<APInt, 2> SrcPartials;
    if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
        SrcOps.size() == 1) {
      unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
      EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
      SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
      // Fall back to a plain bitcast if the movmsk-style combine failed but
      // the source type is legal.
      if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
        Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
      if (Mask) {
        assert(SrcPartials[0].getBitWidth() == NumElts &&
               "Unexpected partial reduction mask");
        SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
        SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
        // Any-of reduction: the OR is nonzero iff any demanded mask bit is set.
        Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
        return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
      }
    }
  }

  if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
    return SetCC;

  if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
    return R;

  if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
    return R;

  if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
                                                 DAG, DCI, Subtarget))
    return FPLogic;

  // The remaining folds create target nodes; wait until after legalization.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
    return R;

  if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
    return R;

  // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
  if ((VT == MVT::i32 || VT == MVT::i64) &&
      N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
      isNullConstant(N0.getOperand(0))) {
    SDValue Cond = N0.getOperand(1);
    if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
      Cond = Cond.getOperand(0);

    if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
      if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
        uint64_t Val = CN->getZExtValue();
        // These values of C give C+1 in {2,3,4,5,8,9}: multiplies that LEA
        // (scale 1/2/4/8, optionally plus base) can materialize.
        if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
          X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);

          SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
          R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
          R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
          return R;
        }
      }
    }
  }

  // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
  // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
  // iff the upper elements of the non-shifted arg are zero.
  // KUNPCK require 16+ bool vector elements.
  if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned HalfElts = NumElts / 2;
    APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
    if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
        N1.getConstantOperandAPInt(1) == HalfElts &&
        DAG.MaskedVectorIsZero(N0, UpperElts)) {
      return DAG.getNode(
          ISD::CONCAT_VECTORS, dl, VT,
          extractSubVector(N0, 0, DAG, dl, HalfElts),
          extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
    }
    if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
        N0.getConstantOperandAPInt(1) == HalfElts &&
        DAG.MaskedVectorIsZero(N1, UpperElts)) {
      return DAG.getNode(
          ISD::CONCAT_VECTORS, dl, VT,
          extractSubVector(N1, 0, DAG, dl, HalfElts),
          extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
    }
  }

  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    // Attempt to recursively combine an OR of shuffles.
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
      return Res;

    // If either operand is a constant mask, then only the elements that aren't
    // allones are actually demanded by the other operand.
    auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
      APInt UndefElts;
      SmallVector<APInt> EltBits;
      int NumElts = VT.getVectorNumElements();
      int EltSizeInBits = VT.getScalarSizeInBits();
      if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
        return false;

      // An all-ones constant lane forces the OR result to -1 regardless of
      // the other operand, so only non-all-ones lanes are demanded.
      APInt DemandedElts = APInt::getZero(NumElts);
      for (int I = 0; I != NumElts; ++I)
        if (!EltBits[I].isAllOnes())
          DemandedElts.setBit(I);

      return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
    };
    if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // We should fold "masked merge" patterns when `andn` is not available.
  if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
    if (SDValue R = foldMaskedMerge(N, DAG))
      return R;

  if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
    return R;

  return SDValue();
}
52105
/// Try to turn tests against the signbit in the form of:
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
  // This is only worth doing if the output type is i8 or i1.
  EVT ResultType = N->getValueType(0);
  if (ResultType != MVT::i8 && ResultType != MVT::i1)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We should be performing an xor against a truncated shift.
  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
    return SDValue();

  // Make sure we are performing an xor against one.
  // XOR with 1 inverts the extracted sign bit, i.e. tests "sign bit clear".
  if (!isOneConstant(N1))
    return SDValue();

  // SetCC on x86 zero extends so only act on this if it's a logical shift.
  SDValue Shift = N0.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
    return SDValue();

  // Make sure we are truncating from one of i16, i32 or i64.
  EVT ShiftTy = Shift.getValueType();
  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
    return SDValue();

  // Make sure the shift amount extracts the sign bit.
  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
      Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
    return SDValue();

  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical looking
  // comparison, using SETGT matches up with what TranslateX86CC.
  SDLoc DL(N);
  SDValue ShiftOp = Shift.getOperand(0);
  EVT ShiftOpTy = ShiftOp.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
                                               *DAG.getContext(), ResultType);
  SDValue Cond =
      DAG.getSetCC(DL, SetCCResultType, ShiftOp,
                   DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
  // Widen the boolean back to the requested result type if needed.
  if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
  return Cond;
}
52158
/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
///
/// This should be called before type legalization because the pattern may not
/// persist after that.
                                         const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isSimple())
    return SDValue();

  // Only the vector types with a native PCMPGT are handled:
  // 128-bit with SSE2, 256-bit with AVX2.
  switch (VT.getSimpleVT().SimpleTy) {
  // clang-format off
  default: return SDValue();
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32:
  case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
  // clang-format on
  }

  // There must be a shift right algebraic before the xor, and the xor must be a
  // 'not' operation.
  SDValue Shift = N->getOperand(0);
  SDValue Ones = N->getOperand(1);
  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
    return SDValue();

  // The shift should be smearing the sign bit across each vector element.
  auto *ShiftAmt =
      isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
  if (!ShiftAmt ||
      ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
    return SDValue();

  // Create a greater-than comparison against -1. We don't use the more obvious
  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
  return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
}
52205
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
///    Return the source value x to be truncated or SDValue() if the pattern was
///    not matched.
///
/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
///    where C1 >= 0 and C2 is unsigned max of destination type.
///
///    (truncate (smax (smin (x, C2), C1)) to dest_type)
///    where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
///
///    These two patterns are equivalent to:
///    (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
///    So return the smax(x, C1) value to be truncated or SDValue() if the
///    pattern was not matched.
                                 const SDLoc &DL) {
  using namespace llvm::SDPatternMatch;
  EVT InVT = In.getValueType();

  // Saturation with truncation. We truncate from InVT to VT.
         "Unexpected types for truncate operation");

  APInt C1, C2;

  // Pattern 1: umin against the destination type's unsigned max.
  // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
  // the element size of the destination type.
  if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
      C2.isMask(VT.getScalarSizeInBits()))
    return UMin;

  // Pattern 2a: smin(smax(x, C1), C2) — return the inner smin chain.
  if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
      sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
      C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
    return SMin;

  // Pattern 2b: smax(smin(x, C2), C1) — rebuild smax(x, C1) for the caller.
  if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
      sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
      C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
    return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));

  return SDValue();
}
52252
52253/// Detect patterns of truncation with signed saturation:
52254/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52255/// signed_max_of_dest_type)) to dest_type)
52256/// or:
52257/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52258/// signed_min_of_dest_type)) to dest_type).
52259/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52260/// Return the source value to be truncated or SDValue() if the pattern was not
52261/// matched.
52262static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52263 using namespace llvm::SDPatternMatch;
52264 unsigned NumDstBits = VT.getScalarSizeInBits();
52265 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52266 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52267
52268 APInt SignedMax, SignedMin;
52269 if (MatchPackUS) {
52270 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52271 SignedMin = APInt::getZero(NumSrcBits);
52272 } else {
52273 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52274 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52275 }
52276
52277 SDValue SMin, SMax;
52278 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52279 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52280 return SMax;
52281
52282 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52283 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52284 return SMin;
52285
52286 return SDValue();
52287}
52288
52290 SelectionDAG &DAG,
52291 const X86Subtarget &Subtarget) {
52292 if (!Subtarget.hasSSE2() || !VT.isVector())
52293 return SDValue();
52294
52295 EVT SVT = VT.getVectorElementType();
52296 EVT InVT = In.getValueType();
52297 EVT InSVT = InVT.getVectorElementType();
52298
52299 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52300 // split across two registers. We can use a packusdw+perm to clamp to 0-65535
52301 // and concatenate at the same time. Then we can use a final vpmovuswb to
52302 // clip to 0-255.
52303 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52304 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52305 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52306 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52307 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52308 DL, DAG, Subtarget);
52309 assert(Mid && "Failed to pack!");
52310 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52311 }
52312 }
52313
52314 // vXi32 truncate instructions are available with AVX512F.
52315 // vXi16 truncate instructions are only available with AVX512BW.
52316 // For 256-bit or smaller vectors, we require VLX.
52317 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52318 // If the result type is 256-bits or larger and we have disable 512-bit
52319 // registers, we should go ahead and use the pack instructions if possible.
52320 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52321 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52322 (InVT.getSizeInBits() > 128) &&
52323 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52324 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52325
52326 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52328 (SVT == MVT::i8 || SVT == MVT::i16) &&
52329 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52330 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52331 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52332 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52333 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52334 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52335 DAG, Subtarget);
52336 assert(Mid && "Failed to pack!");
52338 Subtarget);
52339 assert(V && "Failed to pack!");
52340 return V;
52341 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52342 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52343 Subtarget);
52344 }
52345 if (SDValue SSatVal = detectSSatPattern(In, VT))
52346 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52347 Subtarget);
52348 }
52349
52350 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52351 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52352 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52353 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52354 unsigned TruncOpc = 0;
52355 SDValue SatVal;
52356 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52357 SatVal = SSatVal;
52358 TruncOpc = X86ISD::VTRUNCS;
52359 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52360 SatVal = USatVal;
52361 TruncOpc = X86ISD::VTRUNCUS;
52362 }
52363 if (SatVal) {
52364 unsigned ResElts = VT.getVectorNumElements();
52365 // If the input type is less than 512 bits and we don't have VLX, we need
52366 // to widen to 512 bits.
52367 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52368 unsigned NumConcats = 512 / InVT.getSizeInBits();
52369 ResElts *= NumConcats;
52370 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52371 ConcatOps[0] = SatVal;
52372 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52373 NumConcats * InVT.getVectorNumElements());
52374 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52375 }
52376 // Widen the result if its narrower than 128 bits.
52377 if (ResElts * SVT.getSizeInBits() < 128)
52378 ResElts = 128 / SVT.getSizeInBits();
52379 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52380 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52381 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52382 DAG.getVectorIdxConstant(0, DL));
52383 }
52384 }
52385
52386 return SDValue();
52387}
52388
// Try to replace a 128/256-bit constant-pool vector load with an extract of
// the low subvector of a *wider* constant load/broadcast already present on
// the same chain, when the wider constant's low bits match this load's
// constant. This removes a redundant load of duplicated constant data.
// NOTE(review): the line declaring this function's name and first parameters
// is elided in this rendering (callers invoke it as
// combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget) — see combineLoad
// below); additional continuation lines are also elided — confirm against
// the full source.
52390 SelectionDAG &DAG,
52392 const X86Subtarget &Subtarget) {
52393 auto *Ld = cast<LoadSDNode>(N);
52394 EVT RegVT = Ld->getValueType(0);
52395 SDValue Ptr = Ld->getBasePtr();
52396 SDValue Chain = Ld->getChain();
52397 ISD::LoadExtType Ext = Ld->getExtensionType();
52398
// Only plain (non-extending), simple loads on AVX targets are candidates.
52399 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52400 return SDValue();
52401
52402 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
52403 return SDValue();
52404
// NOTE(review): the line initializing LdC (the constant-pool Constant for
// this load's pointer) is elided in this rendering — confirm upstream.
52406 if (!LdC)
52407 return SDValue();
52408
// Returns true if every non-undef element bit of this load matches the
// corresponding defined bit of the wider user constant.
52409 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
52410 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
52411 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
52412 if (Undefs[I])
52413 continue;
52414 if (UserUndefs[I] || Bits[I] != UserBits[I])
52415 return false;
52416 }
52417 return true;
52418 };
52419
52420 // Look through all other loads/broadcasts in the chain for another constant
52421 // pool entry.
52422 for (SDNode *User : Chain->users()) {
52423 auto *UserLd = dyn_cast<MemSDNode>(User);
// Candidate users are wider-than-RegVT broadcast/subvector loads on the
// same chain whose chain result is otherwise unused.
// NOTE(review): one opcode alternative in this condition is elided in this
// rendering — confirm against the full source.
52424 if (User != N && UserLd &&
52425 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
52426 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
52428 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
52429 User->getValueSizeInBits(0).getFixedValue() >
52430 RegVT.getFixedSizeInBits()) {
52431 EVT UserVT = User->getValueType(0);
52432 SDValue UserPtr = UserLd->getBasePtr();
52433 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
52434
52435 // See if we are loading a constant that matches in the lower
52436 // bits of a longer constant (but from a different constant pool ptr).
52437 if (UserC && UserPtr != Ptr) {
52438 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
52439 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
52440 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
52441 APInt Undefs, UserUndefs;
52442 SmallVector<APInt> Bits, UserBits;
// Compare at the narrower of the two scalar widths so element bit
// patterns line up.
52443 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
52444 UserVT.getScalarSizeInBits());
// NOTE(review): a continuation line of this condition (the call decoding
// the user's constant bits) is elided in this rendering — confirm upstream.
52445 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
52446 Bits) &&
52448 UserUndefs, UserBits)) {
52449 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
// Reuse the wider value: extract the low RegVT-sized subvector and
// replace both the value and the chain result of this load.
52450 SDValue Extract = extractSubVector(
52451 SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
52452 Extract = DAG.getBitcast(RegVT, Extract);
52453 return DCI.CombineTo(N, Extract, SDValue(User, 1));
52454 }
52455 }
52456 }
52457 }
52458 }
52459 }
52460
52461 return SDValue();
52462}
52463
// DAG combine for ISD::LOAD on x86: splits slow/non-temporal 256-bit loads,
// re-types boolean-vector loads as integer loads, reuses wider broadcasts of
// the same memory, folds constant-pool loads into wider ones, and rewrites
// ptr32/ptr64 address-space pointers before the load.
// NOTE(review): the lines declaring this function's name and first
// parameters (presumably combineLoad(SDNode *N, SelectionDAG &DAG,
// TargetLowering::DAGCombinerInfo &DCI,) are elided in this rendering —
// confirm against the full source.
52466 const X86Subtarget &Subtarget) {
52467 auto *Ld = cast<LoadSDNode>(N);
52468 EVT RegVT = Ld->getValueType(0);
52469 EVT MemVT = Ld->getMemoryVT();
52470 SDLoc dl(Ld);
52471 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52472
52473 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
52474 // into two 16-byte operations. Also split non-temporal aligned loads on
52475 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
52476 ISD::LoadExtType Ext = Ld->getExtensionType();
52477 unsigned Fast;
52478 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
52479 Ext == ISD::NON_EXTLOAD &&
52480 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
52481 Ld->getAlign() >= Align(16)) ||
52482 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
52483 *Ld->getMemOperand(), &Fast) &&
52484 !Fast))) {
52485 unsigned NumElems = RegVT.getVectorNumElements();
52486 if (NumElems < 2)
52487 return SDValue();
52488
// Load the two 16-byte halves separately and reassemble with
// CONCAT_VECTORS; a TokenFactor merges the two chain results.
52489 unsigned HalfOffset = 16;
52490 SDValue Ptr1 = Ld->getBasePtr();
52491 SDValue Ptr2 =
52492 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
52493 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
52494 NumElems / 2);
52495 SDValue Load1 =
52496 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
52497 Ld->getOriginalAlign(),
52498 Ld->getMemOperand()->getFlags());
52499 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
52500 Ld->getPointerInfo().getWithOffset(HalfOffset),
52501 Ld->getOriginalAlign(),
52502 Ld->getMemOperand()->getFlags());
52503 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
52504 Load1.getValue(1), Load2.getValue(1));
52505
52506 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
52507 return DCI.CombineTo(N, NewVec, TF, true);
52508 }
52509
52510 // Bool vector load - attempt to cast to an integer, as we have good
52511 // (vXiY *ext(vXi1 bitcast(iX))) handling.
52512 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
52513 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
52514 unsigned NumElts = RegVT.getVectorNumElements();
52515 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52516 if (TLI.isTypeLegal(IntVT)) {
52517 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
52518 Ld->getPointerInfo(),
52519 Ld->getOriginalAlign(),
52520 Ld->getMemOperand()->getFlags());
52521 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
52522 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
52523 }
52524 }
52525
52526 // If we also broadcast this vector to a wider type, then just extract the
52527 // lowest subvector.
52528 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
52529 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
52530 SDValue Ptr = Ld->getBasePtr();
52531 SDValue Chain = Ld->getChain();
// Find a SUBV_BROADCAST_LOAD of the same pointer/size on the same chain
// whose result is wider; its low subvector equals this load's value.
52532 for (SDNode *User : Chain->users()) {
52533 auto *UserLd = dyn_cast<MemSDNode>(User);
52534 if (User != N && UserLd &&
52535 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
52536 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
52537 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
52538 !User->hasAnyUseOfValue(1) &&
52539 User->getValueSizeInBits(0).getFixedValue() >
52540 RegVT.getFixedSizeInBits()) {
52541 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
52542 RegVT.getSizeInBits());
52543 Extract = DAG.getBitcast(RegVT, Extract);
52544 return DCI.CombineTo(N, Extract, SDValue(User, 1));
52545 }
52546 }
52547 }
52548
52549 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
52550 return V;
52551
52552 // Cast ptr32 and ptr64 pointers to the default address space before a load.
52553 unsigned AddrSpace = Ld->getAddressSpace();
52554 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
52555 AddrSpace == X86AS::PTR32_UPTR) {
52556 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
52557 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
52558 SDValue Cast =
52559 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
52560 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
52561 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
52562 Ld->getMemOperand()->getFlags());
52563 }
52564 }
52565
52566 return SDValue();
52567}
52568
52569/// If V is a build vector of boolean constants and exactly one of those
52570/// constants is true, return the operand index of that true element.
52571/// Otherwise, return -1.
52572static int getOneTrueElt(SDValue V) {
52573 // This needs to be a build vector of booleans.
52574 // TODO: Checking for the i1 type matches the IR definition for the mask,
52575 // but the mask check could be loosened to i8 or other types. That might
52576 // also require checking more than 'allOnesValue'; eg, the x86 HW
52577 // instructions only require that the MSB is set for each mask element.
52578 // The ISD::MSTORE comments/definition do not specify how the mask operand
52579 // is formatted.
52580 auto *BV = dyn_cast<BuildVectorSDNode>(V);
52581 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
52582 return -1;
52583
52584 int TrueIndex = -1;
52585 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
52586 for (unsigned i = 0; i < NumElts; ++i) {
52587 const SDValue &Op = BV->getOperand(i);
52588 if (Op.isUndef())
52589 continue;
52590 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
52591 if (!ConstNode)
52592 return -1;
52593 if (ConstNode->getAPIntValue().countr_one() >= 1) {
52594 // If we already found a one, this is too many.
52595 if (TrueIndex >= 0)
52596 return -1;
52597 TrueIndex = i;
52598 }
52599 }
52600 return TrueIndex;
52601}
52602
52603/// Given a masked memory load/store operation, return true if it has one mask
52604/// bit set. If it has one mask bit set, then also return the memory address of
52605/// the scalar element to load/store, the vector index to insert/extract that
52606/// scalar element, and the alignment for the scalar memory access.
// NOTE(review): the line declaring this function's name and first parameter
// is elided in this rendering; callers below invoke it as
// getParamsForOneTrueMaskedElt(ML/MS, DAG, Addr, VecIndex, Alignment,
// Offset) — confirm against the full source.
52608 SelectionDAG &DAG, SDValue &Addr,
52609 SDValue &Index, Align &Alignment,
52610 unsigned &Offset) {
// Bail out unless exactly one mask element is a true constant.
52611 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
52612 if (TrueMaskElt < 0)
52613 return false;
52614
52615 // Get the address of the one scalar element that is specified by the mask
52616 // using the appropriate offset from the base pointer.
52617 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
52618 Offset = 0;
52619 Addr = MaskedOp->getBasePtr();
52620 if (TrueMaskElt != 0) {
52621 Offset = TrueMaskElt * EltVT.getStoreSize();
// NOTE(review): the line computing the offset pointer (whose continuation
// is the "SDLoc(MaskedOp));" argument below) is elided in this rendering —
// confirm against the full source.
52623 SDLoc(MaskedOp));
52624 }
52625
52626 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
// Scalar access alignment is the original vector alignment reduced to the
// element's store size.
52627 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
52628 EltVT.getStoreSize());
52629 return true;
52630}
52631
52632/// If exactly one element of the mask is set for a non-extending masked load,
52633/// it is a scalar load and vector insert.
52634/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52635/// mask have already been optimized in IR, so we don't bother with those here.
52636static SDValue
// NOTE(review): the line with this function's name and first parameters is
// elided in this rendering; it is invoked below as
// reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget) — confirm against
// the full source.
52639 const X86Subtarget &Subtarget) {
52640 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52641 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52642 // However, some target hooks may need to be added to know when the transform
52643 // is profitable. Endianness would also have to be considered.
52644
52645 SDValue Addr, VecIndex;
52646 Align Alignment;
52647 unsigned Offset;
52648 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
52649 return SDValue();
52650
52651 // Load the one scalar element that is specified by the mask using the
52652 // appropriate offset from the base pointer.
52653 SDLoc DL(ML);
52654 EVT VT = ML->getValueType(0);
52655 EVT EltVT = VT.getVectorElementType();
52656
// On 32-bit targets i64 is not a legal scalar; perform the element load and
// insert as f64 and bitcast the result back.
52657 EVT CastVT = VT;
52658 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
52659 EltVT = MVT::f64;
52660 CastVT = VT.changeVectorElementType(EltVT);
52661 }
52662
52663 SDValue Load =
52664 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
52665 ML->getPointerInfo().getWithOffset(Offset),
52666 Alignment, ML->getMemOperand()->getFlags());
52667
52668 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
52669
52670 // Insert the loaded element into the appropriate place in the vector.
52671 SDValue Insert =
52672 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
52673 Insert = DAG.getBitcast(VT, Insert);
// Replace both the loaded value and the chain result of the masked load.
52674 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
52675}
52676
// Combine a masked load whose mask is a constant build vector: either load
// the whole vector (when first and last lanes are demanded) or re-emit the
// masked load with an undef pass-through plus an explicit select, which can
// lower to a cheaper blend.
// NOTE(review): the line with this function's name and parameters is elided
// in this rendering; it is invoked below as
// combineMaskedLoadConstantMask(Mld, DAG, DCI) — confirm against the full
// source.
52677static SDValue
52680 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52681 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
52682 return SDValue();
52683
52684 SDLoc DL(ML);
52685 EVT VT = ML->getValueType(0);
52686
52687 // If we are loading the first and last elements of a vector, it is safe and
52688 // always faster to load the whole vector. Replace the masked load with a
52689 // vector load and select.
52690 unsigned NumElts = VT.getVectorNumElements();
52691 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
52692 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
52693 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
52694 if (LoadFirstElt && LoadLastElt) {
52695 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
52696 ML->getMemOperand());
52697 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
52698 ML->getPassThru());
52699 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
52700 }
52701
52702 // Convert a masked load with a constant mask into a masked load and a select.
52703 // This allows the select operation to use a faster kind of select instruction
52704 // (for example, vblendvps -> vblendps).
52705
52706 // Don't try this if the pass-through operand is already undefined. That would
52707 // cause an infinite loop because that's what we're about to create.
52708 if (ML->getPassThru().isUndef())
52709 return SDValue();
52710
// All-zero pass-through is left alone as well.
52711 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
52712 return SDValue();
52713
52714 // The new masked load has an undef pass-through operand. The select uses the
52715 // original pass-through operand.
52716 SDValue NewML = DAG.getMaskedLoad(
52717 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
52718 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
52719 ML->getAddressingMode(), ML->getExtensionType());
52720 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
52721 ML->getPassThru());
52722
52723 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
52724}
52725
// DAG combine for ISD::MLOAD: reduce one-active-lane masked loads to scalar
// loads, turn constant-mask loads into load+select (pre-AVX512), and simplify
// a legalized (non-i1) mask by demanding only each lane's sign bit.
// NOTE(review): the lines declaring this function's name and first
// parameters are elided in this rendering — confirm against the full source.
52728 const X86Subtarget &Subtarget) {
52729 auto *Mld = cast<MaskedLoadSDNode>(N);
52730
52731 // TODO: Expanding load with constant mask may be optimized as well.
52732 if (Mld->isExpandingLoad())
52733 return SDValue();
52734
52735 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
52736 if (SDValue ScalarLoad =
52737 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
52738 return ScalarLoad;
52739
52740 // TODO: Do some AVX512 subsets benefit from this transform?
52741 if (!Subtarget.hasAVX512())
52742 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
52743 return Blend;
52744 }
52745
52746 // If the mask value has been legalized to a non-boolean vector, try to
52747 // simplify ops leading up to it. We only demand the MSB of each lane.
52748 SDValue Mask = Mld->getMask();
52749 if (Mask.getScalarValueSizeInBits() != 1) {
52750 EVT VT = Mld->getValueType(0);
52751 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): the line building the DemandedBits (sign-bit) mask is
// elided in this rendering — confirm against the full source.
52753 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
// Mask was simplified in place; revisit this node unless it was deleted.
52754 if (N->getOpcode() != ISD::DELETED_NODE)
52755 DCI.AddToWorklist(N);
52756 return SDValue(N, 0);
52757 }
// NOTE(review): the continuation producing NewMask is elided in this
// rendering — confirm against the full source.
52758 if (SDValue NewMask =
52760 return DAG.getMaskedLoad(
52761 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
52762 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
52763 Mld->getAddressingMode(), Mld->getExtensionType());
52764 }
52765
52766 return SDValue();
52767}
52768
52769/// If exactly one element of the mask is set for a non-truncating masked store,
52770/// it is a vector extract and scalar store.
52771/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52772/// mask have already been optimized in IR, so we don't bother with those here.
// NOTE(review): the line declaring this function's name and first parameter
// is elided in this rendering; it is invoked below as
// reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget) — confirm against the
// full source.
52774 SelectionDAG &DAG,
52775 const X86Subtarget &Subtarget) {
52776 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52777 // However, some target hooks may need to be added to know when the transform
52778 // is profitable. Endianness would also have to be considered.
52779
52780 SDValue Addr, VecIndex;
52781 Align Alignment;
52782 unsigned Offset;
52783 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
52784 return SDValue();
52785
52786 // Extract the one scalar element that is actually being stored.
52787 SDLoc DL(MS);
52788 SDValue Value = MS->getValue();
52789 EVT VT = Value.getValueType();
52790 EVT EltVT = VT.getVectorElementType();
// On 32-bit targets i64 is not a legal scalar; extract/store the lane as
// f64 instead by bitcasting the source vector.
52791 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
52792 EltVT = MVT::f64;
52793 EVT CastVT = VT.changeVectorElementType(EltVT);
52794 Value = DAG.getBitcast(CastVT, Value);
52795 }
52796 SDValue Extract =
52797 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
52798
52799 // Store that element at the appropriate offset from the base pointer.
// NOTE(review): the continuation line carrying the PointerInfo argument of
// this store is elided in this rendering — confirm against the full source.
52800 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
52802 Alignment, MS->getMemOperand()->getFlags());
52803}
52804
// DAG combine for ISD::MSTORE: reduce one-active-lane masked stores to a
// scalar store, simplify a legalized (non-i1) mask via its sign bits, and
// fold a one-use TRUNCATE of the stored value into a truncating masked store
// when legal.
// NOTE(review): the lines declaring this function's name and first
// parameters are elided in this rendering — confirm against the full source.
52807 const X86Subtarget &Subtarget) {
52808 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
// Compressing stores are not handled here.
52809 if (Mst->isCompressingStore())
52810 return SDValue();
52811
52812 EVT VT = Mst->getValue().getValueType();
52813 SDLoc dl(Mst);
52814 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52815
52816 if (Mst->isTruncatingStore())
52817 return SDValue();
52818
52819 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
52820 return ScalarStore;
52821
52822 // If the mask value has been legalized to a non-boolean vector, try to
52823 // simplify ops leading up to it. We only demand the MSB of each lane.
52824 SDValue Mask = Mst->getMask();
52825 if (Mask.getScalarValueSizeInBits() != 1) {
// NOTE(review): the line building the DemandedBits (sign-bit) mask is
// elided in this rendering — confirm against the full source.
52827 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
52828 if (N->getOpcode() != ISD::DELETED_NODE)
52829 DCI.AddToWorklist(N);
52830 return SDValue(N, 0);
52831 }
// NOTE(review): the continuation producing NewMask is elided in this
// rendering — confirm against the full source.
52832 if (SDValue NewMask =
52834 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
52835 Mst->getBasePtr(), Mst->getOffset(), NewMask,
52836 Mst->getMemoryVT(), Mst->getMemOperand(),
52837 Mst->getAddressingMode());
52838 }
52839
// Fold a one-use truncate of the stored value into a truncating masked
// store (final 'true' argument) when the trunc-store is legal.
52840 SDValue Value = Mst->getValue();
52841 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
52842 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
52843 Mst->getMemoryVT())) {
52844 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
52845 Mst->getBasePtr(), Mst->getOffset(), Mask,
52846 Mst->getMemoryVT(), Mst->getMemOperand(),
52847 Mst->getAddressingMode(), true);
52848 }
52849
52850 return SDValue();
52851}
52852
// DAG combine for ISD::STORE on x86. Handles, in order: vXi1 re-typing /
// widening / constant folding, scalar fabs/fneg-of-load stores as integer
// logic ops, splitting slow or non-temporal wide stores, several truncating
// store folds (VTRUNC[US], extract-of-VTRUNC, saturation patterns), ptr32/
// ptr64 address-space rewrites, cmov-store to CSTORE (CF), and i64
// load->store pairs via f64 on 32-bit SSE2 targets.
// NOTE(review): the lines declaring this function's name and first
// parameters are elided in this rendering — confirm against the full source.
52855 const X86Subtarget &Subtarget) {
52856 StoreSDNode *St = cast<StoreSDNode>(N);
52857 EVT StVT = St->getMemoryVT();
52858 SDLoc dl(St);
52859 SDValue StoredVal = St->getValue();
52860 EVT VT = StoredVal.getValueType();
52861 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52862
52863 // Convert a store of vXi1 into a store of iX and a bitcast.
52864 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
52865 VT.getVectorElementType() == MVT::i1) {
52866
// NOTE(review): the line defining NewVT (the integer type used by this
// bitcast) is elided in this rendering — confirm against the full source.
52868 StoredVal = DAG.getBitcast(NewVT, StoredVal);
52869
52870 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52871 St->getPointerInfo(), St->getOriginalAlign(),
52872 St->getMemOperand()->getFlags());
52873 }
52874
52875 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
52876 // This will avoid a copy to k-register.
52877 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
52878 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
52879 StoredVal.getOperand(0).getValueType() == MVT::i8) {
52880 SDValue Val = StoredVal.getOperand(0);
52881 // We must store zeros to the unused bits.
52882 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
52883 return DAG.getStore(St->getChain(), dl, Val,
52884 St->getBasePtr(), St->getPointerInfo(),
52885 St->getOriginalAlign(),
52886 St->getMemOperand()->getFlags());
52887 }
52888
52889 // Widen v2i1/v4i1 stores to v8i1.
52890 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
52891 Subtarget.hasAVX512()) {
52892 unsigned NumConcats = 8 / VT.getVectorNumElements();
52893 // We must store zeros to the unused bits.
52894 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
52895 Ops[0] = StoredVal;
52896 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
52897 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52898 St->getPointerInfo(), St->getOriginalAlign(),
52899 St->getMemOperand()->getFlags());
52900 }
52901
52902 // Turn vXi1 stores of constants into a scalar store.
// NOTE(review): a continuation line of this condition (presumably checking
// that StoredVal is a constant build vector) is elided in this rendering —
// confirm against the full source.
52903 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
52904 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
52906 // If its a v64i1 store without 64-bit support, we need two stores.
52907 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
// Split the 64 mask bits into two 32-bit halves stored 4 bytes apart.
// NOTE(review): the lines converting Lo/Hi to integers are elided in this
// rendering — confirm against the full source.
52908 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
52909 StoredVal->ops().slice(0, 32));
52911 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
52912 StoredVal->ops().slice(32, 32));
52914
52915 SDValue Ptr0 = St->getBasePtr();
52916 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
52917
52918 SDValue Ch0 =
52919 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
52920 St->getOriginalAlign(),
52921 St->getMemOperand()->getFlags());
// NOTE(review): the continuation carrying the offset PointerInfo of the
// high-half store is elided in this rendering — confirm upstream.
52922 SDValue Ch1 =
52923 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
52925 St->getOriginalAlign(),
52926 St->getMemOperand()->getFlags());
52927 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
52928 }
52929
52930 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
52931 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52932 St->getPointerInfo(), St->getOriginalAlign(),
52933 St->getMemOperand()->getFlags());
52934 }
52935
52936 // Convert scalar fabs/fneg load-store to integer equivalents.
52937 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
52938 (StoredVal.getOpcode() == ISD::FABS ||
52939 StoredVal.getOpcode() == ISD::FNEG) &&
52940 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
52941 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
52942 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
52943 if (TLI.isTypeLegal(IntVT)) {
// FNEG flips the sign bit (XOR sign mask); FABS clears it (AND ~sign mask).
// NOTE(review): the line initializing SignMask is elided in this rendering
// — confirm against the full source.
52945 unsigned SignOp = ISD::XOR;
52946 if (StoredVal.getOpcode() == ISD::FABS) {
52947 SignMask = ~SignMask;
52948 SignOp = ISD::AND;
52949 }
52950 SDValue LogicOp = DAG.getNode(
52951 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
52952 DAG.getConstant(SignMask, dl, IntVT));
52953 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
52954 St->getPointerInfo(), St->getOriginalAlign(),
52955 St->getMemOperand()->getFlags());
52956 }
52957 }
52958
52959 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
52960 // Sandy Bridge, perform two 16-byte stores.
52961 unsigned Fast;
52962 if (VT.is256BitVector() && StVT == VT &&
52963 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
52964 *St->getMemOperand(), &Fast) &&
52965 !Fast) {
52966 unsigned NumElems = VT.getVectorNumElements();
52967 if (NumElems < 2)
52968 return SDValue();
52969
52970 return splitVectorStore(St, DAG);
52971 }
52972
52973 // Split under-aligned vector non-temporal stores.
52974 if (St->isNonTemporal() && StVT == VT &&
52975 St->getAlign().value() < VT.getStoreSize()) {
52976 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
52977 // vectors or the legalizer can scalarize it to use MOVNTI.
52978 if (VT.is256BitVector() || VT.is512BitVector()) {
52979 unsigned NumElems = VT.getVectorNumElements();
52980 if (NumElems < 2)
52981 return SDValue();
52982 return splitVectorStore(St, DAG);
52983 }
52984
52985 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
52986 // to use MOVNTI.
52987 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
52988 MVT NTVT = Subtarget.hasSSE4A()
52989 ? MVT::v2f64
52990 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
52991 return scalarizeVectorStore(St, NTVT, DAG);
52992 }
52993 }
52994
52995 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
52996 // supported, but avx512f is by extending to v16i32 and truncating.
52997 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
52998 St->getValue().getOpcode() == ISD::TRUNCATE &&
52999 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53000 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53001 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53002 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53003 St->getValue().getOperand(0));
53004 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53005 MVT::v16i8, St->getMemOperand());
53006 }
53007
53008 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53009 if (!St->isTruncatingStore() &&
53010 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53011 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53012 StoredVal.hasOneUse() &&
53013 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53014 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53015 return EmitTruncSStore(IsSigned, St->getChain(),
53016 dl, StoredVal.getOperand(0), St->getBasePtr(),
53017 VT, St->getMemOperand(), DAG);
53018 }
53019
53020 // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
53021 if (!St->isTruncatingStore()) {
// Peel an optional one-use TRUNCATE, then match a one-use extract of
// element 0 (EXTRACT_VECTOR_ELT or PEXTRW), returning its vector source.
53022 auto IsExtractedElement = [](SDValue V) {
53023 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53024 V = V.getOperand(0);
53025 unsigned Opc = V.getOpcode();
53026 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53027 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53028 V.getOperand(0).hasOneUse())
53029 return V.getOperand(0);
53030 return SDValue();
53031 };
53032 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53033 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53034 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53035 SDValue Src = Trunc.getOperand(0);
53036 MVT DstVT = Trunc.getSimpleValueType();
53037 MVT SrcVT = Src.getSimpleValueType();
53038 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53039 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53040 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
// Only fold when the truncated result exactly fills the stored width.
53041 if (NumTruncBits == VT.getSizeInBits() &&
53042 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53043 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53044 TruncVT, St->getMemOperand());
53045 }
53046 }
53047 }
53048 }
53049
53050 // Optimize trunc store (of multiple scalars) to shuffle and store.
53051 // First, pack all of the elements in one place. Next, store to memory
53052 // in fewer chunks.
53053 if (St->isTruncatingStore() && VT.isVector()) {
53054 if (TLI.isTruncStoreLegal(VT, StVT)) {
53055 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53056 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53057 dl, Val, St->getBasePtr(),
53058 St->getMemoryVT(), St->getMemOperand(), DAG);
53059 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53060 DAG, dl))
53061 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53062 dl, Val, St->getBasePtr(),
53063 St->getMemoryVT(), St->getMemOperand(), DAG);
53064 }
53065
53066 return SDValue();
53067 }
53068
53069 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53070 unsigned AddrSpace = St->getAddressSpace();
53071 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53072 AddrSpace == X86AS::PTR32_UPTR) {
53073 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53074 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53075 SDValue Cast =
53076 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53077 return DAG.getTruncStore(
53078 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53079 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
53080 St->getAAInfo());
53081 }
53082 }
53083
53084 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53085 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
53086 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53087 Subtarget.hasCF() && St->isSimple()) {
53088 SDValue Cmov;
53089 if (StoredVal.getOpcode() == X86ISD::CMOV)
53090 Cmov = StoredVal;
53091 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53092 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53093 Cmov = StoredVal.getOperand(0);
53094 else
53095 return SDValue();
53096
// The store's chain must be a simple load of the same address, i.e. the
// cmov reads from the location being stored to.
53097 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53098 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53099 return SDValue();
53100
53101 bool InvertCC = false;
53102 SDValue V = SDValue(Ld, 0);
53103 if (V == Cmov.getOperand(1))
53104 InvertCC = true;
53105 else if (V != Cmov.getOperand(0))
53106 return SDValue();
53107
53108 SDVTList Tys = DAG.getVTList(MVT::Other);
53109 SDValue CC = Cmov.getOperand(2);
53110 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
// NOTE(review): the lines computing the inverted condition code are elided
// in this rendering — confirm against the full source.
53111 if (InvertCC)
53112 CC = DAG.getTargetConstant(
53115 dl, MVT::i8);
53116 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53117 Cmov.getOperand(3)};
53118 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53119 St->getMemOperand());
53120 }
53121
53122 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53123 // the FP state in cases where an emms may be missing.
53124 // A preferable solution to the general problem is to figure out the right
53125 // places to insert EMMS. This qualifies as a quick hack.
53126
53127 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53128 if (VT.getSizeInBits() != 64)
53129 return SDValue();
53130
53131 const Function &F = DAG.getMachineFunction().getFunction();
53132 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53133 bool F64IsLegal =
53134 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53135
53136 if (!F64IsLegal || Subtarget.is64Bit())
53137 return SDValue();
53138
53139 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53140 cast<LoadSDNode>(St->getValue())->isSimple() &&
53141 St->getChain().hasOneUse() && St->isSimple()) {
53142 auto *Ld = cast<LoadSDNode>(St->getValue());
53143
53144 if (!ISD::isNormalLoad(Ld))
53145 return SDValue();
53146
53147 // Avoid the transformation if there are multiple uses of the loaded value.
53148 if (!Ld->hasNUsesOfValue(1, 0))
53149 return SDValue();
53150
53151 SDLoc LdDL(Ld);
53152 SDLoc StDL(N);
53153 // Lower to a single movq load/store pair.
53154 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53155 Ld->getBasePtr(), Ld->getMemOperand());
53156
53157 // Make sure new load is placed in same chain order.
53158 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53159 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53160 St->getMemOperand());
53161 }
53162
53163 // This is similar to the above case, but here we handle a scalar 64-bit
53164 // integer store that is extracted from a vector on a 32-bit target.
53165 // If we have SSE2, then we can treat it like a floating-point double
53166 // to get past legalization. The execution dependencies fixup pass will
53167 // choose the optimal machine instruction for the store if this really is
53168 // an integer or v2f32 rather than an f64.
// NOTE(review): a continuation line of this condition (presumably matching
// an EXTRACT_VECTOR_ELT stored value) is elided in this rendering — confirm
// against the full source.
53169 if (VT == MVT::i64 &&
53171 SDValue OldExtract = St->getOperand(1);
53172 SDValue ExtOp0 = OldExtract.getOperand(0);
53173 unsigned VecSize = ExtOp0.getValueSizeInBits();
53174 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53175 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53176 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53177 BitCast, OldExtract.getOperand(1));
53178 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53179 St->getPointerInfo(), St->getOriginalAlign(),
53180 St->getMemOperand()->getFlags());
53181 }
53182
53183 return SDValue();
53184}
53185
53188 const X86Subtarget &Subtarget) {
53189 auto *St = cast<MemIntrinsicSDNode>(N);
53190
53191 SDValue StoredVal = N->getOperand(1);
53192 MVT VT = StoredVal.getSimpleValueType();
53193 EVT MemVT = St->getMemoryVT();
53194
53195 // Figure out which elements we demand.
53196 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53197 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53198
53199 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53200 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53201 if (N->getOpcode() != ISD::DELETED_NODE)
53202 DCI.AddToWorklist(N);
53203 return SDValue(N, 0);
53204 }
53205
53206 return SDValue();
53207}
53208
53209/// Return 'true' if this vector operation is "horizontal"
53210/// and return the operands for the horizontal operation in LHS and RHS. A
53211/// horizontal operation performs the binary operation on successive elements
53212/// of its first operand, then on successive elements of its second operand,
53213/// returning the resulting values in a vector. For example, if
53214/// A = < float a0, float a1, float a2, float a3 >
53215/// and
53216/// B = < float b0, float b1, float b2, float b3 >
53217/// then the result of doing a horizontal operation on A and B is
53218/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53219/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53220/// A horizontal-op B, for some already available A and B, and if so then LHS is
53221/// set to A, RHS to B, and the routine returns 'true'.
53222static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53223 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53224 bool IsCommutative,
53225 SmallVectorImpl<int> &PostShuffleMask,
53226 bool ForceHorizOp) {
53227 // If either operand is undef, bail out. The binop should be simplified.
53228 if (LHS.isUndef() || RHS.isUndef())
53229 return false;
53230
53231 // Look for the following pattern:
53232 // A = < float a0, float a1, float a2, float a3 >
53233 // B = < float b0, float b1, float b2, float b3 >
53234 // and
53235 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53236 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53237 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53238 // which is A horizontal-op B.
53239
53240 MVT VT = LHS.getSimpleValueType();
53241 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53242 "Unsupported vector type for horizontal add/sub");
53243 unsigned NumElts = VT.getVectorNumElements();
53244
53245 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53246 SmallVectorImpl<int> &ShuffleMask) {
53247 bool UseSubVector = false;
53248 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53249 Op.getOperand(0).getValueType().is256BitVector() &&
53250 llvm::isNullConstant(Op.getOperand(1))) {
53251 Op = Op.getOperand(0);
53252 UseSubVector = true;
53253 }
53255 SmallVector<int, 16> SrcMask, ScaledMask;
53257 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53258 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53259 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53260 })) {
53261 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53262 if (!UseSubVector && SrcOps.size() <= 2 &&
53263 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53264 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53265 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53266 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53267 }
53268 if (UseSubVector && SrcOps.size() == 1 &&
53269 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53270 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53271 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53272 ShuffleMask.assign(Mask.begin(), Mask.end());
53273 }
53274 }
53275 };
53276
53277 // View LHS in the form
53278 // LHS = VECTOR_SHUFFLE A, B, LMask
53279 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53280 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53281 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53282 SDValue A, B;
53284 GetShuffle(LHS, A, B, LMask);
53285
53286 // Likewise, view RHS in the form
53287 // RHS = VECTOR_SHUFFLE C, D, RMask
53288 SDValue C, D;
53290 GetShuffle(RHS, C, D, RMask);
53291
53292 // At least one of the operands should be a vector shuffle.
53293 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53294 if (NumShuffles == 0)
53295 return false;
53296
53297 if (LMask.empty()) {
53298 A = LHS;
53299 for (unsigned i = 0; i != NumElts; ++i)
53300 LMask.push_back(i);
53301 }
53302
53303 if (RMask.empty()) {
53304 C = RHS;
53305 for (unsigned i = 0; i != NumElts; ++i)
53306 RMask.push_back(i);
53307 }
53308
53309 // If we have an unary mask, ensure the other op is set to null.
53310 if (isUndefOrInRange(LMask, 0, NumElts))
53311 B = SDValue();
53312 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53313 A = SDValue();
53314
53315 if (isUndefOrInRange(RMask, 0, NumElts))
53316 D = SDValue();
53317 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53318 C = SDValue();
53319
53320 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53321 // RHS operands and shuffle mask.
53322 if (A != C) {
53323 std::swap(C, D);
53325 }
53326 // Check that the shuffles are both shuffling the same vectors.
53327 if (!(A == C && B == D))
53328 return false;
53329
53330 PostShuffleMask.clear();
53331 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53332
53333 // LHS and RHS are now:
53334 // LHS = shuffle A, B, LMask
53335 // RHS = shuffle A, B, RMask
53336 // Check that the masks correspond to performing a horizontal operation.
53337 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53338 // so we just repeat the inner loop if this is a 256-bit op.
53339 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53340 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53341 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53342 assert((NumEltsPer128BitChunk % 2 == 0) &&
53343 "Vector type should have an even number of elements in each lane");
53344 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53345 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53346 // Ignore undefined components.
53347 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53348 if (LIdx < 0 || RIdx < 0 ||
53349 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53350 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53351 continue;
53352
53353 // Check that successive odd/even elements are being operated on. If not,
53354 // this is not a horizontal operation.
53355 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53356 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53357 return false;
53358
53359 // Compute the post-shuffle mask index based on where the element
53360 // is stored in the HOP result, and where it needs to be moved to.
53361 int Base = LIdx & ~1u;
53362 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53363 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53364
53365 // The low half of the 128-bit result must choose from A.
53366 // The high half of the 128-bit result must choose from B,
53367 // unless B is undef. In that case, we are always choosing from A.
53368 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53369 Index += NumEltsPer64BitChunk;
53370 PostShuffleMask[i + j] = Index;
53371 }
53372 }
53373
53374 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53375 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53376
53377 bool IsIdentityPostShuffle =
53378 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53379 if (IsIdentityPostShuffle)
53380 PostShuffleMask.clear();
53381
53382 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53383 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53384 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53385 return false;
53386
53387 // If the source nodes are already used in HorizOps then always accept this.
53388 // Shuffle folding should merge these back together.
53389 auto FoundHorizUser = [&](SDNode *User) {
53390 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53391 };
53392 ForceHorizOp =
53393 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53394 llvm::any_of(NewRHS->users(), FoundHorizUser));
53395
53396 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53397 // shuffle the result.
53398 if (!ForceHorizOp &&
53399 !shouldUseHorizontalOp(NewLHS == NewRHS &&
53400 (NumShuffles < 2 || !IsIdentityPostShuffle),
53401 DAG, Subtarget))
53402 return false;
53403
53404 LHS = DAG.getBitcast(VT, NewLHS);
53405 RHS = DAG.getBitcast(VT, NewRHS);
53406 return true;
53407}
53408
53409// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
53411 const X86Subtarget &Subtarget) {
53412 EVT VT = N->getValueType(0);
53413 unsigned Opcode = N->getOpcode();
53414 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
53415 SmallVector<int, 8> PostShuffleMask;
53416
53417 auto MergableHorizOp = [N](unsigned HorizOpcode) {
53418 return N->hasOneUse() &&
53419 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
53420 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
53421 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
53422 };
53423
53424 switch (Opcode) {
53425 case ISD::FADD:
53426 case ISD::FSUB:
53427 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
53428 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
53429 SDValue LHS = N->getOperand(0);
53430 SDValue RHS = N->getOperand(1);
53431 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
53432 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53433 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53434 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
53435 if (!PostShuffleMask.empty())
53436 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53437 DAG.getUNDEF(VT), PostShuffleMask);
53438 return HorizBinOp;
53439 }
53440 }
53441 break;
53442 case ISD::ADD:
53443 case ISD::SUB:
53444 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
53445 VT == MVT::v16i16 || VT == MVT::v8i32)) {
53446 SDValue LHS = N->getOperand(0);
53447 SDValue RHS = N->getOperand(1);
53448 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
53449 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53450 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53451 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
53452 ArrayRef<SDValue> Ops) {
53453 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
53454 };
53455 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
53456 {LHS, RHS}, HOpBuilder);
53457 if (!PostShuffleMask.empty())
53458 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53459 DAG.getUNDEF(VT), PostShuffleMask);
53460 return HorizBinOp;
53461 }
53462 }
53463 break;
53464 }
53465
53466 return SDValue();
53467}
53468
53469// Try to combine the following nodes
53470// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
53471// <i32 -2147483648[float -0.000000e+00]> 0
53472// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
53473// <(load 4 from constant-pool)> t0, t29
53474// [t30: v16i32 = bitcast t27]
53475// t6: v16i32 = xor t7, t27[t30]
53476// t11: v16f32 = bitcast t6
53477// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
53478// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
53479// t22: v16f32 = bitcast t7
53480// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
53481// t24: v32f16 = bitcast t23
53483 const X86Subtarget &Subtarget) {
53484 EVT VT = N->getValueType(0);
53485 SDValue LHS = N->getOperand(0);
53486 SDValue RHS = N->getOperand(1);
53487 int CombineOpcode =
53488 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
53489 auto combineConjugation = [&](SDValue &r) {
53490 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
53491 SDValue XOR = LHS.getOperand(0);
53492 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
53493 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
53494 if (XORRHS.isConstant()) {
53495 APInt ConjugationInt32 = APInt(32, 0x80000000);
53496 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
53497 if ((XORRHS.getBitWidth() == 32 &&
53498 XORRHS.getConstant() == ConjugationInt32) ||
53499 (XORRHS.getBitWidth() == 64 &&
53500 XORRHS.getConstant() == ConjugationInt64)) {
53501 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
53502 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
53503 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
53504 r = DAG.getBitcast(VT, FCMulC);
53505 return true;
53506 }
53507 }
53508 }
53509 }
53510 return false;
53511 };
53512 SDValue Res;
53513 if (combineConjugation(Res))
53514 return Res;
53515 std::swap(LHS, RHS);
53516 if (combineConjugation(Res))
53517 return Res;
53518 return Res;
53519}
53520
53521// Try to combine the following nodes:
53522// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
53524 const X86Subtarget &Subtarget) {
53525 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
53527 Flags.hasAllowContract();
53528 };
53529
53530 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
53531 return DAG.getTarget().Options.NoSignedZerosFPMath ||
53532 Flags.hasNoSignedZeros();
53533 };
53534 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
53535 APInt AI = APInt(32, 0x80008000);
53536 KnownBits Bits = DAG.computeKnownBits(Op);
53537 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
53538 Bits.getConstant() == AI;
53539 };
53540
53541 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
53542 !AllowContract(N->getFlags()))
53543 return SDValue();
53544
53545 EVT VT = N->getValueType(0);
53546 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
53547 return SDValue();
53548
53549 SDValue LHS = N->getOperand(0);
53550 SDValue RHS = N->getOperand(1);
53551 bool IsConj;
53552 SDValue FAddOp1, MulOp0, MulOp1;
53553 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
53554 &IsVectorAllNegativeZero,
53555 &HasNoSignedZero](SDValue N) -> bool {
53556 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
53557 return false;
53558 SDValue Op0 = N.getOperand(0);
53559 unsigned Opcode = Op0.getOpcode();
53560 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
53561 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
53562 MulOp0 = Op0.getOperand(0);
53563 MulOp1 = Op0.getOperand(1);
53564 IsConj = Opcode == X86ISD::VFCMULC;
53565 return true;
53566 }
53567 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
53569 HasNoSignedZero(Op0->getFlags())) ||
53570 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
53571 MulOp0 = Op0.getOperand(0);
53572 MulOp1 = Op0.getOperand(1);
53573 IsConj = Opcode == X86ISD::VFCMADDC;
53574 return true;
53575 }
53576 }
53577 return false;
53578 };
53579
53580 if (GetCFmulFrom(LHS))
53581 FAddOp1 = RHS;
53582 else if (GetCFmulFrom(RHS))
53583 FAddOp1 = LHS;
53584 else
53585 return SDValue();
53586
53587 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
53588 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
53589 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
53590 // FIXME: How do we handle when fast math flags of FADD are different from
53591 // CFMUL's?
53592 SDValue CFmul =
53593 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
53594 return DAG.getBitcast(VT, CFmul);
53595}
53596
53597/// Do target-specific dag combines on floating-point adds/subs.
53599 const X86Subtarget &Subtarget) {
53600 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
53601 return HOp;
53602
53603 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
53604 return COp;
53605
53606 return SDValue();
53607}
53608
53610 const X86Subtarget &Subtarget) {
53611 EVT VT = N->getValueType(0);
53612 SDValue Src = N->getOperand(0);
53613 EVT SrcVT = Src.getValueType();
53614 SDLoc DL(N);
53615
53616 if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
53617 SrcVT != MVT::v2f32)
53618 return SDValue();
53619
53620 return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
53621 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
53622 DAG.getUNDEF(SrcVT)));
53623}
53624
53625/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
53626/// the codegen.
53627/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
53628/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
53629/// anything that is guaranteed to be transformed by DAGCombiner.
53631 const X86Subtarget &Subtarget,
53632 const SDLoc &DL) {
53633 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
53634 SDValue Src = N->getOperand(0);
53635 unsigned SrcOpcode = Src.getOpcode();
53636 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53637
53638 EVT VT = N->getValueType(0);
53639 EVT SrcVT = Src.getValueType();
53640
53641 auto IsFreeTruncation = [VT](SDValue Op) {
53642 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
53643
53644 // See if this has been extended from a smaller/equal size to
53645 // the truncation size, allowing a truncation to combine with the extend.
53646 unsigned Opcode = Op.getOpcode();
53647 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
53648 Opcode == ISD::ZERO_EXTEND) &&
53649 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
53650 return true;
53651
53652 // See if this is a single use constant which can be constant folded.
    // NOTE: We don't peek through bitcasts here because there is currently
    // no support for constant folding truncate+bitcast+vector_of_constants. So
    // we'll just end up with a truncate on both operands which will
    // get turned back into (truncate (binop)) causing an infinite loop.
53657 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
53658 };
53659
53660 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
53661 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
53662 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
53663 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
53664 };
53665
53666 // Don't combine if the operation has other uses.
53667 if (!Src.hasOneUse())
53668 return SDValue();
53669
53670 // Only support vector truncation for now.
53671 // TODO: i64 scalar math would benefit as well.
53672 if (!VT.isVector())
53673 return SDValue();
53674
53675 // In most cases its only worth pre-truncating if we're only facing the cost
53676 // of one truncation.
53677 // i.e. if one of the inputs will constant fold or the input is repeated.
53678 switch (SrcOpcode) {
53679 case ISD::MUL:
53680 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
53681 // better to truncate if we have the chance.
53682 if (SrcVT.getScalarType() == MVT::i64 &&
53683 TLI.isOperationLegal(SrcOpcode, VT) &&
53684 !TLI.isOperationLegal(SrcOpcode, SrcVT))
53685 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
53686 [[fallthrough]];
53687 case ISD::AND:
53688 case ISD::XOR:
53689 case ISD::OR:
53690 case ISD::ADD:
53691 case ISD::SUB: {
53692 SDValue Op0 = Src.getOperand(0);
53693 SDValue Op1 = Src.getOperand(1);
53694 if (TLI.isOperationLegal(SrcOpcode, VT) &&
53695 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
53696 return TruncateArithmetic(Op0, Op1);
53697 break;
53698 }
53699 }
53700
53701 return SDValue();
53702}
53703
53704// Try to form a MULHU or MULHS node by looking for
53705// (trunc (srl (mul ext, ext), 16))
53706// TODO: This is X86 specific because we want to be able to handle wide types
53707// before type legalization. But we can only do it if the vector will be
53708// legalized via widening/splitting. Type legalization can't handle promotion
53709// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
53710// combiner.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
                            SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  using namespace llvm::SDPatternMatch;

  // The vector MULHS/MULHU nodes formed below need SSE2 at minimum.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Only handle vXi16 types that are at least 128-bits unless they will be
  // widened.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
    return SDValue();

  // Input type should be at least vXi32.
  EVT InVT = Src.getValueType();
  if (InVT.getVectorElementType().getSizeInBits() < 32)
    return SDValue();

  // First instruction should be a right shift by 16 of a multiply.
  // The caller is truncating Src to vXi16, so "(mul x, y) >> 16" truncated
  // to i16 is exactly what a 16-bit high-multiply (MULHS/MULHU) computes,
  // provided the inputs fit in 16 bits (checked below).
  SDValue LHS, RHS;
  if (!sd_match(Src,
                m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_SpecificInt(16))))
    return SDValue();

  // Count leading sign/zero bits on both inputs - if there are enough then
  // truncation back to vXi16 will be cheap - either as a pack/shuffle
  // sequence or using AVX512 truncations. If the inputs are sext/zext then the
  // truncations may actually be free by peeking through to the ext source.
  auto IsSext = [&DAG](SDValue V) {
    // Value is representable as a signed 16-bit integer.
    return DAG.ComputeMaxSignificantBits(V) <= 16;
  };
  auto IsZext = [&DAG](SDValue V) {
    // Value is representable as an unsigned 16-bit integer.
    return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
  };

  // MULHS is only valid when both inputs are signed 16-bit values; MULHU when
  // both are unsigned 16-bit values. Otherwise the wide product's bits 16..31
  // don't match any 16-bit high-multiply and we must bail.
  bool IsSigned = IsSext(LHS) && IsSext(RHS);
  bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
  if (!IsSigned && !IsUnsigned)
    return SDValue();

  // Check if both inputs are extensions, which will be removed by truncation.
  auto isOpTruncateFree = [](SDValue Op) {
    if (Op.getOpcode() == ISD::SIGN_EXTEND ||
        Op.getOpcode() == ISD::ZERO_EXTEND)
      return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
    // Constant build-vectors truncate for free via constant folding.
    return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
  };
  bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);

  // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
  // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
  // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
  // will have to split anyway.
  unsigned InSizeInBits = InVT.getSizeInBits();
  if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
      !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
      (InSizeInBits % 16) == 0) {
    // Reinterpret the wide inputs as vXi16: each wide lane becomes i16 lanes
    // whose upper halves are known zero, so a vXi16 MULHU on the bitcast
    // values yields the desired high halves in the low i16 of each wide lane.
    EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                InVT.getSizeInBits() / 16);
    SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
                              DAG.getBitcast(BCVT, RHS));
    return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
  }

  // Truncate back to source type.
  LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
  RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);

  // Emit the 16-bit high-multiply matching the proven signedness.
  unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
  return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
53781
53782// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
53783// from one vector with signed bytes from another vector, adds together
53784// adjacent pairs of 16-bit products, and saturates the result before
53785// truncating to 16-bits.
53786//
53787// Which looks something like this:
53788// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
53789// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
53791 const X86Subtarget &Subtarget,
53792 const SDLoc &DL) {
53793 if (!VT.isVector() || !Subtarget.hasSSSE3())
53794 return SDValue();
53795
53796 unsigned NumElems = VT.getVectorNumElements();
53797 EVT ScalarVT = VT.getVectorElementType();
53798 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
53799 return SDValue();
53800
53801 SDValue SSatVal = detectSSatPattern(In, VT);
53802 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
53803 return SDValue();
53804
53805 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
53806 // of multiplies from even/odd elements.
53807 SDValue N0 = SSatVal.getOperand(0);
53808 SDValue N1 = SSatVal.getOperand(1);
53809
53810 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
53811 return SDValue();
53812
53813 SDValue N00 = N0.getOperand(0);
53814 SDValue N01 = N0.getOperand(1);
53815 SDValue N10 = N1.getOperand(0);
53816 SDValue N11 = N1.getOperand(1);
53817
53818 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
53819 // Canonicalize zero_extend to LHS.
53820 if (N01.getOpcode() == ISD::ZERO_EXTEND)
53821 std::swap(N00, N01);
53822 if (N11.getOpcode() == ISD::ZERO_EXTEND)
53823 std::swap(N10, N11);
53824
53825 // Ensure we have a zero_extend and a sign_extend.
53826 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
53827 N01.getOpcode() != ISD::SIGN_EXTEND ||
53828 N10.getOpcode() != ISD::ZERO_EXTEND ||
53829 N11.getOpcode() != ISD::SIGN_EXTEND)
53830 return SDValue();
53831
53832 // Peek through the extends.
53833 N00 = N00.getOperand(0);
53834 N01 = N01.getOperand(0);
53835 N10 = N10.getOperand(0);
53836 N11 = N11.getOperand(0);
53837
53838 // Ensure the extend is from vXi8.
53839 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
53840 N01.getValueType().getVectorElementType() != MVT::i8 ||
53841 N10.getValueType().getVectorElementType() != MVT::i8 ||
53842 N11.getValueType().getVectorElementType() != MVT::i8)
53843 return SDValue();
53844
53845 // All inputs should be build_vectors.
53846 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
53847 N01.getOpcode() != ISD::BUILD_VECTOR ||
53848 N10.getOpcode() != ISD::BUILD_VECTOR ||
53850 return SDValue();
53851
53852 // N00/N10 are zero extended. N01/N11 are sign extended.
53853
53854 // For each element, we need to ensure we have an odd element from one vector
53855 // multiplied by the odd element of another vector and the even element from
53856 // one of the same vectors being multiplied by the even element from the
53857 // other vector. So we need to make sure for each element i, this operator
53858 // is being performed:
53859 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
53860 SDValue ZExtIn, SExtIn;
53861 for (unsigned i = 0; i != NumElems; ++i) {
53862 SDValue N00Elt = N00.getOperand(i);
53863 SDValue N01Elt = N01.getOperand(i);
53864 SDValue N10Elt = N10.getOperand(i);
53865 SDValue N11Elt = N11.getOperand(i);
53866 // TODO: Be more tolerant to undefs.
53867 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53868 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53869 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53871 return SDValue();
53872 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
53873 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
53874 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
53875 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
53876 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
53877 return SDValue();
53878 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53879 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53880 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53881 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53882 // Add is commutative so indices can be reordered.
53883 if (IdxN00 > IdxN10) {
53884 std::swap(IdxN00, IdxN10);
53885 std::swap(IdxN01, IdxN11);
53886 }
53887 // N0 indices be the even element. N1 indices must be the next odd element.
53888 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
53889 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
53890 return SDValue();
53891 SDValue N00In = N00Elt.getOperand(0);
53892 SDValue N01In = N01Elt.getOperand(0);
53893 SDValue N10In = N10Elt.getOperand(0);
53894 SDValue N11In = N11Elt.getOperand(0);
53895 // First time we find an input capture it.
53896 if (!ZExtIn) {
53897 ZExtIn = N00In;
53898 SExtIn = N01In;
53899 }
53900 if (ZExtIn != N00In || SExtIn != N01In ||
53901 ZExtIn != N10In || SExtIn != N11In)
53902 return SDValue();
53903 }
53904
53905 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
53906 EVT ExtVT = Ext.getValueType();
53907 if (ExtVT.getVectorNumElements() != NumElems * 2) {
53908 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
53909 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
53910 DAG.getVectorIdxConstant(0, DL));
53911 }
53912 };
53913 ExtractVec(ZExtIn);
53914 ExtractVec(SExtIn);
53915
53916 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
53917 ArrayRef<SDValue> Ops) {
53918 // Shrink by adding truncate nodes and let DAGCombine fold with the
53919 // sources.
53920 EVT InVT = Ops[0].getValueType();
53921 assert(InVT.getScalarType() == MVT::i8 &&
53922 "Unexpected scalar element type");
53923 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
53924 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53925 InVT.getVectorNumElements() / 2);
53926 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
53927 };
53928 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
53929 PMADDBuilder);
53930}
53931
// Combine for ISD::TRUNCATE: tries, in order, pre-truncating arithmetic ops,
// VPMADDUBSW detection, truncate+saturation folds, PMULHUW/PMULHW folds, and
// the i32 <- bitcast(x86mmx) -> MMX_MOVD2W fold.
// NOTE(review): the listing is missing file line 53932 (first line of the
// signature) — presumably 'static SDValue combineTruncate(SDNode *N,
// SelectionDAG &DAG, ...)'; confirm against the full source.
53933                                const X86Subtarget &Subtarget) {
53934  EVT VT = N->getValueType(0);
53935  SDValue Src = N->getOperand(0);
53936  SDLoc DL(N);
53937
53938  // Attempt to pre-truncate inputs to arithmetic ops instead.
53939  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
53940    return V;
53941
53942  // Try to detect PMADD
53943  if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
53944    return PMAdd;
53945
53946  // Try to combine truncation with signed/unsigned saturation.
53947  if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
53948    return Val;
53949
53950  // Try to combine PMULHUW/PMULHW for vXi16.
53951  if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
53952    return V;
53953
53954  // The bitcast source is a direct mmx result.
53955  // Detect bitcasts between i32 to x86mmx
53956  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
53957    SDValue BCSrc = Src.getOperand(0);
53958    if (BCSrc.getValueType() == MVT::x86mmx)
53959      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
53960  }
53961
53962  return SDValue();
53963}
53964
// Combine for X86ISD::VTRUNC: fold truncations of saturation patterns into
// VTRUNCS/VTRUNCUS, then let SimplifyDemandedBits simplify the input.
// NOTE(review): file lines 53965-53966 (the signature) are missing from this
// listing — presumably 'static SDValue combineVTRUNC(SDNode *N, SelectionDAG
// &DAG, TargetLowering::DAGCombinerInfo &DCI)'; confirm against full source.
53967  EVT VT = N->getValueType(0);
53968  SDValue In = N->getOperand(0);
53969  SDLoc DL(N);
53970
53971  if (SDValue SSatVal = detectSSatPattern(In, VT))
53972    return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
53973  if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
53974    return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
53975
53976  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53977  APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
53978  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53979    return SDValue(N, 0);
53980
53981  return SDValue();
53982}
53983
53984/// Returns the negated value if the node \p N flips sign of FP value.
53985///
53986/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53987/// or FSUB(0, x)
53988/// AVX512F does not have FXOR, so FNEG is lowered as
53989/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
53990/// In this case we go though all bitcasts.
53991/// This also recognizes splat of a negated value and returns the splat of that
53992/// value.
53993static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53994  if (N->getOpcode() == ISD::FNEG)
53995    return N->getOperand(0);
53996
53997  // Don't recurse exponentially.
// NOTE(review): file line 53998 (the depth-limit condition, likely
// 'if (Depth > SelectionDAG::MaxRecursionDepth)') was dropped by extraction.
53999    return SDValue();
54000
54001  unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54002
// NOTE(review): file line 54003 is missing — presumably
// 'SDValue Op = peekThroughBitcasts(SDValue(N, 0));' given the uses of Op
// below; confirm against full source.
54004  EVT VT = Op->getValueType(0);
54005
54006  // Make sure the element size doesn't change.
54007  if (VT.getScalarSizeInBits() != ScalarSize)
54008    return SDValue();
54009
54010  unsigned Opc = Op.getOpcode();
54011  switch (Opc) {
54012  case ISD::VECTOR_SHUFFLE: {
54013    // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54014    // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54015    if (!Op.getOperand(1).isUndef())
54016      return SDValue();
54017    if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54018      if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54019        return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54020                                    cast<ShuffleVectorSDNode>(Op)->getMask());
54021    break;
54022  }
// NOTE(review): file line 54023 is missing — from the comment below this is
// the 'case ISD::INSERT_VECTOR_ELT: {' label; confirm against full source.
54024    // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54025    // -V, INDEX).
54026    SDValue InsVector = Op.getOperand(0);
54027    SDValue InsVal = Op.getOperand(1);
54028    if (!InsVector.isUndef())
54029      return SDValue();
54030    if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54031      if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54032        return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54033                           NegInsVal, Op.getOperand(2));
54034    break;
54035  }
54036  case ISD::FSUB:
54037  case ISD::XOR:
54038  case X86ISD::FXOR: {
54039    SDValue Op1 = Op.getOperand(1);
54040    SDValue Op0 = Op.getOperand(0);
54041
54042    // For XOR and FXOR, we want to check if constant
54043    // bits of Op1 are sign bit masks. For FSUB, we
54044    // have to check if constant bits of Op0 are sign
54045    // bit masks and hence we swap the operands.
54046    if (Opc == ISD::FSUB)
54047      std::swap(Op0, Op1);
54048
54049    APInt UndefElts;
54050    SmallVector<APInt, 16> EltBits;
54051    // Extract constant bits and see if they are all
54052    // sign bit masks. Ignore the undef elements.
54053    if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54054                                      /* AllowWholeUndefs */ true,
54055                                      /* AllowPartialUndefs */ false)) {
54056      for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54057        if (!UndefElts[I] && !EltBits[I].isSignMask())
54058          return SDValue();
54059
54060      // Only allow bitcast from correctly-sized constant.
54061      Op0 = peekThroughBitcasts(Op0);
54062      if (Op0.getScalarValueSizeInBits() == ScalarSize)
54063        return Op0;
54064    }
54065    break;
54066  } // case
54067  } // switch
54068
54069  return SDValue();
54070}
54071
54072static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54073 bool NegRes) {
54074 if (NegMul) {
54075 switch (Opcode) {
54076 // clang-format off
54077 default: llvm_unreachable("Unexpected opcode");
54078 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54079 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54080 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54081 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54082 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54083 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54084 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54085 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54086 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54087 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54088 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54089 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54090 // clang-format on
54091 }
54092 }
54093
54094 if (NegAcc) {
54095 switch (Opcode) {
54096 // clang-format off
54097 default: llvm_unreachable("Unexpected opcode");
54098 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54099 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54100 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54101 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54102 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54103 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54104 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54105 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54106 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54107 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54108 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54109 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54110 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54111 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54112 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54113 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54114 // clang-format on
54115 }
54116 }
54117
54118 if (NegRes) {
54119 switch (Opcode) {
54120 // For accuracy reason, we never combine fneg and fma under strict FP.
54121 // clang-format off
54122 default: llvm_unreachable("Unexpected opcode");
54123 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54124 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54125 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54126 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54127 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54128 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54129 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54130 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54131 // clang-format on
54132 }
54133 }
54134
54135 return Opcode;
54136}
54137
54138/// Do target-specific dag combines on floating point negations.
// NOTE(review): file lines 54139-54140 (the signature) are missing from this
// listing — presumably 'static SDValue combineFneg(SDNode *N, SelectionDAG
// &DAG, TargetLowering::DAGCombinerInfo &DCI, ...)'; confirm against source.
54141                           const X86Subtarget &Subtarget) {
54142  EVT OrigVT = N->getValueType(0);
54143  SDValue Arg = isFNEG(DAG, N);
54144  if (!Arg)
54145    return SDValue();
54146
54147  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54148  EVT VT = Arg.getValueType();
54149  EVT SVT = VT.getScalarType();
54150  SDLoc DL(N);
54151
54152  // Let legalize expand this if it isn't a legal type yet.
54153  if (!TLI.isTypeLegal(VT))
54154    return SDValue();
54155
54156  // If we're negating a FMUL node on a target with FMA, then we can avoid the
54157  // use of a constant by performing (-0 - A*B) instead.
54158  // FIXME: Check rounding control flags as well once it becomes available.
54159  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54160      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54161    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54162    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54163                                  Arg.getOperand(1), Zero);
54164    return DAG.getBitcast(OrigVT, NewNode);
54165  }
54166
54167  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54168  bool LegalOperations = !DCI.isBeforeLegalizeOps();
54169  if (SDValue NegArg =
54170          TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54171    return DAG.getBitcast(OrigVT, NegArg);
54172
54173  return SDValue();
54174}
54175
// X86 override of TargetLowering::getNegatedExpression: recognizes removable
// fneg patterns and negates FMA-family nodes by flipping operand signs.
// NOTE(review): the listing is missing file lines 54176 (signature start,
// presumably 'SDValue X86TargetLowering::getNegatedExpression(SDValue Op,
// SelectionDAG &DAG,'), 54179 ('NegatibleCost &Cost,'), 54183 (likely
// 'Cost = NegatibleCost::Cheaper;'), 54202 (part of the FMA guard condition),
// 54212 (the NewOps SmallVector declaration) and 54223 (the
// 'NegatibleCost::Neutral' arm of the Cost assignment). Confirm against the
// full source before relying on the details here.
54177                                                bool LegalOperations,
54178                                                bool ForCodeSize,
54180                                                unsigned Depth) const {
54181  // fneg patterns are removable even if they have multiple uses.
54182  if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54184    return DAG.getBitcast(Op.getValueType(), Arg);
54185  }
54186
54187  EVT VT = Op.getValueType();
54188  EVT SVT = VT.getScalarType();
54189  unsigned Opc = Op.getOpcode();
54190  SDNodeFlags Flags = Op.getNode()->getFlags();
54191  switch (Opc) {
54192  case ISD::FMA:
54193  case X86ISD::FMSUB:
54194  case X86ISD::FNMADD:
54195  case X86ISD::FNMSUB:
54196  case X86ISD::FMADD_RND:
54197  case X86ISD::FMSUB_RND:
54198  case X86ISD::FNMADD_RND:
54199  case X86ISD::FNMSUB_RND: {
54200    if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54201        !(SVT == MVT::f32 || SVT == MVT::f64) ||
54203      break;
54204
54205    // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54206    // if it may have signed zeros.
54207    if (!Flags.hasNoSignedZeros())
54208      break;
54209
54210    // This is always negatible for free but we might be able to remove some
54211    // extra operand negations as well.
54213    for (int i = 0; i != 3; ++i)
54214      NewOps[i] = getCheaperNegatedExpression(
54215          Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54216
54217    bool NegA = !!NewOps[0];
54218    bool NegB = !!NewOps[1];
54219    bool NegC = !!NewOps[2];
54220    unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54221
54222    Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54224
54225    // Fill in the non-negated ops with the original values.
54226    for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54227      if (!NewOps[i])
54228        NewOps[i] = Op.getOperand(i);
54229    return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54230  }
54231  case X86ISD::FRCP:
54232    if (SDValue NegOp0 =
54233            getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54234                                 ForCodeSize, Cost, Depth + 1))
54235      return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54236    break;
54237  }
54238
54239  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54240                                              ForCodeSize, Cost, Depth);
54241}
54242
// Lower X86 FP logic ops (FOR/FXOR/FAND/FANDN) on vectors to the equivalent
// integer logic ops via bitcasts, when SSE2 integer vector types are
// available.
// NOTE(review): file line 54243 (signature start, presumably 'static SDValue
// lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,') is missing from this
// listing; confirm against the full source.
54244                                 const X86Subtarget &Subtarget) {
54245  MVT VT = N->getSimpleValueType(0);
54246  // If we have integer vector types available, use the integer opcodes.
54247  if (!VT.isVector() || !Subtarget.hasSSE2())
54248    return SDValue();
54249
54250  SDLoc dl(N);
54251
54252  unsigned IntBits = VT.getScalarSizeInBits();
54253  MVT IntSVT = MVT::getIntegerVT(IntBits);
54254  MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
54255
54256  SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54257  SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54258  unsigned IntOpcode;
54259  switch (N->getOpcode()) {
54260  // clang-format off
54261  default: llvm_unreachable("Unexpected FP logic op");
54262  case X86ISD::FOR:   IntOpcode = ISD::OR; break;
54263  case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
54264  case X86ISD::FAND:  IntOpcode = ISD::AND; break;
54265  case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54266  // clang-format on
54267  }
54268  SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54269  return DAG.getBitcast(VT, IntOp);
54270}
54271
54272
54273/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
// NOTE(review): file line 54274 (the signature, presumably 'static SDValue
// foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {') is missing from this
// listing; confirm against the full source.
54275  if (N->getOpcode() != ISD::XOR)
54276    return SDValue();
54277
54278  SDValue LHS = N->getOperand(0);
54279  if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54280    return SDValue();
54281
// NOTE(review): file line 54282 is missing — presumably 'X86::CondCode NewCC
// = X86::GetOppositeBranchCondition(' given the continuation below; confirm.
54283      X86::CondCode(LHS->getConstantOperandVal(0)));
54284  SDLoc DL(N);
54285  return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54286}
54287
// Fold (bitwidth-1) ^/- ctlz_zero_undef(x) into a single BSR. Skipped when
// the target has fast LZCNT.
// NOTE(review): file line 54288 (signature start, presumably 'static SDValue
// combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,') is
// missing from this listing; confirm against the full source.
54289                                const X86Subtarget &Subtarget) {
54290  assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54291         "Invalid opcode for combing with CTLZ");
54292  if (Subtarget.hasFastLZCNT())
54293    return SDValue();
54294
54295  EVT VT = N->getValueType(0);
54296  if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54297      (VT != MVT::i64 || !Subtarget.is64Bit()))
54298    return SDValue();
54299
54300  SDValue N0 = N->getOperand(0);
54301  SDValue N1 = N->getOperand(1);
54302
54303  if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
// NOTE(review): file line 54304 is missing — presumably the second half of
// this condition, 'N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)'; confirm.
54305    return SDValue();
54306
54307  SDValue OpCTLZ;
54308  SDValue OpSizeTM1;
54309
54310  if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54311    OpCTLZ = N1;
54312    OpSizeTM1 = N0;
54313  } else if (N->getOpcode() == ISD::SUB) {
54314    // SUB is not commutative: only xor may have the ctlz on the left.
54315    return SDValue();
54316  } else {
54317    OpCTLZ = N0;
54318    OpSizeTM1 = N1;
54319  }
54320
54321  if (!OpCTLZ.hasOneUse())
54322    return SDValue();
54323  auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54324  if (!C)
54325    return SDValue();
54326
54327  if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54328    return SDValue();
54329  EVT OpVT = VT;
54330  SDValue Op = OpCTLZ.getOperand(0);
54331  if (VT == MVT::i8) {
54332    // Zero extend to i32 since there is not an i8 bsr.
54333    OpVT = MVT::i32;
54334    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
54335  }
54336
54337  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
54338  Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
54339  if (VT == MVT::i8)
54340    Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
54341
54342  return Op;
54343}
54343
// Combine for ISD::XOR: SSE1-only FXOR conversion, MOVMSK/shift/PACK bit-op
// folds, CTLZ fold, setcc inversion, i1-vector NOT folds, and
// zext/trunc-of-xor constant folds; falls through to combineFneg.
// NOTE(review): file lines 54344-54345 (the signature, presumably 'static
// SDValue combineXOR(SDNode *N, SelectionDAG &DAG,
// TargetLowering::DAGCombinerInfo &DCI,') are missing; confirm against source.
54346                          const X86Subtarget &Subtarget) {
54347  SDValue N0 = N->getOperand(0);
54348  SDValue N1 = N->getOperand(1);
54349  EVT VT = N->getValueType(0);
54350  SDLoc DL(N);
54351
54352  // If this is SSE1 only convert to FXOR to avoid scalarization.
54353  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
54354    return DAG.getBitcast(MVT::v4i32,
54355                          DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
54356                                      DAG.getBitcast(MVT::v4f32, N0),
54357                                      DAG.getBitcast(MVT::v4f32, N1)));
54358  }
54359
54360  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
54361    return Cmp;
54362
54363  if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
54364    return R;
54365
54366  if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
54367    return R;
54368
54369  if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
54370    return R;
54371
54372  if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
54373                                                 DAG, DCI, Subtarget))
54374    return FPLogic;
54375
54376  if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
54377    return R;
54378
54379  if (DCI.isBeforeLegalizeOps())
54380    return SDValue();
54381
54382  if (SDValue SetCC = foldXor1SetCC(N, DAG))
54383    return SetCC;
54384
54385  if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
54386    return R;
54387
54388  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
54389    return RV;
54390
54391  // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
54392  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54393  if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
54394      N0.getOperand(0).getValueType().isVector() &&
54395      N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54396      TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
54397    return DAG.getBitcast(
54398        VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
54399  }
54400
54401  // Handle AVX512 mask widening.
54402  // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
54403  if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
54404      VT.getVectorElementType() == MVT::i1 &&
// NOTE(review): file lines 54405 and 54408 are missing — presumably the
// INSERT_SUBVECTOR opcode/undef checks and the first getNode arguments of the
// rebuilt insert_subvector; confirm against the full source.
54406      TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
54407    return DAG.getNode(
54409        DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
54410        N0.getOperand(2));
54411  }
54412
54413  // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
54414  // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
54415  // TODO: Under what circumstances could this be performed in DAGCombine?
54416  if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
54417      N0.getOperand(0).getOpcode() == N->getOpcode()) {
54418    SDValue TruncExtSrc = N0.getOperand(0);
54419    auto *N1C = dyn_cast<ConstantSDNode>(N1);
54420    auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
54421    if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
54422      SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
54423      SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
54424      return DAG.getNode(ISD::XOR, DL, VT, LHS,
54425                         DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
54426    }
54427  }
54428
54429  if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
54430    return R;
54431
54432  return combineFneg(N, DAG, DCI, Subtarget);
54433}
54434
54437 const X86Subtarget &Subtarget) {
54438 SDValue N0 = N->getOperand(0);
54439 EVT VT = N->getValueType(0);
54440
54441 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
54442 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
54443 SDValue Src = N0.getOperand(0);
54444 EVT SrcVT = Src.getValueType();
54445 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
54446 (DCI.isBeforeLegalize() ||
54447 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
54448 Subtarget.hasSSSE3()) {
54449 unsigned NumElts = SrcVT.getVectorNumElements();
54450 SmallVector<int, 32> ReverseMask(NumElts);
54451 for (unsigned I = 0; I != NumElts; ++I)
54452 ReverseMask[I] = (NumElts - 1) - I;
54453 SDValue Rev =
54454 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
54455 return DAG.getBitcast(VT, Rev);
54456 }
54457 }
54458
54459 return SDValue();
54460}
54461
54462// Various combines to try to convert to avgceilu.
// NOTE(review): file lines 54463-54464 (the signature, presumably 'static
// SDValue combineAVG(SDNode *N, SelectionDAG &DAG, ...)') are missing from
// this listing; confirm against the full source.
54465                          const X86Subtarget &Subtarget) {
54466  unsigned Opcode = N->getOpcode();
54467  SDValue N0 = N->getOperand(0);
54468  SDValue N1 = N->getOperand(1);
54469  EVT VT = N->getValueType(0);
54470  EVT SVT = VT.getScalarType();
54471  SDLoc DL(N);
54472
54473  // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
54474  // Only useful on vXi8 which doesn't have good SRA handling.
54475  if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
// NOTE(review): file line 54476 is missing — presumably the SignBit constant
// definition (an APInt sign mask for i8) used by SignMask below; confirm.
54477    SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
54478    N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
54479    N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
54480    return DAG.getNode(ISD::XOR, DL, VT,
54481                       DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
54482  }
54483
54484  return SDValue();
54485}
54486
54489 const X86Subtarget &Subtarget) {
54490 EVT VT = N->getValueType(0);
54491 unsigned NumBits = VT.getSizeInBits();
54492
54493 // TODO - Constant Folding.
54494
54495 // Simplify the inputs.
54496 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54497 APInt DemandedMask(APInt::getAllOnes(NumBits));
54498 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54499 return SDValue(N, 0);
54500
54501 return SDValue();
54502}
54503
// Returns true if V is a scalar FP zero or a build-vector of all zeros.
// NOTE(review): file line 54504 (the signature, presumably 'static bool
// isNullFPScalarOrVectorConst(SDValue V) {') is missing from this listing.
54505  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
54506}
54507
54508/// If a value is a scalar FP zero or a vector FP zero (potentially including
54509/// undefined elements), return a zero constant that may be used to fold away
54510/// that value. In the case of a vector, the returned constant will not contain
54511/// undefined elements even if the input parameter does. This makes it suitable
54512/// to be used as a replacement operand with operations (eg, bitwise-and) where
54513/// an undef should not propagate.
// NOTE(review): file lines 54514 (signature, presumably 'static SDValue
// getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,') and 54516 (the
// guard, presumably 'if (!isNullFPScalarOrVectorConst(V))') are missing from
// this listing; confirm against the full source.
54515                                        const X86Subtarget &Subtarget) {
54517    return SDValue();
54518
54519  if (V.getValueType().isVector())
54520    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
54521
54522  return V;
54523}
54524
// Fold FAND with an FXOR-by-all-ones operand into FANDN (scalar f32/f64 and
// SSE1-only v4f32; vector types are handled elsewhere).
// NOTE(review): file line 54525 (signature start, presumably 'static SDValue
// combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,') is missing from this
// listing; confirm against the full source.
54526                                        const X86Subtarget &Subtarget) {
54527  SDValue N0 = N->getOperand(0);
54528  SDValue N1 = N->getOperand(1);
54529  EVT VT = N->getValueType(0);
54530  SDLoc DL(N);
54531
54532  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
54533  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
54534        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
54535        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
54536    return SDValue();
54537
54538  auto isAllOnesConstantFP = [](SDValue V) {
54539    if (V.getSimpleValueType().isVector())
54540      return ISD::isBuildVectorAllOnes(V.getNode());
54541    auto *C = dyn_cast<ConstantFPSDNode>(V);
54542    return C && C->getConstantFPValue()->isAllOnesValue();
54543  };
54544
54545  // fand (fxor X, -1), Y --> fandn X, Y
54546  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
54547    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
54548
54549  // fand X, (fxor Y, -1) --> fandn Y, X
54550  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
54551    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
54552
54553  return SDValue();
54554}
54555
54556/// Do target-specific dag combines on X86ISD::FAND nodes.
// NOTE(review): file line 54557 (the signature, presumably 'static SDValue
// combineFAnd(SDNode *N, SelectionDAG &DAG,') is missing from this listing.
54558                           const X86Subtarget &Subtarget) {
54559  // FAND(0.0, x) -> 0.0
54560  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
54561    return V;
54562
54563  // FAND(x, 0.0) -> 0.0
54564  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54565    return V;
54566
54567  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
54568    return V;
54569
54570  return lowerX86FPLogicOp(N, DAG, Subtarget);
54571}
54572
54573/// Do target-specific dag combines on X86ISD::FANDN nodes.
// NOTE(review): file line 54574 (the signature, presumably 'static SDValue
// combineFAndn(SDNode *N, SelectionDAG &DAG,') is missing from this listing.
54575                            const X86Subtarget &Subtarget) {
54576  // FANDN(0.0, x) -> x
54577  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54578    return N->getOperand(1);
54579
54580  // FANDN(x, 0.0) -> 0.0
54581  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54582    return V;
54583
54584  return lowerX86FPLogicOp(N, DAG, Subtarget);
54585}
54586
54587/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
// NOTE(review): file lines 54588-54589 (the signature, presumably 'static
// SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
// TargetLowering::DAGCombinerInfo &DCI,') are missing from this listing.
54590                          const X86Subtarget &Subtarget) {
54591  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
54592
54593  // F[X]OR(0.0, x) -> x
54594  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54595    return N->getOperand(1);
54596
54597  // F[X]OR(x, 0.0) -> x
54598  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
54599    return N->getOperand(0);
54600
54601  if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
54602    return NewVal;
54603
54604  return lowerX86FPLogicOp(N, DAG, Subtarget);
54605}
54606
54606/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
// NOTE(review): file lines 54608 (the signature, presumably 'static SDValue
// combineFMinFMax(SDNode *N, SelectionDAG &DAG) {') and 54613 (the second
// half of the condition, presumably the NoSignedZerosFPMath check per the
// comment below) are missing from this listing; confirm against full source.
54609  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
54610
54611  // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
54612  if (!DAG.getTarget().Options.NoNaNsFPMath ||
54614    return SDValue();
54615
54616  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
54617  // into FMINC and FMAXC, which are Commutative operations.
54618  unsigned NewOp = 0;
54619  switch (N->getOpcode()) {
54620    default: llvm_unreachable("unknown opcode");
54621    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
54622    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
54623  }
54624
54625  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
54626                     N->getOperand(0), N->getOperand(1));
54627}
54628
// Lower ISD::FMAXNUM/FMINNUM to X86 FMAX/FMIN, inserting a NaN-select when
// NaN inputs must be respected (the SSE min/max semantics pass through Op0
// when either input is NaN).
// NOTE(review): file line 54629 (signature start, presumably 'static SDValue
// combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,') is missing from this
// listing; confirm against the full source.
54630                                     const X86Subtarget &Subtarget) {
54631  EVT VT = N->getValueType(0);
54632  if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
54633    return SDValue();
54634
54635  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54636
54637  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
54638        (Subtarget.hasSSE2() && VT == MVT::f64) ||
54639        (Subtarget.hasFP16() && VT == MVT::f16) ||
54640        (VT.isVector() && TLI.isTypeLegal(VT))))
54641    return SDValue();
54642
54643  SDValue Op0 = N->getOperand(0);
54644  SDValue Op1 = N->getOperand(1);
54645  SDLoc DL(N);
54646  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
54647
54648  // If we don't have to respect NaN inputs, this is a direct translation to x86
54649  // min/max instructions.
54650  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
54651    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54652
54653  // If one of the operands is known non-NaN use the native min/max instructions
54654  // with the non-NaN input as second operand.
54655  if (DAG.isKnownNeverNaN(Op1))
54656    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54657  if (DAG.isKnownNeverNaN(Op0))
54658    return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
54659
54660  // If we have to respect NaN inputs, this takes at least 3 instructions.
54661  // Favor a library call when operating on a scalar and minimizing code size.
54662  if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
54663    return SDValue();
54664
54665  EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
54666                                         VT);
54667
54668  // There are 4 possibilities involving NaN inputs, and these are the required
54669  // outputs:
54670  //                   Op1
54671  //               Num     NaN
54672  //            ----------------
54673  //       Num  |  Max  |  Op0 |
54674  // Op0        ----------------
54675  //       NaN  |  Op1  |  NaN |
54676  //            ----------------
54677  //
54678  // The SSE FP max/min instructions were not designed for this case, but rather
54679  // to implement:
54680  //   Min = Op1 < Op0 ? Op1 : Op0
54681  //   Max = Op1 > Op0 ? Op1 : Op0
54682  //
54683  // So they always return Op0 if either input is a NaN. However, we can still
54684  // use those instructions for fmaxnum by selecting away a NaN input.
54685
54686  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
54687  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
54688  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
54689
54690  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
54691  // are NaN, the NaN value of Op1 is the result.
54692  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
54693}
54694
// Combine for X86 int-to-fp conversion nodes: simplify demanded vector
// elements, and narrow a full vector load into a vzload when only the low
// elements are converted.
// NOTE(review): file lines 54695-54696 (the signature, presumably 'static
// SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
// TargetLowering::DAGCombinerInfo &DCI)') are missing from this listing.
54697  EVT VT = N->getValueType(0);
54698  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54699
54700  APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54701  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54702    return SDValue(N, 0);
54703
54704  // Convert a full vector load into vzload when not all bits are needed.
54705  SDValue In = N->getOperand(0);
54706  MVT InVT = In.getSimpleValueType();
54707  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
54708      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
54709    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54710    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
54711    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
54712    MVT MemVT = MVT::getIntegerVT(NumBits);
54713    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
54714    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
54715      SDLoc dl(N);
54716      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
54717                                    DAG.getBitcast(InVT, VZLoad));
54718      DCI.CombineTo(N, Convert);
54719      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
// NOTE(review): file line 54720 is missing — presumably
// 'DCI.recursivelyDeleteUnusedNodes(LN);' (mirrors the sibling combine
// below); confirm against the full source.
54721      return SDValue(N, 0);
54722    }
54723  }
54724
54725  return SDValue();
54726}
54727
// Combine for X86 fp-to-int (CVTP2I/CVTTP2I, incl. strict variants): narrow a
// full vector load into a vzload when only the low elements are converted.
// NOTE(review): file lines 54728-54730 (the signature and any leading locals,
// presumably 'static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG
// &DAG, TargetLowering::DAGCombinerInfo &DCI)' plus the TSI definition used
// below) are missing from this listing; confirm against the full source.
54731  bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
54732  EVT VT = N->getValueType(0);
54733
54734  // Convert a full vector load into vzload when not all bits are needed.
54735  SDValue In = N->getOperand(IsStrict ? 1 : 0);
54736  MVT InVT = In.getSimpleValueType();
54737  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
54738      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
54739    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54740    LoadSDNode *LN = cast<LoadSDNode>(In);
54741    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
54742    MVT MemVT = MVT::getFloatingPointVT(NumBits);
54743    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
54744    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
54745      SDLoc dl(N);
54746      if (IsStrict) {
54747        SDValue Convert =
54748            DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
54749                        {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
54750        DCI.CombineTo(N, Convert, Convert.getValue(1));
54751      } else {
54752        SDValue Convert =
54753            DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
54754        DCI.CombineTo(N, Convert);
54755      }
54756      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
// NOTE(review): file line 54757 is missing — presumably
// 'DCI.recursivelyDeleteUnusedNodes(LN);'; confirm against the full source.
54758      return SDValue(N, 0);
54759    }
54760  }
54761
54762  return SDValue();
54763}
54764
54765/// Do target-specific dag combines on X86ISD::ANDNP nodes.
54768 const X86Subtarget &Subtarget) {
54769 SDValue N0 = N->getOperand(0);
54770 SDValue N1 = N->getOperand(1);
54771 MVT VT = N->getSimpleValueType(0);
54772 int NumElts = VT.getVectorNumElements();
54773 unsigned EltSizeInBits = VT.getScalarSizeInBits();
54774 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54775 SDLoc DL(N);
54776
54777 // ANDNP(undef, x) -> 0
54778 // ANDNP(x, undef) -> 0
54779 if (N0.isUndef() || N1.isUndef())
54780 return DAG.getConstant(0, DL, VT);
54781
54782 // ANDNP(0, x) -> x
54784 return N1;
54785
54786 // ANDNP(x, 0) -> 0
54788 return DAG.getConstant(0, DL, VT);
54789
54790 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
54792 return DAG.getNOT(DL, N0, VT);
54793
54794 // Turn ANDNP back to AND if input is inverted.
54795 if (SDValue Not = IsNOT(N0, DAG))
54796 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
54797
54798 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
54799 // to make use of predicated selects.
54800 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
54801 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
54802 SDValue Src = N0.getOperand(0);
54803 EVT SrcVT = Src.getValueType();
54804 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
54805 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
54806 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
54807 getZeroVector(VT, Subtarget, DAG, DL));
54808 }
54809
54810 // Constant Folding
54811 APInt Undefs0, Undefs1;
54812 SmallVector<APInt> EltBits0, EltBits1;
54813 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
54814 /*AllowWholeUndefs*/ true,
54815 /*AllowPartialUndefs*/ true)) {
54816 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
54817 /*AllowWholeUndefs*/ true,
54818 /*AllowPartialUndefs*/ true)) {
54819 SmallVector<APInt> ResultBits;
54820 for (int I = 0; I != NumElts; ++I)
54821 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
54822 return getConstVector(ResultBits, VT, DAG, DL);
54823 }
54824
54825 // Constant fold NOT(N0) to allow us to use AND.
54826 // Ensure this is only performed if we can confirm that the bitcasted source
54827 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
54828 if (N0->hasOneUse()) {
54830 if (BC0.getOpcode() != ISD::BITCAST) {
54831 for (APInt &Elt : EltBits0)
54832 Elt = ~Elt;
54833 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
54834 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
54835 }
54836 }
54837 }
54838
54839 // Attempt to recursively combine a bitmask ANDNP with shuffles.
54840 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
54841 SDValue Op(N, 0);
54842 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54843 return Res;
54844
54845 // If either operand is a constant mask, then only the elements that aren't
54846 // zero are actually demanded by the other operand.
54847 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
54848 APInt UndefElts;
54849 SmallVector<APInt> EltBits;
54850 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
54851 APInt DemandedElts = APInt::getAllOnes(NumElts);
54852 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
54853 EltBits)) {
54854 DemandedBits.clearAllBits();
54855 DemandedElts.clearAllBits();
54856 for (int I = 0; I != NumElts; ++I) {
54857 if (UndefElts[I]) {
54858 // We can't assume an undef src element gives an undef dst - the
54859 // other src might be zero.
54860 DemandedBits.setAllBits();
54861 DemandedElts.setBit(I);
54862 } else if ((Invert && !EltBits[I].isAllOnes()) ||
54863 (!Invert && !EltBits[I].isZero())) {
54864 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
54865 DemandedElts.setBit(I);
54866 }
54867 }
54868 }
54869 return std::make_pair(DemandedBits, DemandedElts);
54870 };
54871 APInt Bits0, Elts0;
54872 APInt Bits1, Elts1;
54873 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
54874 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
54875
54876 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
54877 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
54878 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
54879 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
54880 if (N->getOpcode() != ISD::DELETED_NODE)
54881 DCI.AddToWorklist(N);
54882 return SDValue(N, 0);
54883 }
54884 }
54885
54886 // Folds for better commutativity:
54887 if (N1->hasOneUse()) {
54888 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
54889 if (SDValue Not = IsNOT(N1, DAG))
54890 return DAG.getNOT(
54891 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
54892
54893 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
54894 // Zero out elements by setting the PSHUFB mask value to 0xFF.
54895 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
54897 if (BC1.getOpcode() == X86ISD::PSHUFB) {
54898 EVT ShufVT = BC1.getValueType();
54899 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
54900 DAG.getBitcast(ShufVT, N0));
54901 SDValue NewShuf =
54902 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
54903 return DAG.getBitcast(VT, NewShuf);
54904 }
54905 }
54906 }
54907
54908 return SDValue();
54909}
54910
// Combine for X86ISD::BT: the BT instruction only reads the low
// log2(BitWidth) bits of its bit-index operand, so try to simplify the index
// based on the narrower demanded-bits mask.
// NOTE(review): this rendering dropped the function signature (orig. lines
// 54911-54912) and the declaration of DemandedMask (orig. line 54917,
// presumably an APInt of the low Log2_32(BitWidth) bits) — confirm against
// the checked-in X86ISelLowering.cpp.
 54913  SDValue N1 = N->getOperand(1);
 54914
 54915  // BT ignores high bits in the bit index operand.
 54916  unsigned BitWidth = N1.getValueSizeInBits();
 54918  if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
    // The index operand was simplified in place: re-queue N (unless it was
    // deleted during simplification) and signal success by returning it.
 54919    if (N->getOpcode() != ISD::DELETED_NODE)
 54920      DCI.AddToWorklist(N);
 54921    return SDValue(N, 0);
 54922  }
 54923
 54924  return SDValue();
 54925}
54926
// Combine for (STRICT_)CVTPH2PS (half -> single conversion): a v8i16 ->
// v4f32 conversion only reads the low 4 source elements, so shrink the
// demanded elements, and narrow a one-use full vector load of the source
// into a 64-bit vzload.
// NOTE(review): this rendering dropped the function signature (orig. lines
// 54927-54928) and one statement after the chain replacement (orig. line
// 54958, likely dead-node cleanup for LN) — confirm against the checked-in
// source.
 54929  bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
 54930  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
 54931
 54932  if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
 54933    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    // Only elements 0..3 of the v8i16 source feed the v4f32 result.
 54934    APInt DemandedElts = APInt::getLowBitsSet(8, 4);
 54935    if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
 54936      if (N->getOpcode() != ISD::DELETED_NODE)
 54937        DCI.AddToWorklist(N);
 54938      return SDValue(N, 0);
 54939    }
 54940
 54941    // Convert a full vector load into vzload when not all bits are needed.
 54942    if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
 54943      LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
 54944      if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
 54945        SDLoc dl(N);
        // Strict FP nodes carry a chain operand/result that must be
        // preserved through the rebuild.
 54946        if (IsStrict) {
 54947          SDValue Convert = DAG.getNode(
 54948              N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
 54949              {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
 54950          DCI.CombineTo(N, Convert, Convert.getValue(1));
 54951        } else {
 54952          SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
 54953                                        DAG.getBitcast(MVT::v8i16, VZLoad));
 54954          DCI.CombineTo(N, Convert);
 54955        }
 54956
        // Redirect users of the old load's chain to the new vzload's chain.
 54957        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
 54959        return SDValue(N, 0);
 54960      }
 54961    }
 54962  }
 54963
 54964  return SDValue();
 54965}
54966
 54967// Try to combine sext_in_reg of a cmov of constants by extending the constants.
// NOTE(review): the function signature (orig. line 54968, presumably
// `static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)`) was
// dropped by this rendering — confirm against the checked-in source.
 54969  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
 54970
 54971  EVT DstVT = N->getValueType(0);
 54972
 54973  SDValue N0 = N->getOperand(0);
 54974  SDValue N1 = N->getOperand(1);
 54975  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
 54976
  // Only handle sign-extension from i8/i16 sub-widths.
 54977  if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
 54978    return SDValue();
 54979
 54980  // Look through single use any_extends / truncs.
 54981  SDValue IntermediateBitwidthOp;
 54982  if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
 54983      N0.hasOneUse()) {
 54984    IntermediateBitwidthOp = N0;
 54985    N0 = N0.getOperand(0);
 54986  }
 54987
 54988  // See if we have a single use cmov.
 54989  if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
 54990    return SDValue();
 54991
 54992  SDValue CMovOp0 = N0.getOperand(0);
 54993  SDValue CMovOp1 = N0.getOperand(1);
 54994
 54995  // Make sure both operands are constants.
 54996  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
 54997      !isa<ConstantSDNode>(CMovOp1.getNode()))
 54998    return SDValue();
 54999
 55000  SDLoc DL(N);
 55001
 55002  // If we looked through an any_extend/trunc above, add one to the constants.
 55003  if (IntermediateBitwidthOp) {
 55004    unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
 55005    CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
 55006    CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
 55007  }
 55008
  // Constant-fold the sext_in_reg into each CMOV arm; this is free for
  // constants, so the extension disappears entirely.
 55009  CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
 55010  CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
 55011
 55012  EVT CMovVT = DstVT;
 55013  // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
 55014  if (DstVT == MVT::i16) {
 55015    CMovVT = MVT::i32;
 55016    CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
 55017    CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
 55018  }
 55019
  // Rebuild the CMOV with the pre-extended constants, reusing the original
  // condition code and EFLAGS operands.
 55020  SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
 55021                             N0.getOperand(2), N0.getOperand(3));
 55022
 55023  if (CMovVT != DstVT)
 55024    CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
 55025
 55026  return CMov;
 55027}
55028
// Combine for ISD::SIGN_EXTEND_INREG.
// NOTE(review): the first line of the signature (orig. line 55029,
// presumably `static SDValue combineSignExtendInReg(SDNode *N,
// SelectionDAG &DAG,`) was dropped by this rendering — confirm.
 55030                                        const X86Subtarget &Subtarget) {
 55031  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
 55032
  // First try folding into a cmov of constants (see combineSextInRegCmov).
 55033  if (SDValue V = combineSextInRegCmov(N, DAG))
 55034    return V;
 55035
 55036  EVT VT = N->getValueType(0);
 55037  SDValue N0 = N->getOperand(0);
 55038  SDValue N1 = N->getOperand(1);
 55039  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
 55040  SDLoc dl(N);
 55041
 55042  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
 55043  // both SSE and AVX2 since there is no sign-extended shift right
 55044  // operation on a vector with 64-bit elements.
 55045  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
 55046  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
 55047  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
 55048                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
 55049    SDValue N00 = N0.getOperand(0);
 55050
 55051    // EXTLOAD has a better solution on AVX2,
 55052    // it may be replaced with X86ISD::VSEXT node.
 55053    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
 55054      if (!ISD::isNormalLoad(N00.getNode()))
 55055        return SDValue();
 55056
 55057    // Attempt to promote any comparison mask ops before moving the
 55058    // SIGN_EXTEND_INREG in the way.
 55059    if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
 55060      return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
 55061
    // Do the sext_in_reg at v4i32 width, then a plain (cheap) sext to v4i64.
 55062    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
 55063      SDValue Tmp =
 55064          DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
 55065      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
 55066    }
 55067  }
 55068  return SDValue();
 55069}
55070
 55071/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
 55072/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
 55073/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
 55074/// opportunities to combine math ops, use an LEA, or use a complex addressing
 55075/// mode. This can eliminate extend, add, and shift instructions.
// NOTE(review): the first line of the signature (orig. line 55076,
// presumably `static SDValue promoteExtBeforeAdd(SDNode *Ext,
// SelectionDAG &DAG,`) was dropped by this rendering — confirm.
 55077                                       const X86Subtarget &Subtarget) {
 55078  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
 55079      Ext->getOpcode() != ISD::ZERO_EXTEND)
 55080    return SDValue();
 55081
 55082  // TODO: This should be valid for other integer types.
 55083  EVT VT = Ext->getValueType(0);
 55084  if (VT != MVT::i64)
 55085    return SDValue();
 55086
 55087  SDValue Add = Ext->getOperand(0);
 55088  if (Add.getOpcode() != ISD::ADD)
 55089    return SDValue();
 55090
 55091  SDValue AddOp0 = Add.getOperand(0);
 55092  SDValue AddOp1 = Add.getOperand(1);
 55093  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  // Take overflow flags from the node, or prove them via value tracking.
 55094  bool NSW = Add->getFlags().hasNoSignedWrap();
 55095  bool NUW = Add->getFlags().hasNoUnsignedWrap();
 55096  NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
 55097  NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
 55098
 55099  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
 55100  // into the 'zext'
 55101  if ((Sext && !NSW) || (!Sext && !NUW))
 55102    return SDValue();
 55103
 55104  // Having a constant operand to the 'add' ensures that we are not increasing
 55105  // the instruction count because the constant is extended for free below.
 55106  // A constant operand can also become the displacement field of an LEA.
 55107  auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
 55108  if (!AddOp1C)
 55109    return SDValue();
 55110
 55111  // Don't make the 'add' bigger if there's no hope of combining it with some
 55112  // other 'add' or 'shl' instruction.
 55113  // TODO: It may be profitable to generate simpler LEA instructions in place
 55114  // of single 'add' instructions, but the cost model for selecting an LEA
 55115  // currently has a high threshold.
 55116  bool HasLEAPotential = false;
 55117  for (auto *User : Ext->users()) {
 55118    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
 55119      HasLEAPotential = true;
 55120      break;
 55121    }
 55122  }
 55123  if (!HasLEAPotential)
 55124    return SDValue();
 55125
 55126  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
 55127  int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
 55128  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
 55129  SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
 55130
 55131  // The wider add is guaranteed to not wrap because both operands are
 55132  // sign-extended.
 55133  SDNodeFlags Flags;
 55134  Flags.setNoSignedWrap(NSW);
 55135  Flags.setNoUnsignedWrap(NUW);
 55136  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
 55137}
55138
 55139// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
 55140// operands and the result of CMOV is not used anywhere else - promote CMOV
 55141// itself instead of promoting its result. This could be beneficial, because:
 55142// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
 55143// (or more) pseudo-CMOVs only when they go one-after-another and
 55144// getting rid of result extension code after CMOV will help that.
 55145// 2) Promotion of constant CMOV arguments is free, hence the
 55146// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
 55147// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
 55148// promotion is also good in terms of code-size.
 55149// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
 55150// promotion).
// NOTE(review): the function signature (orig. line 55151, presumably
// `static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {`)
// was dropped by this rendering — confirm against the checked-in source.
 55152  SDValue CMovN = Extend->getOperand(0);
 55153  if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
 55154    return SDValue();
 55155
 55156  EVT TargetVT = Extend->getValueType(0);
 55157  unsigned ExtendOpcode = Extend->getOpcode();
 55158  SDLoc DL(Extend);
 55159
 55160  EVT VT = CMovN.getValueType();
 55161  SDValue CMovOp0 = CMovN.getOperand(0);
 55162  SDValue CMovOp1 = CMovN.getOperand(1);
 55163
  // Both CMOV arms must be constants so the extension is free.
 55164  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
 55165      !isa<ConstantSDNode>(CMovOp1.getNode()))
 55166    return SDValue();
 55167
 55168  // Only extend to i32 or i64.
 55169  if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
 55170    return SDValue();
 55171
 55172  // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32
 55173  // are free.
 55174  if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
 55175    return SDValue();
 55176
 55177  // If this a zero extend to i64, we should only extend to i32 and use a free
 55178  // zero extend to finish.
 55179  EVT ExtendVT = TargetVT;
 55180  if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
 55181    ExtendVT = MVT::i32;
 55182
 55183  CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
 55184  CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
 55185
  // Rebuild the CMOV at the wider type, keeping condition code and EFLAGS.
 55186  SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
 55187                            CMovN.getOperand(2), CMovN.getOperand(3));
 55188
 55189  // Finish extending if needed.
 55190  if (ExtendVT != TargetVT)
 55191    Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
 55192
 55193  return Res;
 55194}
55195
 55196// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
 55197// result type.
// NOTE(review): this rendering dropped the first line of the signature
// (orig. line 55198, presumably `static SDValue combineExtSetcc(SDNode *N,
// SelectionDAG &DAG,`) and the guard condition at orig. line 55225 (the
// check that CC is handled by PCMPEQ/PCMPGT, per the comment above it) —
// confirm against the checked-in source.
 55199                                const X86Subtarget &Subtarget) {
 55200  SDValue N0 = N->getOperand(0);
 55201  EVT VT = N->getValueType(0);
 55202  SDLoc dl(N);
 55203
 55204  // Only do this combine with AVX512 for vector extends.
 55205  if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
 55206    return SDValue();
 55207
 55208  // Only combine legal element types.
 55209  EVT SVT = VT.getVectorElementType();
 55210  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
 55211      SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
 55212    return SDValue();
 55213
 55214  // We don't have CMPP Instruction for vxf16
 55215  if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
 55216    return SDValue();
 55217  // We can only do this if the vector size in 256 bits or less.
 55218  unsigned Size = VT.getSizeInBits();
 55219  if (Size > 256 && Subtarget.useAVX512Regs())
 55220    return SDValue();
 55221
 55222  // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
 55223  // that's the only integer compares with we have.
 55224  ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
 55226    return SDValue();
 55227
 55228  // Only do this combine if the extension will be fully consumed by the setcc.
 55229  EVT N00VT = N0.getOperand(0).getValueType();
 55230  EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
 55231  if (Size != MatchingVecType.getSizeInBits())
 55232    return SDValue();
 55233
  // Re-issue the compare directly at the wide result type.
 55234  SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
 55235
  // A zext must produce 0/1, so mask the all-ones compare result back down.
 55236  if (N->getOpcode() == ISD::ZERO_EXTEND)
 55237    Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
 55238
 55239  return Res;
 55240}
55241
// Combine for ISD::SIGN_EXTEND.
// NOTE(review): this rendering dropped the first signature lines (orig.
// 55242-55243, presumably `static SDValue combineSext(SDNode *N,
// SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI,`), the condition
// at orig. 55251 (per the comment, a check that N0 is an i8
// X86ISD::SETCC_CARRY), and the condition at orig. 55283 — confirm against
// the checked-in source.
 55244                           const X86Subtarget &Subtarget) {
 55245  SDValue N0 = N->getOperand(0);
 55246  EVT VT = N->getValueType(0);
 55247  SDLoc DL(N);
 55248
 55249  // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
 55250  if (!DCI.isBeforeLegalizeOps() &&
 55252    SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
 55253                                N0->getOperand(1));
 55254    bool ReplaceOtherUses = !N0.hasOneUse();
 55255    DCI.CombineTo(N, Setcc);
 55256    // Replace other uses with a truncate of the widened setcc_carry.
 55257    if (ReplaceOtherUses) {
 55258      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
 55259                                  N0.getValueType(), Setcc);
 55260      DCI.CombineTo(N0.getNode(), Trunc);
 55261    }
 55262
 55263    return SDValue(N, 0);
 55264  }
 55265
 55266  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
 55267    return NewCMov;
 55268
 55269  if (!DCI.isBeforeLegalizeOps())
 55270    return SDValue();
 55271
 55272  if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
 55273    return V;
 55274
 55275  if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
 55276                                                 DAG, DCI, Subtarget))
 55277    return V;
 55278
 55279  if (VT.isVector()) {
 55280    if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
 55281      return R;
 55282
    // (Condition at orig. line 55283 dropped by this rendering.)
 55284      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
 55285  }
 55286
 55287  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
 55288    return NewAdd;
 55289
 55290  return SDValue();
 55291}
55292
 55293// Inverting a constant vector is profitable if it can be eliminated and the
 55294// inverted vector is already present in DAG. Otherwise, it will be loaded
 55295// anyway.
 55296//
 55297// We determine which of the values can be completely eliminated and invert it.
 55298// If both are eliminable, select a vector with the first negative element.
// NOTE(review): this rendering dropped the signature and start of the
// assert (orig. lines 55299-55300, presumably
// `static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {`
// plus the assert opener) and the Ops declaration (orig. line 55312,
// presumably a SmallVector<SDValue>) — confirm against the checked-in
// source.
 55301         "ConstantFP build vector expected");
 55302  // Check if we can eliminate V. We assume if a value is only used in FMAs, we
 55303  // can eliminate it. Since this function is invoked for each FMA with this
 55304  // vector.
 55305  auto IsNotFMA = [](SDNode *User) {
 55306    return User->getOpcode() != ISD::FMA &&
 55307           User->getOpcode() != ISD::STRICT_FMA;
 55308  };
 55309  if (llvm::any_of(V->users(), IsNotFMA))
 55310    return SDValue();
 55311
 55313  EVT VT = V.getValueType();
 55314  EVT EltVT = VT.getVectorElementType();
  // Build the element-wise negation of V (undefs stay undef).
 55315  for (const SDValue &Op : V->op_values()) {
 55316    if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
 55317      Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
 55318    } else {
 55319      assert(Op.isUndef());
 55320      Ops.push_back(DAG.getUNDEF(EltVT));
 55321    }
 55322  }
 55323
  // Only profitable if the negated vector already exists in the DAG.
 55324  SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
 55325  if (!NV)
 55326    return SDValue();
 55327
 55328  // If an inverted version cannot be eliminated, choose it instead of the
 55329  // original version.
 55330  if (llvm::any_of(NV->users(), IsNotFMA))
 55331    return SDValue(NV, 0);
 55332
 55333  // If the inverted version also can be eliminated, we have to consistently
 55334  // prefer one of the values. We prefer a constant with a negative value on
 55335  // the first place.
 55336  // N.B. We need to skip undefs that may precede a value.
 55337  for (const SDValue &Op : V->op_values()) {
 55338    if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
 55339      if (Cst->isNegative())
 55340        return SDValue();
 55341      break;
 55342    }
 55343  }
 55344  return SDValue(NV, 0);
 55345}
55346
// Combine for FMA nodes: fold negations of the A/B/C operands into the FMA
// opcode itself (e.g. FMA -> FMSUB/FNMADD/FNMSUB), and split FMA into
// FMUL+FADD when the target has no FMA support and reassociation is allowed.
// NOTE(review): this rendering dropped the first signature lines (orig.
// 55347-55348, presumably `static SDValue combineFMA(SDNode *N,
// SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI,`) and the
// declaration at orig. 55352 (presumably the X86SelectionDAGInfo reference
// used as `TSI` below) — confirm against the checked-in source.
 55349                          const X86Subtarget &Subtarget) {
 55350  SDLoc dl(N);
 55351  EVT VT = N->getValueType(0);
 55353  bool IsStrict = N->isTargetOpcode()
 55354                      ? TSI.isTargetStrictFPOpcode(N->getOpcode())
 55355                      : N->isStrictFPOpcode();
 55356
 55357  // Let legalize expand this if it isn't a legal type yet.
 55358  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 55359  if (!TLI.isTypeLegal(VT))
 55360    return SDValue();
 55361
  // Strict FP nodes carry the chain as operand 0, shifting A/B/C by one.
 55362  SDValue A = N->getOperand(IsStrict ? 1 : 0);
 55363  SDValue B = N->getOperand(IsStrict ? 2 : 1);
 55364  SDValue C = N->getOperand(IsStrict ? 3 : 2);
 55365
 55366  // If the operation allows fast-math and the target does not support FMA,
 55367  // split this into mul+add to avoid libcall(s).
 55368  SDNodeFlags Flags = N->getFlags();
 55369  if (!IsStrict && Flags.hasAllowReassociation() &&
 55370      TLI.isOperationExpand(ISD::FMA, VT)) {
 55371    SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
 55372    return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
 55373  }
 55374
  // Bail out unless the scalar type is FMA-capable on this subtarget.
 55375  EVT ScalarVT = VT.getScalarType();
 55376  if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
 55377       !Subtarget.hasAnyFMA()) &&
 55378      !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
 55379      !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
 55380    return SDValue();
 55381
  // Replace V with its negation when that negation is cheaper (or free),
  // returning true so the caller can flip the FMA opcode accordingly.
 55382  auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
 55383    bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
 55384    bool LegalOperations = !DCI.isBeforeLegalizeOps();
 55385    if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
 55386                                                       CodeSize)) {
 55387      V = NegV;
 55388      return true;
 55389    }
 55390    // Look through extract_vector_elts. If it comes from an FNEG, create a
 55391    // new extract from the FNEG input.
 55392    if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
 55393        isNullConstant(V.getOperand(1))) {
 55394      SDValue Vec = V.getOperand(0);
 55395      if (SDValue NegV = TLI.getCheaperNegatedExpression(
 55396              Vec, DAG, LegalOperations, CodeSize)) {
 55397        V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
 55398                        NegV, V.getOperand(1));
 55399        return true;
 55400      }
 55401    }
 55402    // Lookup if there is an inverted version of constant vector V in DAG.
 55403    if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
 55404      if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
 55405        V = NegV;
 55406        return true;
 55407      }
 55408    }
 55409    return false;
 55410  };
 55411
 55412  // Do not convert the passthru input of scalar intrinsics.
 55413  // FIXME: We could allow negations of the lower element only.
 55414  bool NegA = invertIfNegative(A);
 55415  bool NegB = invertIfNegative(B);
 55416  bool NegC = invertIfNegative(C);
 55417
 55418  if (!NegA && !NegB && !NegC)
 55419    return SDValue();
 55420
  // NegA != NegB flips the multiply sign; NegC flips the addend sign.
 55421  unsigned NewOpcode =
 55422      negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
 55423
 55424  // Propagate fast-math-flags to new FMA node.
 55425  SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
 55426  if (IsStrict) {
 55427    assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
 55428    return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
 55429                       {N->getOperand(0), A, B, C});
 55430  } else {
 55431    if (N->getNumOperands() == 4)
 55432      return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
 55433    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
 55434  }
 55435}
55436
 55437// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
 55438// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
// NOTE(review): the function signature (orig. lines 55439-55440, presumably
// `static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
// TargetLowering::DAGCombinerInfo &DCI) {`) was dropped by this rendering —
// confirm against the checked-in source.
 55441  SDLoc dl(N);
 55442  EVT VT = N->getValueType(0);
 55443  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 55444  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
 55445  bool LegalOperations = !DCI.isBeforeLegalizeOps();
 55446
 55447  SDValue N2 = N->getOperand(2);
 55448
  // Only fold when negating the addend operand is cheaper than keeping it.
 55449  SDValue NegN2 =
 55450      TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
 55451  if (!NegN2)
 55452    return SDValue();
  // Flip only the addend sign: FMADDSUB <-> FMSUBADD.
 55453  unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
 55454
  // A 4th operand (e.g. a rounding-mode immediate) is carried through as-is.
 55455  if (N->getNumOperands() == 4)
 55456    return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
 55457                       NegN2, N->getOperand(3));
 55458  return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
 55459                     NegN2);
 55460}
55461
// Combine for ISD::ZERO_EXTEND / ISD::ANY_EXTEND.
// NOTE(review): this rendering dropped the first signature lines (orig.
// 55462-55463, presumably `static SDValue combineZext(SDNode *N,
// SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI,`), the condition
// at orig. 55472 (per the comment, a check that N0 is an i8
// X86ISD::SETCC_CARRY), and the condition at orig. 55510 — confirm against
// the checked-in source.
 55464                           const X86Subtarget &Subtarget) {
 55465  SDLoc dl(N);
 55466  SDValue N0 = N->getOperand(0);
 55467  EVT VT = N->getValueType(0);
 55468
 55469  // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
 55470  // FIXME: Is this needed? We don't seem to have any tests for it.
 55471  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
 55473    SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
 55474                                N0->getOperand(1));
 55475    bool ReplaceOtherUses = !N0.hasOneUse();
 55476    DCI.CombineTo(N, Setcc);
 55477    // Replace other uses with a truncate of the widened setcc_carry.
 55478    if (ReplaceOtherUses) {
 55479      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
 55480                                  N0.getValueType(), Setcc);
 55481      DCI.CombineTo(N0.getNode(), Trunc);
 55482    }
 55483
 55484    return SDValue(N, 0);
 55485  }
 55486
 55487  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
 55488    return NewCMov;
 55489
 55490  if (DCI.isBeforeLegalizeOps())
 55491    if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
 55492      return V;
 55493
 55494  if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
 55495                                                 DAG, DCI, Subtarget))
 55496    return V;
 55497
 55498  if (VT.isVector())
 55499    if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
 55500      return R;
 55501
 55502  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
 55503    return NewAdd;
 55504
 55505  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
 55506    return R;
 55507
 55508  // TODO: Combine with any target/faux shuffle.
  // zext(packus(x,y)) -> concat(x,y) when the packed high halves are known
  // zero, so the pack was effectively a truncate of already-narrow values.
 55509  if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
 55511    SDValue N00 = N0.getOperand(0);
 55512    SDValue N01 = N0.getOperand(1);
 55513    unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
 55514    APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
 55515    if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
 55516        (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
 55517      return concatSubVectors(N00, N01, DAG, dl);
 55518    }
 55519  }
 55520
 55521  return SDValue();
 55522}
55523
 55524/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
 55525/// pre-promote its result type since vXi1 vectors don't get promoted
 55526/// during type legalization.
// NOTE(review): this rendering dropped the first signature lines (orig.
// 55527-55528, presumably `static SDValue truncateAVX512SetCCNoBWI(EVT VT,
// EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC,`) — confirm against
// the checked-in source.
 55529                                        const SDLoc &DL, SelectionDAG &DAG,
 55530                                        const X86Subtarget &Subtarget) {
 55531  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
 55532      VT.getVectorElementType() == MVT::i1 &&
 55533      (OpVT.getVectorElementType() == MVT::i8 ||
 55534       OpVT.getVectorElementType() == MVT::i16)) {
    // Compare at the operand's element width, then truncate to the vXi1
    // result the caller asked for.
 55535    SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
 55536    return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
 55537  }
 55538  return SDValue();
 55539}
55540
55543 const X86Subtarget &Subtarget) {
55544 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
55545 const SDValue LHS = N->getOperand(0);
55546 const SDValue RHS = N->getOperand(1);
55547 EVT VT = N->getValueType(0);
55548 EVT OpVT = LHS.getValueType();
55549 SDLoc DL(N);
55550
55551 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
55552 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
55553 Subtarget))
55554 return V;
55555
55556 if (VT == MVT::i1) {
55557 X86::CondCode X86CC;
55558 if (SDValue V =
55559 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
55560 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
55561 }
55562
55563 if (OpVT.isScalarInteger()) {
55564 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
55565 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
55566 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
55567 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
55568 if (N0.getOperand(0) == N1)
55569 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
55570 N0.getOperand(1));
55571 if (N0.getOperand(1) == N1)
55572 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
55573 N0.getOperand(0));
55574 }
55575 return SDValue();
55576 };
55577 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
55578 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55579 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
55580 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55581
55582 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
55583 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
55584 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
55585 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
55586 if (N0.getOperand(0) == N1)
55587 return DAG.getNode(ISD::AND, DL, OpVT, N1,
55588 DAG.getNOT(DL, N0.getOperand(1), OpVT));
55589 if (N0.getOperand(1) == N1)
55590 return DAG.getNode(ISD::AND, DL, OpVT, N1,
55591 DAG.getNOT(DL, N0.getOperand(0), OpVT));
55592 }
55593 return SDValue();
55594 };
55595 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
55596 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55597 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
55598 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55599
55600 // cmpeq(trunc(x),C) --> cmpeq(x,C)
55601 // cmpne(trunc(x),C) --> cmpne(x,C)
55602 // iff x upper bits are zero.
55603 if (LHS.getOpcode() == ISD::TRUNCATE &&
55604 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
55605 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
55606 EVT SrcVT = LHS.getOperand(0).getValueType();
55608 OpVT.getScalarSizeInBits());
55609 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55610 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
55611 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
55612 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
55613 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
55614 }
55615
55616 // With C as a power of 2 and C != 0 and C != INT_MIN:
55617 // icmp eq Abs(X) C ->
55618 // (icmp eq A, C) | (icmp eq A, -C)
55619 // icmp ne Abs(X) C ->
55620 // (icmp ne A, C) & (icmp ne A, -C)
55621 // Both of these patterns can be better optimized in
55622 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
55623 // integers which is checked above.
55624 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
55625 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
55626 const APInt &CInt = C->getAPIntValue();
55627 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
55628 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
55629 SDValue BaseOp = LHS.getOperand(0);
55630 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
55631 SDValue SETCC1 = DAG.getSetCC(
55632 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
55633 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
55634 SETCC0, SETCC1);
55635 }
55636 }
55637 }
55638 }
55639 }
55640
55641 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
55643 // Using temporaries to avoid messing up operand ordering for later
55644 // transformations if this doesn't work.
55645 SDValue Op0 = LHS;
55646 SDValue Op1 = RHS;
55647 ISD::CondCode TmpCC = CC;
55648 // Put build_vector on the right.
55649 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
55650 std::swap(Op0, Op1);
55651 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
55652 }
55653
55654 bool IsSEXT0 =
55655 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
55656 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
55657 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
55658
55659 if (IsSEXT0 && IsVZero1) {
55660 assert(VT == Op0.getOperand(0).getValueType() &&
55661 "Unexpected operand type");
55662 if (TmpCC == ISD::SETGT)
55663 return DAG.getConstant(0, DL, VT);
55664 if (TmpCC == ISD::SETLE)
55665 return DAG.getConstant(1, DL, VT);
55666 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
55667 return DAG.getNOT(DL, Op0.getOperand(0), VT);
55668
55669 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
55670 "Unexpected condition code!");
55671 return Op0.getOperand(0);
55672 }
55673 }
55674
55675 // Try and make unsigned vector comparison signed. On pre AVX512 targets there
55676 // only are unsigned comparisons (`PCMPGT`) and on AVX512 its often better to
55677 // use `PCMPGT` if the result is mean to stay in a vector (and if its going to
55678 // a mask, there are signed AVX512 comparisons).
55679 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
55680 bool CanMakeSigned = false;
55682 KnownBits CmpKnown =
55684 // If we know LHS/RHS share the same sign bit at each element we can
55685 // make this signed.
55686 // NOTE: `computeKnownBits` on a vector type aggregates common bits
55687 // across all lanes. So a pattern where the sign varies from lane to
55688 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
55689 // missed. We could get around this by demanding each lane
55690 // independently, but this isn't the most important optimization and
55691 // that may eat into compile time.
55692 CanMakeSigned =
55693 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
55694 }
55695 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
55696 SDValue LHSOut = LHS;
55697 SDValue RHSOut = RHS;
55698 ISD::CondCode NewCC = CC;
55699 switch (CC) {
55700 case ISD::SETGE:
55701 case ISD::SETUGE:
55702 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
55703 /*NSW*/ true))
55704 LHSOut = NewLHS;
55705 else if (SDValue NewRHS = incDecVectorConstant(
55706 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
55707 RHSOut = NewRHS;
55708 else
55709 break;
55710
55711 [[fallthrough]];
55712 case ISD::SETUGT:
55713 NewCC = ISD::SETGT;
55714 break;
55715
55716 case ISD::SETLE:
55717 case ISD::SETULE:
55718 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
55719 /*NSW*/ true))
55720 LHSOut = NewLHS;
55721 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
55722 /*NSW*/ true))
55723 RHSOut = NewRHS;
55724 else
55725 break;
55726
55727 [[fallthrough]];
55728 case ISD::SETULT:
55729 // Will be swapped to SETGT in LowerVSETCC*.
55730 NewCC = ISD::SETLT;
55731 break;
55732 default:
55733 break;
55734 }
55735 if (NewCC != CC) {
55736 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
55737 NewCC, DL, DAG, Subtarget))
55738 return R;
55739 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
55740 }
55741 }
55742 }
55743
55744 if (SDValue R =
55745 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
55746 return R;
55747
55748 // In the middle end transforms:
55749 // `(or (icmp eq X, C), (icmp eq X, C+1))`
55750 // -> `(icmp ult (add x, -C), 2)`
55751 // Likewise inverted cases with `ugt`.
55752 //
55753 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
55754 // in worse codegen. So, undo the middle-end transform and go back to `(or
55755 // (icmp eq), (icmp eq))` form.
55756 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
55757 // the xmm approach.
55758 //
55759 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
55760 // ne))` as it doesn't end up instruction positive.
55761 // TODO: We might want to do this for avx512 as well if we `sext` the result.
55762 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
55763 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
55764 !Subtarget.hasAVX512() &&
55765 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
55766 Subtarget.hasAVX2()) &&
55767 LHS.hasOneUse()) {
55768
55769 APInt CmpC;
55770 SDValue AddC = LHS.getOperand(1);
55771 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
55773 // See which form we have depending on the constant/condition.
55774 SDValue C0 = SDValue();
55775 SDValue C1 = SDValue();
55776
55777 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
55778 // we will end up generating an additional constant. Keeping in the
55780 // current form has a slight latency cost, but it's probably worth saving a
55780 // constant.
55783 // Pass
55784 }
55785 // Normal Cases
55786 else if ((CC == ISD::SETULT && CmpC == 2) ||
55787 (CC == ISD::SETULE && CmpC == 1)) {
55788 // These will constant fold.
55789 C0 = DAG.getNegative(AddC, DL, OpVT);
55790 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
55791 DAG.getAllOnesConstant(DL, OpVT));
55792 }
55793 // Inverted Cases
55794 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
55795 (CC == ISD::SETUGE && (-CmpC) == 2)) {
55796 // These will constant fold.
55797 C0 = DAG.getNOT(DL, AddC, OpVT);
55798 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
55799 DAG.getAllOnesConstant(DL, OpVT));
55800 }
55801 if (C0 && C1) {
55802 SDValue NewLHS =
55803 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
55804 SDValue NewRHS =
55805 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
55806 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
55807 }
55808 }
55809 }
55810
55811 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
55812 // to avoid scalarization via legalization because v4i32 is not a legal type.
55813 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
55814 LHS.getValueType() == MVT::v4f32)
55815 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
55816
55817 // X pred 0.0 --> X pred -X
55818 // If the negation of X already exists, use it in the comparison. This removes
55819 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
55820 // instructions in patterns with a 'select' node.
55822 SDVTList FNegVT = DAG.getVTList(OpVT);
55823 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
55824 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
55825 }
55826
55827 return SDValue();
55828}
55829
55832 const X86Subtarget &Subtarget) {
55833 SDValue Src = N->getOperand(0);
55834 MVT SrcVT = Src.getSimpleValueType();
55835 MVT VT = N->getSimpleValueType(0);
55836 unsigned NumBits = VT.getScalarSizeInBits();
55837 unsigned NumElts = SrcVT.getVectorNumElements();
55838 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
55839 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
55840
55841 // Perform constant folding.
55842 APInt UndefElts;
55843 SmallVector<APInt, 32> EltBits;
55844 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
55845 /*AllowWholeUndefs*/ true,
55846 /*AllowPartialUndefs*/ true)) {
55847 APInt Imm(32, 0);
55848 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
55849 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
55850 Imm.setBit(Idx);
55851
55852 return DAG.getConstant(Imm, SDLoc(N), VT);
55853 }
55854
55855 // Look through int->fp bitcasts that don't change the element width.
55856 unsigned EltWidth = SrcVT.getScalarSizeInBits();
55857 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
55858 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
55859 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
55860
55861 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
55862 // with scalar comparisons.
55863 if (SDValue NotSrc = IsNOT(Src, DAG)) {
55864 SDLoc DL(N);
55865 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
55866 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
55867 return DAG.getNode(ISD::XOR, DL, VT,
55868 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
55869 DAG.getConstant(NotMask, DL, VT));
55870 }
55871
55872 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
55873 // results with scalar comparisons.
55874 if (Src.getOpcode() == X86ISD::PCMPGT &&
55875 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
55876 SDLoc DL(N);
55877 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
55878 return DAG.getNode(ISD::XOR, DL, VT,
55879 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
55880 DAG.getConstant(NotMask, DL, VT));
55881 }
55882
55883 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
55884 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
55885 // iff pow2splat(c1).
55886 // Use KnownBits to determine if only a single bit is non-zero
55887 // in each element (pow2 or zero), and shift that bit to the msb.
55888 if (Src.getOpcode() == X86ISD::PCMPEQ) {
55889 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
55890 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
55891 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
55892 if (KnownLHS.countMaxPopulation() == 1 &&
55893 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
55894 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
55895 SDLoc DL(N);
55896 MVT ShiftVT = SrcVT;
55897 SDValue ShiftLHS = Src.getOperand(0);
55898 SDValue ShiftRHS = Src.getOperand(1);
55899 if (ShiftVT.getScalarType() == MVT::i8) {
55900 // vXi8 shifts - we only care about the signbit so can use PSLLW.
55901 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
55902 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
55903 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
55904 }
55905 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
55906 ShiftLHS, ShiftAmt, DAG);
55907 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
55908 ShiftRHS, ShiftAmt, DAG);
55909 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
55910 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
55911 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
55912 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
55913 }
55914 }
55915
55916 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
55917 if (N->isOnlyUserOf(Src.getNode())) {
55919 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
55920 APInt UndefElts;
55921 SmallVector<APInt, 32> EltBits;
55922 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
55923 UndefElts, EltBits)) {
55924 APInt Mask = APInt::getZero(NumBits);
55925 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
55926 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
55927 Mask.setBit(Idx);
55928 }
55929 SDLoc DL(N);
55930 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
55931 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
55932 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
55933 DAG.getConstant(Mask, DL, VT));
55934 }
55935 }
55936 }
55937
55938 // Simplify the inputs.
55939 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55940 APInt DemandedMask(APInt::getAllOnes(NumBits));
55941 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55942 return SDValue(N, 0);
55943
55944 return SDValue();
55945}
55946
// Minimal combine: only run demanded-bits simplification over the node's full
// scalar result width. Presumably the X86ISD::TESTP combine — the signature
// opener is elided from this excerpt; TODO confirm against upstream.
55949 const X86Subtarget &Subtarget) {
55950 MVT VT = N->getSimpleValueType(0);
55951 unsigned NumBits = VT.getScalarSizeInBits();
55952
// Returning SDValue(N, 0) signals DAGCombiner that N itself was revisited.
55953 // Simplify the inputs.
55954 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55955 APInt DemandedMask(APInt::getAllOnes(NumBits));
55956 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55957 return SDValue(N, 0);
55958
55959 return SDValue();
55960}
55961
// Combine for X86-specific masked gather/scatter nodes: only the sign bit of
// each mask element is demanded, since the hardware tests element sign bits.
// NOTE(review): the function signature is elided from this excerpt.
55964 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
55965 SDValue Mask = MemOp->getMask();
55966
// Skip vXi1 masks — every bit is already significant there.
55967 // With vector masks we only demand the upper bit of the mask.
55968 if (Mask.getScalarValueSizeInBits() != 1) {
55969 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55970 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55971 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
// SimplifyDemandedBits may have CSE'd N away; only re-queue it if it still
// exists, then report that N (possibly updated in place) changed.
55972 if (N->getOpcode() != ISD::DELETED_NODE)
55973 DCI.AddToWorklist(N);
55974 return SDValue(N, 0);
55975 }
55976 }
55977
55978 return SDValue();
55979}
55980
// Rebuild a masked gather or scatter node with replacement Index/Base/Scale
// operands while preserving all other operands and the memory metadata
// (VT list, memory VT, MMO, index type, extension/truncation kind).
// NOTE(review): the first signature line (declaring GorS) is elided here.
55982 SDValue Index, SDValue Base, SDValue Scale,
55983 SelectionDAG &DAG) {
55984 SDLoc DL(GorS);
55985
55986 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
// Gather operand order: chain, passthru, mask, base, index, scale.
55987 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
55988 Gather->getMask(), Base, Index, Scale } ;
55989 return DAG.getMaskedGather(Gather->getVTList(),
55990 Gather->getMemoryVT(), DL, Ops,
55991 Gather->getMemOperand(),
55992 Gather->getIndexType(),
55993 Gather->getExtensionType());
55994 }
// Not a gather, so it must be a scatter (cast asserts this).
55995 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
// Scatter operand order: chain, stored value, mask, base, index, scale.
55996 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
55997 Scatter->getMask(), Base, Index, Scale };
55998 return DAG.getMaskedScatter(Scatter->getVTList(),
55999 Scatter->getMemoryVT(), DL,
56000 Ops, Scatter->getMemOperand(),
56001 Scatter->getIndexType(),
56002 Scatter->isTruncatingStore());
56003}
56004
// Combine for generic masked gather/scatter: narrow over-wide indices,
// migrate splat-constant index adders into the base pointer, legalize the
// index width to i32/i64, and simplify the mask's demanded bits.
// NOTE(review): the function signature is elided from this excerpt; N, DAG
// and DCI are the usual combine parameters.
56007 SDLoc DL(N);
56008 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56009 SDValue Index = GorS->getIndex();
56010 SDValue Base = GorS->getBasePtr();
56011 SDValue Scale = GorS->getScale();
56012 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56013
56014 if (DCI.isBeforeLegalize()) {
56015 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56016
56017 // Shrink constant indices if they are larger than 32-bits.
56018 // Only do this before legalize types since v2i64 could become v2i32.
56019 // FIXME: We could check that the type is legal if we're after legalize
56020 // types, but then we would need to construct test cases where that happens.
56021 // FIXME: We could support more than just constant vectors, but we need to
56022 // careful with costing. A truncate that can be optimized out would be fine.
56023 // Otherwise we might only want to create a truncate if it avoids a split.
56024 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
// Enough sign bits means the truncation to i32 is value-preserving.
56025 if (BV->isConstant() && IndexWidth > 32 &&
56026 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56027 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
56028 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56029 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56030 }
56031 }
56032
56033 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
56034 // there are sufficient sign bits. Only do this before legalize types to
56035 // avoid creating illegal types in truncate.
56036 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56037 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56038 IndexWidth > 32 &&
56039 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
56040 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56041 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
56042 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56043 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56044 }
56045 }
56046
56047 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56048 // Try to move splat constant adders from the index operand to the base
56049 // pointer operand. Taking care to multiply by the scale. We can only do
56050 // this when index element type is the same as the pointer type.
56051 // Otherwise we need to be sure the math doesn't wrap before the scale.
56052 if (Index.getOpcode() == ISD::ADD &&
56053 Index.getValueType().getVectorElementType() == PtrVT &&
56054 isa<ConstantSDNode>(Scale)) {
56055 uint64_t ScaleAmt = Scale->getAsZExtVal();
56056 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
56057 BitVector UndefElts;
56058 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
56059 // FIXME: Allow non-constant?
56060 if (UndefElts.none()) {
// base' = base + splat * scale; index' drops the ADD.
56061 // Apply the scale.
56062 APInt Adder = C->getAPIntValue() * ScaleAmt;
56063 // Add it to the existing base.
56064 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56065 DAG.getConstant(Adder, DL, PtrVT));
56066 Index = Index.getOperand(0);
56067 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56068 }
56069 }
56070
56071 // It's also possible base is just a constant. In that case, just
56072 // replace it with 0 and move the displacement into the index.
// Only legal when scale == 1, since the base is not scaled but the index is.
56073 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
56074 isOneConstant(Scale)) {
56075 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
56076 // Combine the constant build_vector and the constant base.
56077 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
56078 Index.getOperand(1), Splat);
56079 // Add to the LHS of the original Index add.
56080 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
56081 Index.getOperand(0), Splat);
56082 Base = DAG.getConstant(0, DL, Base.getValueType());
56083 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56084 }
56085 }
56086 }
56087
56088 if (DCI.isBeforeLegalizeOps()) {
56089 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56090
// Hardware gather/scatter only supports i32/i64 indices; widen or narrow
// (sign-extending, since indices are signed displacements) as needed.
56091 // Make sure the index is either i32 or i64
56092 if (IndexWidth != 32 && IndexWidth != 64) {
56093 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56094 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
56095 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56096 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56097 }
56098 }
56099
// Same mask-sign-bit trick as the X86-specific gather/scatter combine.
56100 // With vector masks we only demand the upper bit of the mask.
56101 SDValue Mask = GorS->getMask();
56102 if (Mask.getScalarValueSizeInBits() != 1) {
56103 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56104 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
// N may have been CSE'd away by the simplification; re-queue only if alive.
56105 if (N->getOpcode() != ISD::DELETED_NODE)
56106 DCI.AddToWorklist(N);
56107 return SDValue(N, 0);
56108 }
56109 }
56110
56111 return SDValue();
56112}
56113
56114// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
// Tries to simplify the flag-producing operand and condition code of an
// X86ISD::SETCC node. NOTE(review): the signature opener is elided here.
56116 const X86Subtarget &Subtarget) {
56117 SDLoc DL(N);
// Operand 0 is the condition-code immediate, operand 1 the EFLAGS value.
56118 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
56119 SDValue EFLAGS = N->getOperand(1);
56120
// combineSetCCEFLAGS presumably may also adjust CC (by-reference upstream) —
// the rebuilt SETCC below uses the possibly-updated CC; TODO confirm.
56121 // Try to simplify the EFLAGS and condition code operands.
56122 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
56123 return getSETCC(CC, Flags, DL, DAG);
56124
56125 return SDValue();
56126}
56127
56128/// Optimize branch condition evaluation.
// Simplifies the EFLAGS operand (and possibly the condition code) of an
// X86ISD::BRCOND node. NOTE(review): the signature opener is elided here.
56130 const X86Subtarget &Subtarget) {
56131 SDLoc DL(N);
// BRCOND operands: 0 = chain, 1 = destination, 2 = cond-code, 3 = EFLAGS.
56132 SDValue EFLAGS = N->getOperand(3);
56133 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
56134
56135 // Try to simplify the EFLAGS and condition code operands.
56136 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
56137 // RAUW them under us.
56138 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
56139 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
56140 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
56141 N->getOperand(1), Cond, Flags);
56142 }
56143
56144 return SDValue();
56145}
56146
56147// TODO: Could we move this to DAGCombine?
// Folds a unary FP op applied to AND(all-ones-or-zero-mask, constant) by
// applying the op to the constant up front, so only the cheap AND remains.
// NOTE(review): the signature opener is elided from this excerpt.
56149 SelectionDAG &DAG) {
56150 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
56151 // to optimize away operation when it's from a constant.
56152 //
56153 // The general transformation is:
56154 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
56155 // AND(VECTOR_CMP(x,y), constant2)
56156 // constant2 = UNARYOP(constant)
56157
56158 // Early exit if this isn't a vector operation, the operand of the
56159 // unary operation isn't a bitwise AND, or if the sizes of the operations
56160 // aren't the same.
56161 EVT VT = N->getValueType(0);
// Strict FP nodes carry the chain in operand 0, so the data operand shifts.
56162 bool IsStrict = N->isStrictFPOpcode();
56163 unsigned NumEltBits = VT.getScalarSizeInBits();
56164 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
// ComputeNumSignBits == element width proves each lane is 0 or all-ones.
56165 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
56166 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
56167 VT.getSizeInBits() != Op0.getValueSizeInBits())
56168 return SDValue();
56169
56170 // Now check that the other operand of the AND is a constant. We could
56171 // make the transformation for non-constant splats as well, but it's unclear
56172 // that would be a benefit as it would not eliminate any operations, just
56173 // perform one more step in scalar code before moving to the vector unit.
56174 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
56175 // Bail out if the vector isn't a constant.
56176 if (!BV->isConstant())
56177 return SDValue();
56178
56179 // Everything checks out. Build up the new and improved node.
56180 SDLoc DL(N);
56181 EVT IntVT = BV->getValueType(0);
56182 // Create a new constant of the appropriate type for the transformed
56183 // DAG.
56184 SDValue SourceConst;
56185 if (IsStrict)
56186 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
56187 {N->getOperand(0), SDValue(BV, 0)});
56188 else
56189 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
56190 // The AND node needs bitcasts to/from an integer vector type around it.
56191 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
56192 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
56193 MaskConst);
56194 SDValue Res = DAG.getBitcast(VT, NewAnd);
// For strict ops, forward the chain produced by the folded constant op.
56195 if (IsStrict)
56196 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
56197 return Res;
56198 }
56199
56200 return SDValue();
56201}
56202
56203/// If we are converting a value to floating-point, try to replace scalar
56204/// truncate of an extracted vector element with a bitcast. This tries to keep
56205/// the sequence on XMM registers rather than moving between vector and GPRs.
// NOTE(review): the function's signature line is elided from this excerpt.
56207 // TODO: This is currently only used by combineSIntToFP, but it is generalized
56208 // to allow being called by any similar cast opcode.
56209 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
56210 SDValue Trunc = N->getOperand(0);
56211 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
56212 return SDValue();
56213
// Only handle extraction of element 0 — the bitcast trick below relies on
// element 0 occupying the low bits in both views of the vector.
56214 SDValue ExtElt = Trunc.getOperand(0);
56215 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
56216 !isNullConstant(ExtElt.getOperand(1)))
56217 return SDValue();
56218
// The source element width must be a multiple of the truncated width so the
// source vector can be reinterpreted as whole elements of the narrow type.
56219 EVT TruncVT = Trunc.getValueType();
56220 EVT SrcVT = ExtElt.getValueType();
56221 unsigned DestWidth = TruncVT.getSizeInBits();
56222 unsigned SrcWidth = SrcVT.getSizeInBits();
56223 if (SrcWidth % DestWidth != 0)
56224 return SDValue();
56225
56226 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
56227 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
56228 unsigned VecWidth = SrcVecVT.getSizeInBits();
56229 unsigned NumElts = VecWidth / DestWidth;
56230 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
56231 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
56232 SDLoc DL(N);
56233 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
56234 BitcastVec, ExtElt.getOperand(1));
56235 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
56236}
56237
// Combine for (STRICT_)UINT_TO_FP: widen narrow integer sources and convert
// to the cheaper SINT_TO_FP when the value is provably non-negative.
// NOTE(review): the signature opener is elided from this excerpt.
56239 const X86Subtarget &Subtarget) {
// Strict FP nodes carry a chain in operand 0; the value operand shifts by 1.
56240 bool IsStrict = N->isStrictFPOpcode();
56241 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56242 EVT VT = N->getValueType(0);
56243 EVT InVT = Op0.getValueType();
56244
56245 // Using i16 as an intermediate type is a bad idea, unless we have HW support
56246 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
56247 // if hasFP16 support:
56248 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
56249 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
56250 // else
56251 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
56252 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
56253 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
56254 unsigned ScalarSize = InVT.getScalarSizeInBits();
// Already a directly-supported width — nothing to do here.
56255 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
56256 ScalarSize >= 64)
56257 return SDValue();
56258 SDLoc dl(N);
56259 EVT DstVT =
56261 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
56262 : ScalarSize < 32 ? MVT::i32
56263 : MVT::i64,
56264 InVT.getVectorNumElements());
// Zero-extension keeps the value in signed range, so SINT_TO_FP is exact.
56265 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
56266 if (IsStrict)
56267 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56268 {N->getOperand(0), P});
56269 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56270 }
56271
56272 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
56273 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
56274 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
56275 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
56276 VT.getScalarType() != MVT::f16) {
56277 SDLoc dl(N);
56278 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
56279 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
56280
56281 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
56282 if (IsStrict)
56283 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56284 {N->getOperand(0), P});
56285 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56286 }
56287
56288 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
56289 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
56290 // the optimization here.
// nneg flag or known-zero sign bit makes signed/unsigned conversion agree.
56291 SDNodeFlags Flags = N->getFlags();
56292 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
56293 if (IsStrict)
56294 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
56295 {N->getOperand(0), Op0});
56296 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
56297 }
56298
56299 return SDValue();
56300}
56301
// Combine for (STRICT_)SINT_TO_FP: fold away the conversion when fed by a
// masked constant, widen narrow sources, narrow over-wide sources, and form
// x87 FILD on 32-bit targets for i64 loads.
// NOTE(review): the signature opener is elided from this excerpt.
56304 const X86Subtarget &Subtarget) {
56305 // First try to optimize away the conversion entirely when it's
56306 // conditionally from a constant. Vectors only.
56307 bool IsStrict = N->isStrictFPOpcode();
// NOTE(review): the guarding `if (SDValue Res = ...)` for this early return
// is elided from this excerpt (missing original line) — confirm upstream.
56309 return Res;
56310
56311 // Now move on to more general possibilities.
56312 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56313 EVT VT = N->getValueType(0);
56314 EVT InVT = Op0.getValueType();
56315
56316 // Using i16 as an intermediate type is a bad idea, unless we have HW support
56317 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
56318 // if hasFP16 support:
56319 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
56320 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
56321 // else
56322 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
56323 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
56324 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
56325 unsigned ScalarSize = InVT.getScalarSizeInBits();
// Already a directly-supported width — nothing to do here.
56326 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
56327 ScalarSize >= 64)
56328 return SDValue();
56329 SDLoc dl(N);
56330 EVT DstVT =
56332 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
56333 : ScalarSize < 32 ? MVT::i32
56334 : MVT::i64,
56335 InVT.getVectorNumElements());
// Sign-extension preserves the signed value exactly.
56336 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
56337 if (IsStrict)
56338 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56339 {N->getOperand(0), P});
56340 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56341 }
56342
56343 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
56344 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
56345 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
56346 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
56347 VT.getScalarType() != MVT::f16) {
56348 SDLoc dl(N);
56349 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
56350 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
56351 if (IsStrict)
56352 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56353 {N->getOperand(0), P});
56354 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56355 }
56356
56357 // Without AVX512DQ we only support i64 to float scalar conversion. For both
56358 // vectors and scalars, see if we know that the upper bits are all the sign
56359 // bit, in which case we can truncate the input to i32 and convert from that.
56360 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
56361 unsigned BitWidth = InVT.getScalarSizeInBits();
56362 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
// >= BitWidth-31 sign bits means the value fits in i32 after truncation.
56363 if (NumSignBits >= (BitWidth - 31)) {
56364 EVT TruncVT = MVT::i32;
56365 if (InVT.isVector())
56366 TruncVT = InVT.changeVectorElementType(TruncVT);
56367 SDLoc dl(N);
56368 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
56369 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
56370 if (IsStrict)
56371 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56372 {N->getOperand(0), Trunc});
56373 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
56374 }
56375 // If we're after legalize and the type is v2i32 we need to shuffle and
56376 // use CVTSI2P.
56377 assert(InVT == MVT::v2i64 && "Unexpected VT!");
// Shuffle {0,2,-1,-1} gathers the low i32 halves of both i64 elements.
56378 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
56379 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
56380 { 0, 2, -1, -1 });
56381 if (IsStrict)
56382 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
56383 {N->getOperand(0), Shuf});
56384 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
56385 }
56386 }
56387
56388 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
56389 // a 32-bit target where SSE doesn't support i64->FP operations.
56390 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
56391 Op0.getOpcode() == ISD::LOAD) {
56392 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
56393
56394 // This transformation is not supported if the result type is f16 or f128.
56395 if (VT == MVT::f16 || VT == MVT::f128)
56396 return SDValue();
56397
56398 // If we have AVX512DQ we can use packed conversion instructions unless
56399 // the VT is f80.
56400 if (Subtarget.hasDQI() && VT != MVT::f80)
56401 return SDValue();
56402
56403 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
56404 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
// BuildFILD returns {result, chain}; rewire the load's chain users to the
// FILD chain so the original load can be removed.
56405 std::pair<SDValue, SDValue> Tmp =
56406 Subtarget.getTargetLowering()->BuildFILD(
56407 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
56408 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
56409 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
56410 return Tmp.first;
56411 }
56412 }
56413
// The trunc-of-extract fold below is not chain-aware, so skip strict nodes.
56414 if (IsStrict)
56415 return SDValue();
56416
56417 if (SDValue V = combineToFPTruncExtElt(N, DAG))
56418 return V;
56419
56420 return SDValue();
56421}
56422
56423// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
// Widens a v2f32 saturating FP->i64 conversion to v4f32 so the AVX10.2
// instruction forms apply. NOTE(review): signature opener elided here.
56425 const X86Subtarget &Subtarget) {
// These saturating-convert instructions only exist on AVX10.2 targets.
56426 if (!Subtarget.hasAVX10_2())
56427 return SDValue();
56428
56429 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
56430 EVT SrcVT = N->getOperand(0).getValueType();
56431 EVT DstVT = N->getValueType(0);
56432 SDLoc dl(N);
56433
56434 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
// Upper two lanes are undef — only the low two results are used.
56435 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
56436
56437 // Concatenate the original v2f32 input and V2F32Value to create v4f32
56438 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
56439 N->getOperand(0), V2F32Value);
56440
56441 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
56442 if (IsSigned)
56443 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
56444
56445 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
56446 }
56447 return SDValue();
56448}
56449
// Returns true if any user of the EFLAGS value reads a condition that depends
// on CF/OF/SF (i.e. anything beyond ZF), so those flags must stay accurate.
// NOTE(review): the signature line is elided from this excerpt; based on the
// body this is presumably `static bool needCarryOrOverflowFlag(SDValue Flags)`.
56451 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
56452
56453 for (const SDNode *User : Flags->users()) {
// NOTE(review): the declaration of CC (and likely an X86ISD::SETCC_CARRY
// case label) is elided from this excerpt — confirm against upstream.
56455 switch (User->getOpcode()) {
56456 default:
56457 // Be conservative.
56458 return true;
56459 case X86ISD::SETCC:
// SETCC holds its cond-code in operand 0; BRCOND/CMOV hold it in operand 2.
56461 CC = (X86::CondCode)User->getConstantOperandVal(0);
56462 break;
56463 case X86ISD::BRCOND:
56464 case X86ISD::CMOV:
56465 CC = (X86::CondCode)User->getConstantOperandVal(2);
56466 break;
56467 }
56468
// Any of these conditions reads CF, OF, or SF.
56469 switch (CC) {
56470 // clang-format off
56471 default: break;
56472 case X86::COND_A: case X86::COND_AE:
56473 case X86::COND_B: case X86::COND_BE:
56474 case X86::COND_O: case X86::COND_NO:
56475 case X86::COND_G: case X86::COND_GE:
56476 case X86::COND_L: case X86::COND_LE:
56477 return true;
56478 // clang-format on
56479 }
56480 }
56481
// Every user only tested ZF-derived conditions (or E/NE).
56482 return false;
56484
// Returns true if every user of the EFLAGS value only tests COND_E/COND_NE,
// i.e. only the zero flag matters — enabling flag-clobbering rewrites.
56485static bool onlyZeroFlagUsed(SDValue Flags) {
56486 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
56487
56488 for (const SDNode *User : Flags->users()) {
// Operand index of the condition-code immediate, per user opcode.
56489 unsigned CCOpNo;
56490 switch (User->getOpcode()) {
56491 default:
56492 // Be conservative.
56493 return false;
56494 case X86ISD::SETCC:
// NOTE(review): an additional case label (likely X86ISD::SETCC_CARRY) is
// elided from this excerpt — confirm against upstream.
56496 CCOpNo = 0;
56497 break;
56498 case X86ISD::BRCOND:
56499 case X86ISD::CMOV:
56500 CCOpNo = 2;
56501 break;
56502 }
56503
// Any non-E/NE condition reads flags beyond ZF.
56504 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
56505 if (CC != X86::COND_E && CC != X86::COND_NE)
56506 return false;
56507 }
56508
56509 return true;
56510}
56511
56514 const X86Subtarget &Subtarget) {
56515 // Only handle test patterns.
56516 if (!isNullConstant(N->getOperand(1)))
56517 return SDValue();
56518
56519 // If we have a CMP of a truncated binop, see if we can make a smaller binop
56520 // and use its flags directly.
56521 // TODO: Maybe we should try promoting compares that only use the zero flag
56522 // first if we can prove the upper bits with computeKnownBits?
56523 SDLoc dl(N);
56524 SDValue Op = N->getOperand(0);
56525 EVT VT = Op.getValueType();
56526 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56527
56528 if (SDValue CMP =
56529 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
56530 return CMP;
56531
56532 // If we have a constant logical shift that's only used in a comparison
56533 // against zero turn it into an equivalent AND. This allows turning it into
56534 // a TEST instruction later.
56535 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
56536 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
56537 onlyZeroFlagUsed(SDValue(N, 0))) {
56538 unsigned BitWidth = VT.getSizeInBits();
56539 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
56540 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
56541 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
56542 APInt Mask = Op.getOpcode() == ISD::SRL
56543 ? APInt::getHighBitsSet(BitWidth, MaskBits)
56544 : APInt::getLowBitsSet(BitWidth, MaskBits);
56545 if (Mask.isSignedIntN(32)) {
56546 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
56547 DAG.getConstant(Mask, dl, VT));
56548 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56549 DAG.getConstant(0, dl, VT));
56550 }
56551 }
56552 }
56553
56554 // If we're extracting from a avx512 bool vector and comparing against zero,
56555 // then try to just bitcast the vector to an integer to use TEST/BT directly.
56556 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
56557 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
56558 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
56559 SDValue Src = Op.getOperand(0);
56560 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56561 isNullConstant(Src.getOperand(1)) &&
56562 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
56563 SDValue BoolVec = Src.getOperand(0);
56564 unsigned ShAmt = 0;
56565 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
56566 ShAmt = BoolVec.getConstantOperandVal(1);
56567 BoolVec = BoolVec.getOperand(0);
56568 }
56569 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
56570 EVT VecVT = BoolVec.getValueType();
56571 unsigned BitWidth = VecVT.getVectorNumElements();
56572 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
56573 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
56574 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
56575 Op = DAG.getBitcast(BCVT, BoolVec);
56576 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
56577 DAG.getConstant(Mask, dl, BCVT));
56578 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56579 DAG.getConstant(0, dl, BCVT));
56580 }
56581 }
56582 }
56583
56584 // Peek through any zero-extend if we're only testing for a zero result.
56585 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
56586 SDValue Src = Op.getOperand(0);
56587 EVT SrcVT = Src.getValueType();
56588 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
56589 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
56590 DAG.getConstant(0, dl, SrcVT));
56591 }
56592
56593 // Look for a truncate.
56594 if (Op.getOpcode() != ISD::TRUNCATE)
56595 return SDValue();
56596
56597 SDValue Trunc = Op;
56598 Op = Op.getOperand(0);
56599
56600 // See if we can compare with zero against the truncation source,
56601 // which should help using the Z flag from many ops. Only do this for
56602 // i32 truncated op to prevent partial-reg compares of promoted ops.
56603 EVT OpVT = Op.getValueType();
56604 APInt UpperBits =
56606 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
56607 onlyZeroFlagUsed(SDValue(N, 0))) {
56608 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56609 DAG.getConstant(0, dl, OpVT));
56610 }
56611
56612 // After this the truncate and arithmetic op must have a single use.
56613 if (!Trunc.hasOneUse() || !Op.hasOneUse())
56614 return SDValue();
56615
56616 unsigned NewOpc;
56617 switch (Op.getOpcode()) {
56618 default: return SDValue();
56619 case ISD::AND:
56620 // Skip and with constant. We have special handling for and with immediate
56621 // during isel to generate test instructions.
56622 if (isa<ConstantSDNode>(Op.getOperand(1)))
56623 return SDValue();
56624 NewOpc = X86ISD::AND;
56625 break;
56626 case ISD::OR: NewOpc = X86ISD::OR; break;
56627 case ISD::XOR: NewOpc = X86ISD::XOR; break;
56628 case ISD::ADD:
56629 // If the carry or overflow flag is used, we can't truncate.
56631 return SDValue();
56632 NewOpc = X86ISD::ADD;
56633 break;
56634 case ISD::SUB:
56635 // If the carry or overflow flag is used, we can't truncate.
56637 return SDValue();
56638 NewOpc = X86ISD::SUB;
56639 break;
56640 }
56641
56642 // We found an op we can narrow. Truncate its inputs.
56643 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
56644 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
56645
56646 // Use a X86 specific opcode to avoid DAG combine messing with it.
56647 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56648 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
56649
56650 // For AND, keep a CMP so that we can match the test pattern.
56651 if (NewOpc == X86ISD::AND)
56652 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56653 DAG.getConstant(0, dl, VT));
56654
56655 // Return the flags.
56656 return Op.getValue(1);
56657}
56658
56661 const X86Subtarget &ST) {
56662 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
56663 "Expected X86ISD::ADD or X86ISD::SUB");
56664
56665 SDLoc DL(N);
56666 SDValue LHS = N->getOperand(0);
56667 SDValue RHS = N->getOperand(1);
56668 MVT VT = LHS.getSimpleValueType();
56669 bool IsSub = X86ISD::SUB == N->getOpcode();
56670 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
56671
56672 if (IsSub && isOneConstant(N->getOperand(1)) && !N->hasAnyUseOfValue(0))
56673 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
56674 return CMP;
56675
56676 // If we don't use the flag result, simplify back to a generic ADD/SUB.
56677 if (!N->hasAnyUseOfValue(1)) {
56678 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
56679 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
56680 }
56681
56682 // Fold any similar generic ADD/SUB opcodes to reuse this node.
56683 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
56684 SDValue Ops[] = {N0, N1};
56685 SDVTList VTs = DAG.getVTList(N->getValueType(0));
56686 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
56687 SDValue Op(N, 0);
56688 if (Negate) {
56689 // Bail if this is only used by a user of the x86 add/sub.
56690 if (GenericAddSub->hasOneUse() &&
56691 GenericAddSub->user_begin()->isOnlyUserOf(N))
56692 return;
56693 Op = DAG.getNegative(Op, DL, VT);
56694 }
56695 DCI.CombineTo(GenericAddSub, Op);
56696 }
56697 };
56698 MatchGeneric(LHS, RHS, false);
56699 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
56700
56701 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
56702 // EFLAGS result doesn't change.
56703 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
56704 /*ZeroSecondOpOnly*/ true);
56705}
56706
56708 SDValue LHS = N->getOperand(0);
56709 SDValue RHS = N->getOperand(1);
56710 SDValue BorrowIn = N->getOperand(2);
56711
56712 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
56713 MVT VT = N->getSimpleValueType(0);
56714 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56715 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
56716 }
56717
56718 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
56719 // iff the flag result is dead.
56720 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
56721 !N->hasAnyUseOfValue(1))
56722 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56723 LHS.getOperand(1), BorrowIn);
56724
56725 return SDValue();
56726}
56727
56728// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
56731 SDValue LHS = N->getOperand(0);
56732 SDValue RHS = N->getOperand(1);
56733 SDValue CarryIn = N->getOperand(2);
56734 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
56735 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
56736
56737 // Canonicalize constant to RHS.
56738 if (LHSC && !RHSC)
56739 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
56740 CarryIn);
56741
56742 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
56743 // the result is either zero or one (depending on the input carry bit).
56744 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
56745 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
56746 // We don't have a good way to replace an EFLAGS use, so only do this when
56747 // dead right now.
56748 SDValue(N, 1).use_empty()) {
56749 SDLoc DL(N);
56750 EVT VT = N->getValueType(0);
56751 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
56752 SDValue Res1 = DAG.getNode(
56753 ISD::AND, DL, VT,
56755 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
56756 DAG.getConstant(1, DL, VT));
56757 return DCI.CombineTo(N, Res1, CarryOut);
56758 }
56759
56760 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
56761 // iff the flag result is dead.
56762 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
56763 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
56764 SDLoc DL(N);
56765 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
56766 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
56767 DAG.getConstant(0, DL, LHS.getValueType()),
56768 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
56769 }
56770
56771 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
56772 MVT VT = N->getSimpleValueType(0);
56773 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56774 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
56775 }
56776
56777 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
56778 // iff the flag result is dead.
56779 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
56780 !N->hasAnyUseOfValue(1))
56781 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56782 LHS.getOperand(1), CarryIn);
56783
56784 return SDValue();
56785}
56786
56788 const SDLoc &DL, EVT VT,
56789 const X86Subtarget &Subtarget) {
56790 using namespace SDPatternMatch;
56791
56792 // Example of pattern we try to detect:
56793 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
56794 //(add (build_vector (extract_elt t, 0),
56795 // (extract_elt t, 2),
56796 // (extract_elt t, 4),
56797 // (extract_elt t, 6)),
56798 // (build_vector (extract_elt t, 1),
56799 // (extract_elt t, 3),
56800 // (extract_elt t, 5),
56801 // (extract_elt t, 7)))
56802
56803 if (!Subtarget.hasSSE2())
56804 return SDValue();
56805
56806 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
56807 VT.getVectorNumElements() < 4 ||
56809 return SDValue();
56810
56811 SDValue Op0, Op1, Accum;
56812 if (!sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
56813 m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))) &&
56814 !sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
56815 m_Add(m_Value(Accum), m_AllOf(m_Opc(ISD::BUILD_VECTOR),
56816 m_Value(Op1))))))
56817 return SDValue();
56818
56819 // Check if one of Op0,Op1 is of the form:
56820 // (build_vector (extract_elt Mul, 0),
56821 // (extract_elt Mul, 2),
56822 // (extract_elt Mul, 4),
56823 // ...
56824 // the other is of the form:
56825 // (build_vector (extract_elt Mul, 1),
56826 // (extract_elt Mul, 3),
56827 // (extract_elt Mul, 5),
56828 // ...
56829 // and identify Mul.
56830 SDValue Mul;
56831 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
56832 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
56833 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
56834 // TODO: Be more tolerant to undefs.
56835 APInt Idx0L, Idx0H, Idx1L, Idx1H;
56836 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
56837 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
56838 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
56839 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
56840 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
56841 return SDValue();
56842 // Commutativity of mul allows factors of a product to reorder.
56843 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
56844 std::swap(Idx0L, Idx1L);
56845 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
56846 std::swap(Idx0H, Idx1H);
56847 // Commutativity of add allows pairs of factors to reorder.
56848 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
56849 std::swap(Idx0L, Idx0H);
56850 std::swap(Idx1L, Idx1H);
56851 }
56852 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
56853 Idx1H != 2 * i + 3)
56854 return SDValue();
56855 if (!Mul) {
56856 // First time an extract_elt's source vector is visited. Must be a MUL
56857 // with 2X number of vector elements than the BUILD_VECTOR.
56858 // Both extracts must be from same MUL.
56859 Mul = Vec0L;
56860 if (Mul.getOpcode() != ISD::MUL ||
56861 Mul.getValueType().getVectorNumElements() != 2 * e)
56862 return SDValue();
56863 }
56864 // Check that the extract is from the same MUL previously seen.
56865 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
56866 return SDValue();
56867 }
56868
56869 // Check if the Mul source can be safely shrunk.
56870 ShrinkMode Mode;
56871 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
56872 Mode == ShrinkMode::MULU16)
56873 return SDValue();
56874
56875 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56876 VT.getVectorNumElements() * 2);
56877 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
56878 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
56879
56880 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
56881 ArrayRef<SDValue> Ops) {
56882 EVT InVT = Ops[0].getValueType();
56883 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
56884 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
56885 InVT.getVectorNumElements() / 2);
56886 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
56887 };
56888 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
56889 if (Accum)
56890 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
56891 return R;
56892}
56893
56894// Attempt to turn this pattern into PMADDWD.
56895// (add (mul (sext (build_vector)), (sext (build_vector))),
56896// (mul (sext (build_vector)), (sext (build_vector)))
56898 const SDLoc &DL, EVT VT,
56899 const X86Subtarget &Subtarget) {
56900 using namespace SDPatternMatch;
56901
56902 if (!Subtarget.hasSSE2())
56903 return SDValue();
56904
56905 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
56906 VT.getVectorNumElements() < 4 ||
56908 return SDValue();
56909
56910 // All inputs need to be sign extends.
56911 // TODO: Support ZERO_EXTEND from known positive?
56912 SDValue N00, N01, N10, N11;
56913 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
56914 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
56915 return SDValue();
56916
56917 // Must be extending from vXi16.
56918 EVT InVT = N00.getValueType();
56919 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
56920 N10.getValueType() != InVT || N11.getValueType() != InVT)
56921 return SDValue();
56922
56923 // All inputs should be build_vectors.
56924 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
56925 N01.getOpcode() != ISD::BUILD_VECTOR ||
56926 N10.getOpcode() != ISD::BUILD_VECTOR ||
56928 return SDValue();
56929
56930 // For each element, we need to ensure we have an odd element from one vector
56931 // multiplied by the odd element of another vector and the even element from
56932 // one of the same vectors being multiplied by the even element from the
56933 // other vector. So we need to make sure for each element i, this operator
56934 // is being performed:
56935 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
56936 SDValue In0, In1;
56937 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
56938 SDValue N00Elt = N00.getOperand(i);
56939 SDValue N01Elt = N01.getOperand(i);
56940 SDValue N10Elt = N10.getOperand(i);
56941 SDValue N11Elt = N11.getOperand(i);
56942 // TODO: Be more tolerant to undefs.
56943 SDValue N00In, N01In, N10In, N11In;
56944 APInt IdxN00, IdxN01, IdxN10, IdxN11;
56945 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
56946 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
56947 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
56948 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
56949 return SDValue();
56950 // Add is commutative so indices can be reordered.
56951 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
56952 std::swap(IdxN00, IdxN10);
56953 std::swap(IdxN01, IdxN11);
56954 }
56955 // N0 indices be the even element. N1 indices must be the next odd element.
56956 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
56957 IdxN11 != 2 * i + 1)
56958 return SDValue();
56959
56960 // First time we find an input capture it.
56961 if (!In0) {
56962 In0 = N00In;
56963 In1 = N01In;
56964
56965 // The input vectors must be at least as wide as the output.
56966 // If they are larger than the output, we extract subvector below.
56967 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
56968 In1.getValueSizeInBits() < VT.getSizeInBits())
56969 return SDValue();
56970 }
56971 // Mul is commutative so the input vectors can be in any order.
56972 // Canonicalize to make the compares easier.
56973 if (In0 != N00In)
56974 std::swap(N00In, N01In);
56975 if (In0 != N10In)
56976 std::swap(N10In, N11In);
56977 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
56978 return SDValue();
56979 }
56980
56981 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
56982 ArrayRef<SDValue> Ops) {
56983 EVT OpVT = Ops[0].getValueType();
56984 assert(OpVT.getScalarType() == MVT::i16 &&
56985 "Unexpected scalar element type");
56986 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
56987 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
56988 OpVT.getVectorNumElements() / 2);
56989 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
56990 };
56991
56992 // If the output is narrower than an input, extract the low part of the input
56993 // vector.
56994 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56995 VT.getVectorNumElements() * 2);
56996 if (OutVT16.bitsLT(In0.getValueType())) {
56997 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
56998 DAG.getVectorIdxConstant(0, DL));
56999 }
57000 if (OutVT16.bitsLT(In1.getValueType())) {
57001 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57002 DAG.getVectorIdxConstant(0, DL));
57003 }
57004 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57005 PMADDBuilder);
57006}
57007
57008// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57009// If upper element in each pair of both VPMADDWD are zero then we can merge
57010// the operand elements and use the implicit add of VPMADDWD.
57011// TODO: Add support for VPMADDUBSW (which isn't commutable).
57013 const SDLoc &DL, EVT VT) {
57014 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57015 return SDValue();
57016
57017 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57018 if (VT.getSizeInBits() > 128)
57019 return SDValue();
57020
57021 unsigned NumElts = VT.getVectorNumElements();
57022 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57024 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57025
57026 bool Op0HiZero =
57027 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57028 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57029 bool Op1HiZero =
57030 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57031 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57032
57033 // TODO: Check for zero lower elements once we have actual codegen that
57034 // creates them.
57035 if (!Op0HiZero || !Op1HiZero)
57036 return SDValue();
57037
57038 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57039 SmallVector<int> Mask;
57040 for (int i = 0; i != (int)NumElts; ++i) {
57041 Mask.push_back(2 * i);
57042 Mask.push_back(2 * (i + NumElts));
57043 }
57044
57045 SDValue LHS =
57046 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57047 SDValue RHS =
57048 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57049 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57050}
57051
57052/// CMOV of constants requires materializing constant operands in registers.
57053/// Try to fold those constants into an 'add' instruction to reduce instruction
57054/// count. We do this with CMOV rather the generic 'select' because there are
57055/// earlier folds that may be used to turn select-of-constants into logic hacks.
57057 SelectionDAG &DAG,
57058 const X86Subtarget &Subtarget) {
57059 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57060 // better because we eliminate 1-2 instructions. This transform is still
57061 // an improvement without zero operands because we trade 2 move constants and
57062 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57063 // immediate asm operands (fit in 32-bits).
57064 auto isSuitableCmov = [](SDValue V) {
57065 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57066 return false;
57067 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57068 !isa<ConstantSDNode>(V.getOperand(1)))
57069 return false;
57070 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57071 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57072 V.getConstantOperandAPInt(1).isSignedIntN(32));
57073 };
57074
57075 // Match an appropriate CMOV as the first operand of the add.
57076 SDValue Cmov = N->getOperand(0);
57077 SDValue OtherOp = N->getOperand(1);
57078 if (!isSuitableCmov(Cmov))
57079 std::swap(Cmov, OtherOp);
57080 if (!isSuitableCmov(Cmov))
57081 return SDValue();
57082
57083 // Don't remove a load folding opportunity for the add. That would neutralize
57084 // any improvements from removing constant materializations.
57085 if (X86::mayFoldLoad(OtherOp, Subtarget))
57086 return SDValue();
57087
57088 EVT VT = N->getValueType(0);
57089 SDValue FalseOp = Cmov.getOperand(0);
57090 SDValue TrueOp = Cmov.getOperand(1);
57091
57092 // We will push the add through the select, but we can potentially do better
57093 // if we know there is another add in the sequence and this is pointer math.
57094 // In that case, we can absorb an add into the trailing memory op and avoid
57095 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57096 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
57097 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
57098 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
57099 all_of(N->users(), [&](SDNode *Use) {
57100 auto *MemNode = dyn_cast<MemSDNode>(Use);
57101 return MemNode && MemNode->getBasePtr().getNode() == N;
57102 })) {
57103 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
57104 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
57105 // it is possible that choosing op1 might be better.
57106 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
57107 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
57108 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
57109 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
57110 Cmov.getOperand(2), Cmov.getOperand(3));
57111 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
57112 }
57113
57114 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
57115 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
57116 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
57117 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
57118 Cmov.getOperand(3));
57119}
57120
57123 const X86Subtarget &Subtarget) {
57124 EVT VT = N->getValueType(0);
57125 SDValue Op0 = N->getOperand(0);
57126 SDValue Op1 = N->getOperand(1);
57127 SDLoc DL(N);
57128
57129 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
57130 return Select;
57131
57132 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
57133 return MAdd;
57134 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
57135 return MAdd;
57136 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
57137 return MAdd;
57138
57139 // Try to synthesize horizontal adds from adds of shuffles.
57140 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
57141 return V;
57142
57143 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
57144 // iff X and Y won't overflow.
57145 if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW &&
57148 if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) {
57149 MVT OpVT = Op0.getOperand(1).getSimpleValueType();
57150 SDValue Sum =
57151 DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0));
57152 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
57153 getZeroVector(OpVT, Subtarget, DAG, DL));
57154 }
57155 }
57156
57157 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
57158 // (sub Y, (sext (vXi1 X))).
57159 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
57160 // generic DAG combine without a legal type check, but adding this there
57161 // caused regressions.
57162 if (VT.isVector()) {
57163 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57164 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
57165 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
57166 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
57167 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
57168 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
57169 }
57170
57171 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
57172 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
57173 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
57174 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
57175 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
57176 }
57177 }
57178
57179 // Peephole for 512-bit VPDPBSSD on non-VLX targets.
57180 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
57181 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
57182 using namespace SDPatternMatch;
57183 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
57184 if (sd_match(N, m_Add(m_Value(Accum),
57185 m_Node(ISD::CONCAT_VECTORS,
57187 m_Value(Lo1)),
57189 m_Value(Hi1)))))) {
57190 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
57191 concatSubVectors(Lo0, Hi0, DAG, DL),
57192 concatSubVectors(Lo1, Hi1, DAG, DL));
57193 }
57194 }
57195
57196 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
57197 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
57198 X86::isZeroNode(Op0.getOperand(1))) {
57199 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
57200 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
57201 Op0.getOperand(0), Op0.getOperand(2));
57202 }
57203
57204 return combineAddOrSubToADCOrSBB(N, DL, DAG);
57205}
57206
57207// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
57208// condition comes from the subtract node that produced -X. This matches the
57209// cmov expansion for absolute value. By swapping the operands we convert abs
57210// to nabs.
57211static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
57212 SelectionDAG &DAG) {
57213 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
57214 return SDValue();
57215
57216 SDValue Cond = N1.getOperand(3);
57217 if (Cond.getOpcode() != X86ISD::SUB)
57218 return SDValue();
57219 assert(Cond.getResNo() == 1 && "Unexpected result number");
57220
57221 SDValue FalseOp = N1.getOperand(0);
57222 SDValue TrueOp = N1.getOperand(1);
57224
57225 // ABS condition should come from a negate operation.
57226 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
57227 isNullConstant(Cond.getOperand(0))) {
57228 // Get the X and -X from the negate.
57229 SDValue NegX = Cond.getValue(0);
57230 SDValue X = Cond.getOperand(1);
57231
57232 // Cmov operands should be X and NegX. Order doesn't matter.
57233 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
57234 return SDValue();
57235
57236 // Build a new CMOV with the operands swapped.
57237 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
57238 N1.getOperand(2), Cond);
57239 // Convert sub to add.
57240 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
57241 }
57242
57243 // Handle ABD special case:
57244 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
57245 // ABD condition should come from a pair of matching subtracts.
57246 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
57247 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
57248 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
57249 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
57250 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
57251 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
57252 // Build a new CMOV with the operands swapped.
57253 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
57254 Cond);
57255 }
57256
57257 return SDValue();
57258}
57259
57261 SDValue Op0 = N->getOperand(0);
57262 SDValue Op1 = N->getOperand(1);
57263
57264 // (sub C (zero_extend (setcc)))
57265 // =>
57266 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
57267 // Don't disturb (sub 0 setcc), which is easily done with neg.
57268 EVT VT = N->getValueType(0);
57269 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
57270 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
57271 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
57272 Op1.getOperand(0).hasOneUse()) {
57273 SDValue SetCC = Op1.getOperand(0);
57276 APInt NewImm = Op0C->getAPIntValue() - 1;
57277 SDLoc DL(Op1);
57278 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
57279 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
57280 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
57281 DAG.getConstant(NewImm, DL, VT));
57282 }
57283
57284 return SDValue();
57285}
57286
57288 // res, flags2 = sub 0, (setcc cc, flag)
57289 // cload/cstore ..., cond_ne, flag2
57290 // ->
57291 // cload/cstore cc, flag
57292 if (N->getConstantOperandVal(3) != X86::COND_NE)
57293 return SDValue();
57294
57295 SDValue Sub = N->getOperand(4);
57296 if (Sub.getOpcode() != X86ISD::SUB)
57297 return SDValue();
57298
57299 SDValue SetCC = Sub.getOperand(1);
57300
57301 if (!X86::isZeroNode(Sub.getOperand(0)) || SetCC.getOpcode() != X86ISD::SETCC)
57302 return SDValue();
57303
57304 SmallVector<SDValue, 5> Ops(N->op_values());
57305 Ops[3] = SetCC.getOperand(0);
57306 Ops[4] = SetCC.getOperand(1);
57307
57308 return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
57309 cast<MemSDNode>(N)->getMemoryVT(),
57310 cast<MemSDNode>(N)->getMemOperand());
57311}
57312
// Combine an ISD::SUB node. Tries, in order: pushing a negation into an XOR
// constant, ABS-based folds, horizontal-sub formation, ADC/SBB borrow-chain
// folds, CTLZ folds, ADC/SBB formation, and finally the setcc-based sub fold.
// Returns the replacement value or SDValue() if no combine applies.
 57315                                 const X86Subtarget &Subtarget) {
 57316 EVT VT = N->getValueType(0);
 57317 SDValue Op0 = N->getOperand(0);
 57318 SDValue Op1 = N->getOperand(1);
 57319 SDLoc DL(N);
 57320
// True if Op is a non-opaque constant (AllowOpaques=false rejects constants
// that must not be folded/rematerialized).
 57321 auto IsNonOpaqueConstant = [&](SDValue Op) {
 57323 /*AllowOpaques*/ false);
 57324 };
 57325
 57326 // X86 can't encode an immediate LHS of a sub. See if we can push the
 57327 // negation into a preceding instruction. If the RHS of the sub is a XOR with
 57328 // one use and a constant, invert the immediate, saving one register.
 57329 // However, ignore cases where C1 is 0, as those will become a NEG.
 57330 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
 57331 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
 57332 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
 57333 Op1->hasOneUse()) {
 57334 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
 57335 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
 57336 SDValue NewAdd =
 57337 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
 57338 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
 57339 }
 57340
 57341 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
 57342 return V;
 57343
 57344 // Try to synthesize horizontal subs from subs of shuffles.
 57345 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
 57346 return V;
 57347
 57348 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
 57349 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
 57350 X86::isZeroNode(Op1.getOperand(1))) {
 57351 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
 57352 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
 57353 Op1.getOperand(0), Op1.getOperand(2));
 57354 }
 57355
 57356 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
 57357 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
 57358 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
 57359 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
 57360 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
 57361 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
 57362 Op1.getOperand(1), Op1.getOperand(2));
 57363 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
 57364 }
 57365
 57366 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
 57367 return V;
 57368
 57369 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
 57370 return V;
 57371
 57372 return combineSubSetcc(N, DAG);
 57373}
57374
// Combine X86ISD::PCMPEQ / X86ISD::PCMPGT vector integer comparisons:
// fold trivially-identical operands and constant-fold per-element when both
// operands are known constant bit patterns.
 57376                                    const X86Subtarget &Subtarget) {
 57377 unsigned Opcode = N->getOpcode();
 57378 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
 57379 "Unknown PCMP opcode");
 57380
 57381 SDValue LHS = N->getOperand(0);
 57382 SDValue RHS = N->getOperand(1);
 57383 MVT VT = N->getSimpleValueType(0);
 57384 unsigned EltBits = VT.getScalarSizeInBits();
 57385 unsigned NumElts = VT.getVectorNumElements();
 57386 SDLoc DL(N);
 57387
// x == x is all-ones in every lane; x > x (signed) is always false.
 57388 if (LHS == RHS)
 57389 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
 57390 : DAG.getConstant(0, DL, VT);
 57391
 57392 // Constant Folding.
 57393 // PCMPEQ(X,UNDEF) -> UNDEF
 57394 // PCMPGT(X,UNDEF) -> 0
 57395 // PCMPGT(UNDEF,X) -> 0
 57396 APInt LHSUndefs, RHSUndefs;
 57397 SmallVector<APInt> LHSBits, RHSBits;
 57398 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
 57399 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
 57400 APInt Ones = APInt::getAllOnes(EltBits);
 57401 APInt Zero = APInt::getZero(EltBits);
 57402 SmallVector<APInt> Results(NumElts);
 57403 for (unsigned I = 0; I != NumElts; ++I) {
 57404 if (Opcode == X86ISD::PCMPEQ) {
 57405 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
 57406 } else {
// PCMPGT is a signed compare; any undef lane folds to false (0).
 57407 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
 57408 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
 57409 }
 57410 }
// For PCMPEQ, undef lanes stay undef in the result; PCMPGT already folded
// undef lanes to zero above.
 57411 if (Opcode == X86ISD::PCMPEQ)
 57412 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
 57413 return getConstVector(Results, VT, DAG, DL);
 57414 }
 57415
 57416 return SDValue();
 57417}
57418
 57419// Helper to determine if we can convert an integer comparison to a float
 57420// comparison by casting the operands. Returns the int-to-fp cast opcode to
// use (currently only ISD::SINT_TO_FP) when both operands' significant bits
// fit exactly within the fp format's precision, otherwise std::nullopt.
 57421static std::optional<unsigned>
 57422CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
 57423 unsigned NumSignificantBitsRHS) {
 57424 MVT SVT = VT.getScalarType();
 57425 assert(SVT == MVT::f32 && "Only tested for float so far");
 57426 const fltSemantics &Sem = SVT.getFltSemantics();
 57427 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
 57428 "Only PCMPEQ/PCMPGT currently supported");
 57429
 57430 // TODO: Handle bitcastable integers.
 57431
 57432 // For cvt + signed compare we need lhs and rhs to be exactly representable as
 57433 // a fp value.
 57434 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
 57435 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
 57436 return ISD::SINT_TO_FP;
 57437
 57438 return std::nullopt;
 57439}
57440
 57441/// Helper that combines an array of subvector ops as if they were the operands
 57442/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
 57443/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
/// Returns the combined node, or SDValue() if no combine applies.
 57447                                       const X86Subtarget &Subtarget) {
 57448 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
 57449 unsigned EltSizeInBits = VT.getScalarSizeInBits();
 57450
// Trivial folds: all-undef and all-zero concatenations.
 57451 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
 57452 return DAG.getUNDEF(VT);
 57453
 57454 if (llvm::all_of(Ops, [](SDValue Op) {
 57455 return ISD::isBuildVectorAllZeros(Op.getNode());
 57456 }))
 57457 return getZeroVector(VT, Subtarget, DAG, DL);
 57458
 57459 SDValue Op0 = Ops[0];
 57460 bool IsSplat = llvm::all_equal(Ops);
 57461 unsigned NumOps = Ops.size();
 57462 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 57463 LLVMContext &Ctx = *DAG.getContext();
 57464
 57465 // Repeated subvectors.
 57466 if (IsSplat &&
 57467 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
 57468 // If this broadcast is inserted into both halves, use a larger broadcast.
 57469 if (Op0.getOpcode() == X86ISD::VBROADCAST)
 57470 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
 57471
 57472 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
 57473 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
 57474 (Subtarget.hasAVX2() ||
 57476 VT.getScalarType(), Subtarget)))
 57477 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
 57478 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
 57479 Op0.getOperand(0),
 57480 DAG.getVectorIdxConstant(0, DL)));
 57481
 57482 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
 57483 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
 57484 (Subtarget.hasAVX2() ||
 57485 (EltSizeInBits >= 32 &&
 57486 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
 57487 Op0.getOperand(0).getValueType() == VT.getScalarType())
 57488 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
 57489
 57490 // concat_vectors(extract_subvector(broadcast(x)),
 57491 // extract_subvector(broadcast(x))) -> broadcast(x)
 57492 // concat_vectors(extract_subvector(subv_broadcast(x)),
 57493 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
 57494 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
 57495 Op0.getOperand(0).getValueType() == VT) {
 57496 SDValue SrcVec = Op0.getOperand(0);
 57497 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
 57499 return Op0.getOperand(0);
 57500 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
 57501 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
 57502 return Op0.getOperand(0);
 57503 }
 57504
 57505 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
 57506 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
 57507 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
 57508 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57510 Op0.getOperand(0), Op0.getOperand(0)),
 57511 Op0.getOperand(1));
 57512 }
 57513
 57514 // TODO: This should go in combineX86ShufflesRecursively eventually.
 57515 if (NumOps == 2) {
 57516 SDValue Src0 = peekThroughBitcasts(Ops[0]);
 57517 SDValue Src1 = peekThroughBitcasts(Ops[1]);
 57518 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
 57520 EVT SrcVT0 = Src0.getOperand(0).getValueType();
 57521 EVT SrcVT1 = Src1.getOperand(0).getValueType();
 57522 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
 57523 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
 57524 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
 57525 // Only concat of subvector high halves which vperm2x128 is best at.
 57526 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
 57527 SrcVT1.is256BitVector() &&
 57528 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
 57529 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
// Imm 0x31 selects the high 128-bit lane of each source.
 57530 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
 57531 DAG.getBitcast(VT, Src0.getOperand(0)),
 57532 DAG.getBitcast(VT, Src1.getOperand(0)),
 57533 DAG.getTargetConstant(0x31, DL, MVT::i8));
 57534 }
 57535 // concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> x.
 57536 if (Src0.getOperand(0) == Src1.getOperand(0) &&
 57537 Src0.getConstantOperandAPInt(1) == 0 &&
 57538 Src1.getConstantOperandAPInt(1) ==
 57540 return DAG.getBitcast(VT, extractSubVector(Src0.getOperand(0), 0, DAG,
 57541 DL, VT.getSizeInBits()));
 57542 }
 57543 }
 57544 }
 57545
 57546 // Repeated opcode.
 57547 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
 57548 // but it currently struggles with different vector widths.
 57549 if (llvm::all_of(Ops, [Op0](SDValue Op) {
 57550 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
 57551 })) {
// Concatenate operand I of every sub-op into a single wide vector, peeking
// through bitcasts so the original subvector types are concatenated.
 57552 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
 57554 for (SDValue SubOp : SubOps)
 57555 Subs.push_back(SubOp.getOperand(I));
 57556 // Attempt to peek through bitcasts and concat the original subvectors.
 57557 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
 57558 if (SubVT.isSimple() && SubVT.isVector()) {
 57559 EVT ConcatVT =
 57561 SubVT.getVectorElementCount() * Subs.size());
 57562 for (SDValue &Sub : Subs)
 57563 Sub = DAG.getBitcast(SubVT, Sub);
 57564 return DAG.getBitcast(
 57565 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
 57566 }
 57567 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
 57568 };
// True if concatenating operand 'Op' of every sub-op is free: all sub-ops
// share the same load, all are constant build vectors, or all are in-place
// extracts of consecutive subvectors from one full-width source vector.
 57569 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
 57570 bool AllConstants = true;
 57571 bool AllSubs = true;
 57572 unsigned VecSize = VT.getSizeInBits();
 57573 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
 57574 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
 57575 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
 57576 }))
 57577 return true;
 57578 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
 57579 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
 57580 unsigned SubSize = BC.getValueSizeInBits();
 57581 unsigned EltSize = BC.getScalarValueSizeInBits();
 57582 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
 57584 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
 57585 BC.getOperand(0).getValueSizeInBits() == VecSize &&
 57586 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
 57587 }
 57588 return AllConstants || AllSubs;
 57589 };
 57590
// Per-opcode concatenation folds. Each case widens the operation to the
// concatenated type only under subtarget/type conditions where the wide op
// is legal and profitable.
 57591 switch (Op0.getOpcode()) {
 57592 case ISD::VECTOR_SHUFFLE: {
 57593 if (NumOps == 2 && VT.is256BitVector() &&
 57594 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
 57595 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
 57596 int NumSubElts = Op0.getValueType().getVectorNumElements();
 57597 SmallVector<int> NewMask;
// Remap both halves' masks into the concatenated index space: indices into
// the second source shift up by NumSubElts, and the hi-half mask shifts up
// again to address the upper concatenated lanes.
 57598 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
 57599 M = M >= NumSubElts ? M + NumSubElts : M;
 57600 NewMask.push_back(M);
 57601 }
 57602 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
 57603 if (0 <= M)
 57604 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
 57605 NewMask.push_back(M);
 57606 }
 57607 return DAG.getVectorShuffle(VT, DL, ConcatSubOperand(VT, Ops, 0),
 57608 ConcatSubOperand(VT, Ops, 1), NewMask);
 57609 }
 57610 break;
 57611 }
 57612 case X86ISD::VBROADCAST: {
 57613 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
 57614 return Op.getOperand(0).getValueType().is128BitVector();
 57615 })) {
 57616 if (VT == MVT::v4f64 || VT == MVT::v4i64)
 57617 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
 57618 ConcatSubOperand(VT, Ops, 0),
 57619 ConcatSubOperand(VT, Ops, 0));
 57620 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
 57621 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
 57622 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
 57624 DL, VT, ConcatSubOperand(VT, Ops, 0),
 57625 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
 57626 }
 57627 break;
 57628 }
 57629 case X86ISD::MOVDDUP:
 57630 case X86ISD::MOVSHDUP:
 57631 case X86ISD::MOVSLDUP: {
 57632 if (!IsSplat)
 57633 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57634 ConcatSubOperand(VT, Ops, 0));
 57635 break;
 57636 }
 57637 case X86ISD::SHUFP: {
 57638 // Add SHUFPD support if/when necessary.
 57639 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
 57640 llvm::all_of(Ops, [Op0](SDValue Op) {
 57641 return Op.getOperand(2) == Op0.getOperand(2);
 57642 })) {
 57643 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57644 ConcatSubOperand(VT, Ops, 0),
 57645 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
 57646 }
 57647 break;
 57648 }
 57649 case X86ISD::UNPCKH:
 57650 case X86ISD::UNPCKL: {
 57651 // Don't concatenate build_vector patterns.
 57652 if (!IsSplat && EltSizeInBits >= 32 &&
 57653 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
 57654 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
 57655 none_of(Ops, [](SDValue Op) {
 57656 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
 57658 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
 57660 })) {
 57661 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57662 ConcatSubOperand(VT, Ops, 0),
 57663 ConcatSubOperand(VT, Ops, 1));
 57664 }
 57665 break;
 57666 }
 57667 case X86ISD::PSHUFHW:
 57668 case X86ISD::PSHUFLW:
 57669 case X86ISD::PSHUFD:
 57670 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
 57671 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
 57672 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57673 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
 57674 }
 57675 [[fallthrough]];
 57676 case X86ISD::VPERMILPI:
 57677 if (!IsSplat && EltSizeInBits == 32 &&
 57678 (VT.is256BitVector() ||
 57679 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
 57680 all_of(Ops, [&Op0](SDValue Op) {
 57681 return Op0.getOperand(1) == Op.getOperand(1);
 57682 })) {
 57683 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
 57684 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
 57685 Res =
 57686 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
 57687 return DAG.getBitcast(VT, Res);
 57688 }
 57689 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
// Merge the two 2-bit permute immediates into one 4-bit immediate.
 57690 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
 57691 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
 57692 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
 57693 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57694 ConcatSubOperand(VT, Ops, 0),
 57695 DAG.getTargetConstant(Idx, DL, MVT::i8));
 57696 }
 57697 break;
 57698 case X86ISD::PSHUFB:
 57699 case X86ISD::PSADBW:
 57700 case X86ISD::VPMADDUBSW:
 57701 case X86ISD::VPMADDWD:
 57702 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
 57703 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
 57704 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
 57705 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
 57706 NumOps * SrcVT.getVectorNumElements());
 57707 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57708 ConcatSubOperand(SrcVT, Ops, 0),
 57709 ConcatSubOperand(SrcVT, Ops, 1));
 57710 }
 57711 break;
 57712 case X86ISD::VPERMV:
 57713 if (!IsSplat && NumOps == 2 &&
 57714 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
 57715 MVT OpVT = Op0.getSimpleValueType();
 57716 int NumSrcElts = OpVT.getVectorNumElements();
 57717 SmallVector<int, 64> ConcatMask;
 57718 for (unsigned i = 0; i != NumOps; ++i) {
 57719 SmallVector<int, 64> SubMask;
 57721 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
 57722 break;
 57723 for (int M : SubMask) {
 57724 if (0 <= M)
 57725 M += i * NumSrcElts;
 57726 ConcatMask.push_back(M);
 57727 }
 57728 }
// Only fold if every sub-op's mask was successfully extracted.
 57729 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
 57730 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
 57731 Ops[1].getOperand(1), DAG, DL);
 57732 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
 57733 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
 57734 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
 57735 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
 57736 }
 57737 }
 57738 break;
 57739 case X86ISD::VPERMV3:
 57740 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
 57741 MVT OpVT = Op0.getSimpleValueType();
 57742 int NumSrcElts = OpVT.getVectorNumElements();
 57743 SmallVector<int, 64> ConcatMask;
 57744 for (unsigned i = 0; i != NumOps; ++i) {
 57745 SmallVector<int, 64> SubMask;
 57747 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
 57748 break;
 57749 for (int M : SubMask) {
 57750 if (0 <= M) {
 57751 int Src = M < NumSrcElts ? 0 : 2;
 57752 M += M < NumSrcElts ? 0 : NumSrcElts;
 57753
 57754 // Reference the lowest sub if the upper sub is the same.
 57755 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
 57756 M += i * NumSrcElts;
 57757 }
 57758 ConcatMask.push_back(M);
 57759 }
 57760 }
 57761 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
 57762 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
 57763 Ops[1].getOperand(0), DAG, DL);
 57764 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
 57765 Ops[1].getOperand(2), DAG, DL);
 57766 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
 57767 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
 57768 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
 57769 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
 57770 }
 57771 }
 57772 break;
 57773 case X86ISD::VPERM2X128: {
 57774 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
 57775 assert(NumOps == 2 && "Bad concat_vectors operands");
 57776 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
 57777 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
 57778 // TODO: Handle zero'd subvectors.
// Bit 3/7 set in a vperm2x128 immediate means "zero this half"; bail if
// either op zeroes a half.
 57779 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
 57780 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
 57781 (int)((Imm1 >> 4) & 0x3)};
 57782 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
 57783 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
 57784 Ops[0].getOperand(1), DAG, DL);
 57785 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
 57786 Ops[1].getOperand(1), DAG, DL);
 57787 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
 57788 DAG.getBitcast(ShuffleVT, LHS),
 57789 DAG.getBitcast(ShuffleVT, RHS),
 57790 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
 57791 return DAG.getBitcast(VT, Res);
 57792 }
 57793 }
 57794 break;
 57795 }
 57796 case X86ISD::SHUF128: {
 57797 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
 57798 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
 57799 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
 57800 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
 57801 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
 57802 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
 57803 Ops[0].getOperand(1), DAG, DL);
 57804 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
 57805 Ops[1].getOperand(1), DAG, DL);
 57806 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
 57807 DAG.getTargetConstant(Imm, DL, MVT::i8));
 57808 }
 57809 break;
 57810 }
 57811 case ISD::TRUNCATE:
 57812 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
 57813 EVT SrcVT = Ops[0].getOperand(0).getValueType();
 57814 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
 57815 SrcVT == Ops[1].getOperand(0).getValueType() &&
 57816 Subtarget.useAVX512Regs() &&
 57817 Subtarget.getPreferVectorWidth() >= 512 &&
 57818 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
 57819 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
 57820 return DAG.getNode(ISD::TRUNCATE, DL, VT,
 57821 ConcatSubOperand(NewSrcVT, Ops, 0));
 57822 }
 57823 }
 57824 break;
 57825 case ISD::ANY_EXTEND:
 57826 case ISD::SIGN_EXTEND:
 57827 case ISD::ZERO_EXTEND:
 57828 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
 57829 if (!IsSplat && NumOps == 2 &&
 57830 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
 57831 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
 57832 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
 57833 EVT SrcVT = Ops[0].getOperand(0).getValueType();
 57834 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
 57835 SrcVT == Ops[1].getOperand(0).getValueType()) {
 57836 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
 57837 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57838 ConcatSubOperand(NewSrcVT, Ops, 0));
 57839 }
 57840 }
 57841 break;
 57842 case X86ISD::VSHLI:
 57843 case X86ISD::VSRLI:
 57844 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
 57845 // TODO: Move this to LowerShiftByScalarImmediate?
 57846 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
 57847 llvm::all_of(Ops, [](SDValue Op) {
 57848 return Op.getConstantOperandAPInt(1) == 32;
 57849 })) {
 57850 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
 57851 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
 57852 if (Op0.getOpcode() == X86ISD::VSHLI) {
 57853 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
 57854 {8, 0, 8, 2, 8, 4, 8, 6});
 57855 } else {
 57856 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
 57857 {1, 8, 3, 8, 5, 8, 7, 8});
 57858 }
 57859 return DAG.getBitcast(VT, Res);
 57860 }
 57861 [[fallthrough]];
 57862 case X86ISD::VSRAI:
 57863 case X86ISD::VSHL:
 57864 case X86ISD::VSRL:
 57865 case X86ISD::VSRA:
 57866 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
 57867 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
 57868 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
 57869 llvm::all_of(Ops, [Op0](SDValue Op) {
 57870 return Op0.getOperand(1) == Op.getOperand(1);
 57871 })) {
 57872 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57873 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
 57874 }
 57875 break;
 57876 case X86ISD::VPERMI:
 57877 case X86ISD::VROTLI:
 57878 case X86ISD::VROTRI:
 57879 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
 57880 llvm::all_of(Ops, [Op0](SDValue Op) {
 57881 return Op0.getOperand(1) == Op.getOperand(1);
 57882 })) {
 57883 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57884 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
 57885 }
 57886 break;
 57887 case ISD::AND:
 57888 case ISD::OR:
 57889 case ISD::XOR:
 57890 case X86ISD::ANDNP:
 57891 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
 57892 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
 57893 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57894 ConcatSubOperand(VT, Ops, 0),
 57895 ConcatSubOperand(VT, Ops, 1));
 57896 }
 57897 break;
 57898 case X86ISD::PCMPEQ:
 57899 case X86ISD::PCMPGT:
 57900 if (!IsSplat && VT.is256BitVector() &&
 57901 (Subtarget.hasInt256() || VT == MVT::v8i32) &&
 57902 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
 57903 if (Subtarget.hasInt256())
 57904 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57905 ConcatSubOperand(VT, Ops, 0),
 57906 ConcatSubOperand(VT, Ops, 1));
 57907
 57908 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
 57909 // TODO: Handle v4f64 as well?
 57910 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
 57911 for (unsigned I = 0; I != NumOps; ++I) {
 57912 MaxSigBitsLHS =
 57913 std::max(MaxSigBitsLHS,
 57914 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
 57915 MaxSigBitsRHS =
 57916 std::max(MaxSigBitsRHS,
 57917 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
// Both sides already at full width; no point scanning further.
 57918 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
 57919 break;
 57920 }
 57921
 57922 ISD::CondCode ICC =
 57924 ISD::CondCode FCC =
 57926
 57927 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
 57928 MVT FpVT = VT.changeVectorElementType(FpSVT);
 57929
 57930 if (std::optional<unsigned> CastOpc =
 57931 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
 57932 SDValue LHS = ConcatSubOperand(VT, Ops, 0);
 57933 SDValue RHS = ConcatSubOperand(VT, Ops, 1);
 57934 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
 57935 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
 57936
 57937 bool IsAlwaysSignaling;
 57938 unsigned FSETCC =
 57939 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
 57940 return DAG.getBitcast(
 57941 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
 57942 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
 57943 }
 57944 }
 57945 break;
 57946 case ISD::CTPOP:
 57947 case ISD::CTTZ:
 57948 case ISD::CTLZ:
 57951 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
 57952 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
 57953 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57954 ConcatSubOperand(VT, Ops, 0));
 57955 }
 57956 break;
 57958 if (!IsSplat &&
 57959 (VT.is256BitVector() ||
 57960 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
 57961 llvm::all_of(Ops, [Op0](SDValue Op) {
 57962 return Op0.getOperand(2) == Op.getOperand(2);
 57963 })) {
 57964 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57965 ConcatSubOperand(VT, Ops, 0),
 57966 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
 57967 }
 57968 break;
 57969 case ISD::ADD:
 57970 case ISD::SUB:
 57971 case ISD::MUL:
 57972 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
 57973 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
 57974 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
 57975 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57976 ConcatSubOperand(VT, Ops, 0),
 57977 ConcatSubOperand(VT, Ops, 1));
 57978 }
 57979 break;
 57980 // Due to VADD, VSUB, VMUL can executed on more ports than VINSERT and
 57981 // their latency are short, so here we don't replace them unless we won't
 57982 // introduce extra VINSERT.
 57983 case ISD::FADD:
 57984 case ISD::FSUB:
 57985 case ISD::FMUL:
 57986 if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
 57987 (VT.is256BitVector() ||
 57988 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
 57989 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57990 ConcatSubOperand(VT, Ops, 0),
 57991 ConcatSubOperand(VT, Ops, 1));
 57992 }
 57993 break;
 57994 case ISD::FDIV:
 57995 if (!IsSplat && (VT.is256BitVector() ||
 57996 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
 57997 return DAG.getNode(Op0.getOpcode(), DL, VT,
 57998 ConcatSubOperand(VT, Ops, 0),
 57999 ConcatSubOperand(VT, Ops, 1));
 58000 }
 58001 break;
 58002 case X86ISD::HADD:
 58003 case X86ISD::HSUB:
 58004 case X86ISD::FHADD:
 58005 case X86ISD::FHSUB:
 58006 if (!IsSplat && VT.is256BitVector() &&
 58007 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
 58008 return DAG.getNode(Op0.getOpcode(), DL, VT,
 58009 ConcatSubOperand(VT, Ops, 0),
 58010 ConcatSubOperand(VT, Ops, 1));
 58011 }
 58012 break;
 58013 case X86ISD::PACKSS:
 58014 case X86ISD::PACKUS:
 58015 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
 58016 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
 58017 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
 58018 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
 58019 NumOps * SrcVT.getVectorNumElements());
 58020 return DAG.getNode(Op0.getOpcode(), DL, VT,
 58021 ConcatSubOperand(SrcVT, Ops, 0),
 58022 ConcatSubOperand(SrcVT, Ops, 1));
 58023 }
 58024 break;
 58025 case X86ISD::PALIGNR:
 58026 if (!IsSplat &&
 58027 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
 58028 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
 58029 llvm::all_of(Ops, [Op0](SDValue Op) {
 58030 return Op0.getOperand(2) == Op.getOperand(2);
 58031 })) {
 58032 return DAG.getNode(Op0.getOpcode(), DL, VT,
 58033 ConcatSubOperand(VT, Ops, 0),
 58034 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
 58035 }
 58036 break;
 58037 case X86ISD::BLENDI:
 58038 if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
 58039 uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
 58040 uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
 58041 // MVT::v16i16 has repeated blend mask.
 58042 if (Op0.getSimpleValueType() == MVT::v16i16) {
 58043 Mask0 = (Mask0 << 8) | Mask0;
 58044 Mask1 = (Mask1 << 8) | Mask1;
 58045 }
// Combine the two blend masks into one k-mask select over the wide type.
 58046 uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
 58048 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
 58049 SDValue Sel =
 58050 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
 58051 return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
 58052 ConcatSubOperand(VT, Ops, 0));
 58053 }
 58054 break;
 58055 case ISD::VSELECT:
 58056 if (!IsSplat && Subtarget.hasAVX512() &&
 58057 (VT.is256BitVector() ||
 58058 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
 58059 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
 58060 EVT SelVT = Ops[0].getOperand(0).getValueType();
 58061 if (SelVT.getVectorElementType() == MVT::i1) {
 58062 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
 58063 NumOps * SelVT.getVectorNumElements());
 58064 if (TLI.isTypeLegal(SelVT))
 58065 return DAG.getNode(Op0.getOpcode(), DL, VT,
 58066 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
 58067 ConcatSubOperand(VT, Ops, 1),
 58068 ConcatSubOperand(VT, Ops, 2));
 58069 }
 58070 }
 58071 [[fallthrough]];
 58072 case X86ISD::BLENDV:
 58073 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
 58074 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
 58075 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
 58076 EVT SelVT = Ops[0].getOperand(0).getValueType();
 58077 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
 58078 if (TLI.isTypeLegal(SelVT))
 58079 return DAG.getNode(Op0.getOpcode(), DL, VT,
 58080 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
 58081 ConcatSubOperand(VT, Ops, 1),
 58082 ConcatSubOperand(VT, Ops, 2));
 58083 }
 58084 break;
 58085 }
 58086 }
 58087
 58088 // Fold subvector loads into one.
 58089 // If needed, look through bitcasts to get to the load.
 58090 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
 58091 unsigned Fast;
 58092 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
 58093 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
 58094 *FirstLd->getMemOperand(), &Fast) &&
 58095 Fast) {
 58096 if (SDValue Ld =
 58097 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
 58098 return Ld;
 58099 }
 58100 }
 58101
 58102 // Attempt to fold target constant loads.
 58103 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
 58104 SmallVector<APInt> EltBits;
 58105 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
 58106 for (unsigned I = 0; I != NumOps; ++I) {
 58107 APInt OpUndefElts;
 58108 SmallVector<APInt> OpEltBits;
 58109 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
 58110 OpEltBits, /*AllowWholeUndefs*/ true,
 58111 /*AllowPartialUndefs*/ false))
 58112 break;
 58113 EltBits.append(OpEltBits);
 58114 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
 58115 }
 58116 if (EltBits.size() == VT.getVectorNumElements()) {
// Build one wide constant-pool load and redirect other users of Op0 to an
// extracted subvector of it.
 58117 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
 58118 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
 58119 SDValue CV = DAG.getConstantPool(C, PVT);
 58122 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
 58123 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
 58124 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
 58125 return Ld;
 58126 }
 58127 }
 58128
 58129 // If this simple subvector or scalar/subvector broadcast_load is inserted
 58130 // into both halves, use a larger broadcast_load. Update other uses to use
 58131 // an extracted subvector.
 58132 if (IsSplat &&
 58133 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
 58134 if (ISD::isNormalLoad(Op0.getNode()) ||
 58137 auto *Mem = cast<MemSDNode>(Op0);
 58138 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
 58141 if (SDValue BcastLd =
 58142 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
 58143 SDValue BcastSrc =
 58144 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
 58145 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
 58146 return BcastLd;
 58147 }
 58148 }
 58149 }
 58150
 58151 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
 58152 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
 58153 Subtarget.useAVX512Regs()) {
 58154 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
 58155 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
 58156 Res = DAG.getBitcast(ShuffleVT, Res);
 58157 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
 58158 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
 58159 return DAG.getBitcast(VT, Res);
 58160 }
 58161
 58162 return SDValue();
 58163}
58164
// Combine an ISD::CONCAT_VECTORS node. For vXi1 vectors, only attempt to
// constant-fold the concatenation into a single integer-constant bitcast;
// for all other element types, delegate to combineConcatVectorOps when AVX
// is available and both types are legal.
 58167                                        const X86Subtarget &Subtarget) {
 58168 EVT VT = N->getValueType(0);
 58169 EVT SrcVT = N->getOperand(0).getValueType();
 58170 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 58171 SmallVector<SDValue, 4> Ops(N->ops());
 58172
 58173 if (VT.getVectorElementType() == MVT::i1) {
 58174 // Attempt to constant fold.
 58175 unsigned SubSizeInBits = SrcVT.getSizeInBits();
// Accumulate each constant sub-op's bits at its bit offset; bail out of the
// loop at the first non-constant operand. The fold only fires if the final
// iteration (I == E - 1) is reached with all operands constant.
 58177 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
 58178 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
 58179 if (!C) break;
 58180 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
 58181 if (I == (E - 1)) {
 58182 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
 58183 if (TLI.isTypeLegal(IntVT))
 58184 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
 58185 }
 58186 }
 58187
 58188 // Don't do anything else for i1 vectors.
 58189 return SDValue();
 58190 }
 58191
 58192 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
 58193 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
 58194 DCI, Subtarget))
 58195 return R;
 58196 }
 58197
 58198 return SDValue();
 58199}
58200
// DAG combine for ISD::INSERT_SUBVECTOR. Folds inserts of undef/zero,
// collapses chained inserts/extracts, turns insert-of-extract into a
// shuffle, matches concat_vectors-style patterns, and widens broadcasts
// that are inserted into upper undef halves.
// NOTE(review): the opening lines of this function's signature are not
// visible in this chunk (fused lines 58201-58202 are elided) — presumably
// it is combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, DCI,
// Subtarget); confirm against the full file.
58203 const X86Subtarget &Subtarget) {
58204 if (DCI.isBeforeLegalizeOps())
58205 return SDValue();
58206
58207 MVT OpVT = N->getSimpleValueType(0);
58208
58209 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
58210
58211 SDLoc dl(N);
58212 SDValue Vec = N->getOperand(0);
58213 SDValue SubVec = N->getOperand(1);
58214
58215 uint64_t IdxVal = N->getConstantOperandVal(2);
58216 MVT SubVecVT = SubVec.getSimpleValueType();
58217
// Inserting undef into undef stays undef.
58218 if (Vec.isUndef() && SubVec.isUndef())
58219 return DAG.getUNDEF(OpVT);
58220
58221 // Inserting undefs/zeros into zeros/undefs is a zero vector.
58222 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
58223 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
58224 return getZeroVector(OpVT, Subtarget, DAG, dl);
58225
// NOTE(review): fused line 58226 and part of the condition at 58230 are
// elided here; the visible code implies the guard also checks that the
// outer insert is into a zero/undef vector — confirm in the full file.
58227 // If we're inserting into a zero vector and then into a larger zero vector,
58228 // just insert into the larger zero vector directly.
58229 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
58231 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
58232 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
58233 getZeroVector(OpVT, Subtarget, DAG, dl),
58234 SubVec.getOperand(1),
58235 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
58236 }
58237
58238 // If we're inserting into a zero vector and our input was extracted from an
58239 // insert into a zero vector of the same type and the extraction was at
58240 // least as large as the original insertion. Just insert the original
58241 // subvector into a zero vector.
// NOTE(review): fused line 58244 (the remainder of this condition) is
// elided from this chunk.
58242 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
58243 isNullConstant(SubVec.getOperand(1)) &&
58245 SDValue Ins = SubVec.getOperand(0);
58246 if (isNullConstant(Ins.getOperand(2)) &&
58247 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
58248 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
58249 SubVecVT.getFixedSizeInBits())
58250 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
58251 getZeroVector(OpVT, Subtarget, DAG, dl),
58252 Ins.getOperand(1), N->getOperand(2));
58253 }
58254 }
58255
58256 // Stop here if this is an i1 vector.
58257 if (IsI1Vector)
58258 return SDValue();
58259
58260 // Eliminate an intermediate vector widening:
58261 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
58262 // insert_subvector X, Y, Idx
58263 // TODO: This is a more general version of a DAGCombiner fold, can we move it
58264 // there?
58265 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
58266 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
58267 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
58268 SubVec.getOperand(1), N->getOperand(2));
58269
58270 // If this is an insert of an extract, combine to a shuffle. Don't do this
58271 // if the insert or extract can be represented with a subregister operation.
58272 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58273 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
58274 (IdxVal != 0 ||
58275 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
58276 int ExtIdxVal = SubVec.getConstantOperandVal(1);
// ExtIdxVal == 0 would be a plain subregister extract; leave it alone.
58277 if (ExtIdxVal != 0) {
58278 int VecNumElts = OpVT.getVectorNumElements();
58279 int SubVecNumElts = SubVecVT.getVectorNumElements();
58280 SmallVector<int, 64> Mask(VecNumElts);
58281 // First create an identity shuffle mask.
58282 for (int i = 0; i != VecNumElts; ++i)
58283 Mask[i] = i;
58284 // Now insert the extracted portion.
// Second-operand lanes are offset by VecNumElts in shuffle mask encoding.
58285 for (int i = 0; i != SubVecNumElts; ++i)
58286 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
58287
58288 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
58289 }
58290 }
58291
58292 // Match concat_vector style patterns.
58293 SmallVector<SDValue, 2> SubVectorOps;
58294 if (collectConcatOps(N, SubVectorOps, DAG)) {
58295 if (SDValue Fold =
58296 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
58297 return Fold;
58298
58299 // If we're inserting all zeros into the upper half, change this to
58300 // a concat with zero. We will match this to a move
58301 // with implicit upper bit zeroing during isel.
58302 // We do this here because we don't want combineConcatVectorOps to
58303 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
58304 if (SubVectorOps.size() == 2 &&
58305 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
58306 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
58307 getZeroVector(OpVT, Subtarget, DAG, dl),
58308 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
58309
58310 // Attempt to recursively combine to a shuffle.
// Only worthwhile when every concatenated operand is itself a target
// shuffle, so the recursive combiner has something to merge.
58311 if (all_of(SubVectorOps, [](SDValue SubOp) {
58312 return isTargetShuffle(SubOp.getOpcode());
58313 })) {
58314 SDValue Op(N, 0);
58315 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
58316 return Res;
58317 }
58318 }
58319
58320 // If this is a broadcast insert into an upper undef, use a larger broadcast.
58321 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
58322 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
58323
58324 // If this is a broadcast load inserted into an upper undef, use a larger
58325 // broadcast load.
58326 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
58327 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
58328 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
58329 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
58330 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
// NOTE(review): fused line 58332 (the getMemIntrinsicNode call that
// builds BcastLd from Tys/Ops) is elided from this chunk.
58331 SDValue BcastLd =
58333 MemIntr->getMemoryVT(),
58334 MemIntr->getMemOperand());
// Keep the original load's chain users ordered after the new load.
58335 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
58336 return BcastLd;
58337 }
58338
58339 // If we're splatting the lower half subvector of a full vector load into the
58340 // upper half, attempt to create a subvector broadcast.
58341 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
58342 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
58343 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
58344 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
// The subvector load must alias the low half of the full load exactly.
58345 if (VecLd && SubLd &&
58346 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
58347 SubVec.getValueSizeInBits() / 8, 0))
58348 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
58349 SubLd, 0, DAG);
58350 }
58351
58352 return SDValue();
58353}
58354
58355/// If we are extracting a subvector of a vector select and the select condition
58356/// is composed of concatenated vectors, try to narrow the select width. This
58357/// is a common pattern for AVX1 integer code because 256-bit selects may be
58358/// legal, but there is almost no integer math/logic available for 256-bit.
58359/// This function should only be called with legal types (otherwise, the calls
58360/// to get simple value types will assert).
// NOTE(review): the first line of this function's signature is not visible
// in this chunk (fused line 58361 is elided) — presumably it declares
// narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, ...); confirm
// against the full file.
58362 SelectionDAG &DAG) {
58363 SDValue Sel = Ext->getOperand(0);
// Only profitable if splitting the select condition is free (it is already
// built from concatenated pieces).
58364 if (Sel.getOpcode() != ISD::VSELECT ||
58365 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
58366 return SDValue();
58367
58368 // Note: We assume simple value types because this should only be called with
58369 // legal operations/types.
58370 // TODO: This can be extended to handle extraction to 256-bits.
58371 MVT VT = Ext->getSimpleValueType(0);
58372 if (!VT.is128BitVector())
58373 return SDValue();
58374
58375 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
58376 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
58377 return SDValue();
58378
58379 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
58380 MVT SelVT = Sel.getSimpleValueType();
58381 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
58382 "Unexpected vector type with legal operations");
58383
// Rescale the extraction index from WideVT's element granularity to
// SelVT's, since the extract may be through a bitcast of the select.
58384 unsigned SelElts = SelVT.getVectorNumElements();
58385 unsigned CastedElts = WideVT.getVectorNumElements();
58386 unsigned ExtIdx = Ext->getConstantOperandVal(1);
58387 if (SelElts % CastedElts == 0) {
58388 // The select has the same or more (narrower) elements than the extract
58389 // operand. The extraction index gets scaled by that factor.
58390 ExtIdx *= (SelElts / CastedElts);
58391 } else if (CastedElts % SelElts == 0) {
58392 // The select has less (wider) elements than the extract operand. Make sure
58393 // that the extraction index can be divided evenly.
58394 unsigned IndexDivisor = CastedElts / SelElts;
58395 if (ExtIdx % IndexDivisor != 0)
58396 return SDValue();
58397 ExtIdx /= IndexDivisor;
58398 } else {
58399 llvm_unreachable("Element count of simple vector types are not divisible?");
58400 }
58401
// Build the narrowed select from 128-bit extracts of cond/true/false and
// bitcast the result back to the requested extract type.
58402 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
58403 unsigned NarrowElts = SelElts / NarrowingFactor;
58404 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
58405 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
58406 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
58407 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
58408 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
58409 return DAG.getBitcast(VT, NarrowSel);
58410}
58411
// DAG combine for ISD::EXTRACT_SUBVECTOR. Tries a long list of folds:
// splitting AVX1 and+not patterns, narrowing vector selects, folding
// extracts of constants/build_vectors/inserts/broadcasts/shuffles, and
// narrowing single-use conversions, extensions, selects and target
// shuffles to the extracted width.
// NOTE(review): the opening lines of this function's signature are not
// visible in this chunk (fused lines 58412-58413 are elided) — presumably
// it is combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, DCI,
// Subtarget); confirm against the full file.
58414 const X86Subtarget &Subtarget) {
58415 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
58416 // eventually get combined/lowered into ANDNP) with a concatenated operand,
58417 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
58418 // We let generic combining take over from there to simplify the
58419 // insert/extract and 'not'.
58420 // This pattern emerges during AVX1 legalization. We handle it before lowering
58421 // to avoid complications like splitting constant vector loads.
58422
58423 // Capture the original wide type in the likely case that we need to bitcast
58424 // back to this type.
// Non-simple result types cannot be queried as MVT below; bail out early.
58425 if (!N->getValueType(0).isSimple())
58426 return SDValue();
58427
58428 MVT VT = N->getSimpleValueType(0);
58429 SDValue InVec = N->getOperand(0);
58430 unsigned IdxVal = N->getConstantOperandVal(1);
58431 SDValue InVecBC = peekThroughBitcasts(InVec);
58432 EVT InVecVT = InVec.getValueType();
58433 unsigned SizeInBits = VT.getSizeInBits();
58434 unsigned InSizeInBits = InVecVT.getSizeInBits();
58435 unsigned NumSubElts = VT.getVectorNumElements();
58436 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58437 SDLoc DL(N);
58438
// AVX1-only (no AVX2) split of a 256-bit AND whose operand is a NOT of a
// concatenation; splitting lets the halves fold to 128-bit ANDNP.
58439 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
58440 TLI.isTypeLegal(InVecVT) &&
58441 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
58442 auto isConcatenatedNot = [](SDValue V) {
58443 V = peekThroughBitcasts(V);
58444 if (!isBitwiseNot(V))
58445 return false;
58446 SDValue NotOp = V->getOperand(0);
// NOTE(review): fused line 58447 (the lambda's return expression,
// presumably a concat-ops check on NotOp) is elided from this chunk.
58448 };
58449 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
58450 isConcatenatedNot(InVecBC.getOperand(1))) {
58451 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
58452 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC));
58453 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
58454 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
58455 }
58456 }
58457
58458 if (DCI.isBeforeLegalizeOps())
58459 return SDValue();
58460
58461 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
58462 return V;
58463
// NOTE(review): fused line 58464 (presumably an all-zeros check on InVec
// guarding this zero-vector fold) is elided from this chunk.
58465 return getZeroVector(VT, Subtarget, DAG, DL);
58466
// Extracting from an all-ones vector yields all-ones of the narrow type.
58467 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
58468 if (VT.getScalarType() == MVT::i1)
58469 return DAG.getConstant(1, DL, VT);
58470 return getOnesVector(VT, DAG, DL);
58471 }
58472
// Extract of a build_vector is just a smaller build_vector slice.
58473 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
58474 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
58475
58476 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
58477 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58478 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
58479 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
58480 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
58481 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
58482 }
58483
58484 // If we are extracting from an insert into a larger vector, replace with a
58485 // smaller insert if we don't access less than the original subvector. Don't
58486 // do this for i1 vectors.
58487 // TODO: Relax the matching indices requirement?
58488 if (VT.getVectorElementType() != MVT::i1 &&
58489 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
58490 IdxVal == InVec.getConstantOperandVal(2) &&
58491 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
58492 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
58493 InVec.getOperand(0), N->getOperand(1));
58494 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
58495 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
58496 InVec.getOperand(1),
58497 DAG.getVectorIdxConstant(NewIdxVal, DL));
58498 }
58499
58500 // If we're extracting an upper subvector from a broadcast we should just
58501 // extract the lowest subvector instead which should allow
58502 // SimplifyDemandedVectorElts do more simplifications.
// NOTE(review): fused line 58504 (the rest of this condition, presumably
// covering VBROADCAST_LOAD alongside the splat check) is elided here.
58503 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
58505 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
58506 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
58507
58508 // Check if we're extracting a whole broadcasted subvector.
58509 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
58510 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
58511 EVT MemVT = MemIntr->getMemoryVT();
58512 if (MemVT == VT) {
58513 // Just use the lowest subvector.
58514 if (IdxVal != 0)
58515 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
58516 // If this is the only use, we can replace with a regular load (this may
58517 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
58518 // memory chain).
58519 if (InVec.hasOneUse()) {
58520 SDValue Ld =
58521 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
58522 MemIntr->getMemOperand());
58523 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
58524 return Ld;
58525 }
58526 }
58527 }
58528
58529 // Attempt to extract from the source of a shuffle vector.
58530 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
58531 SmallVector<int, 32> ShuffleMask;
58532 SmallVector<int, 32> ScaledMask;
58533 SmallVector<SDValue, 2> ShuffleInputs;
58534 unsigned NumSubVecs = InSizeInBits / SizeInBits;
58535 // Decode the shuffle mask and scale it so its shuffling subvectors.
58536 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
58537 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
58538 unsigned SubVecIdx = IdxVal / NumSubElts;
58539 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
58540 return DAG.getUNDEF(VT);
58541 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
58542 return getZeroVector(VT, Subtarget, DAG, DL);
58543 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
58544 if (Src.getValueSizeInBits() == InSizeInBits) {
58545 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
58546 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
58547 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
58548 DL, SizeInBits);
58549 }
58550 }
58551 }
58552
// Heuristic: an operand is "free" to extract from if narrowing it costs
// nothing (single-use load, constant build_vector, or undef).
// NOTE(review): fused lines 58555 and 58562 are elided, so this lambda is
// missing part of its one-use checks and one additional "free" case.
58553 auto IsExtractFree = [](SDValue V) {
58554 if (V.hasOneUse()) {
58556 if (V.getOpcode() == ISD::LOAD)
58557 return true;
58558 }
58559 V = peekThroughBitcasts(V);
58560 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
58561 return true;
58563 return true;
58564 return V.isUndef();
58565 };
58566
58567 // If we're extracting the lowest subvector and we're the only user,
58568 // we may be able to perform this with a smaller vector width.
58569 unsigned InOpcode = InVec.getOpcode();
58570 if (InVec.hasOneUse()) {
58571 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
58572 // v2f64 CVTDQ2PD(v4i32).
58573 if (InOpcode == ISD::SINT_TO_FP &&
58574 InVec.getOperand(0).getValueType() == MVT::v4i32) {
58575 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
58576 }
58577 // v2f64 CVTUDQ2PD(v4i32).
58578 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
58579 InVec.getOperand(0).getValueType() == MVT::v4i32) {
58580 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
58581 }
58582 // v2f64 CVTPS2PD(v4f32).
58583 if (InOpcode == ISD::FP_EXTEND &&
58584 InVec.getOperand(0).getValueType() == MVT::v4f32) {
58585 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
58586 }
58587 }
58588 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
58589 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
58590 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
58591 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
58592 Subtarget.hasVLX())) &&
58593 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
58594 SDValue Src = InVec.getOperand(0);
58595 if (Src.getValueType().getScalarSizeInBits() == 32)
58596 return DAG.getNode(InOpcode, DL, VT,
58597 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
58598 }
// Narrow a wide extend to an EXTEND_VECTOR_INREG of the extracted width.
58599 if (IdxVal == 0 &&
58600 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
58601 (SizeInBits == 128 || SizeInBits == 256) &&
58602 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
58603 SDValue Ext = InVec.getOperand(0);
58604 if (Ext.getValueSizeInBits() > SizeInBits)
58605 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
58606 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
58607 return DAG.getNode(ExtOp, DL, VT, Ext);
58608 }
// Narrow a 256-bit vselect by extracting the low 128 bits of each operand.
58609 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
58610 InVec.getOperand(0).getValueType().is256BitVector() &&
58611 InVec.getOperand(1).getValueType().is256BitVector() &&
58612 InVec.getOperand(2).getValueType().is256BitVector()) {
58613 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
58614 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
58615 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
58616 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
58617 }
// Narrow a truncate: keep the truncation ratio by scaling the source width.
58618 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
58619 (SizeInBits == 128 || SizeInBits == 256)) {
58620 SDValue InVecSrc = InVec.getOperand(0);
58621 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
58622 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
58623 return DAG.getNode(InOpcode, DL, VT, Ext);
58624 }
58625
// Narrow single-use target shuffles/compares when operands extract freely.
58626 if (SizeInBits == 128 || SizeInBits == 256) {
58627 switch (InOpcode) {
58628 case X86ISD::MOVDDUP:
58629 return DAG.getNode(
58630 InOpcode, DL, VT,
58631 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
58632 case X86ISD::PSHUFD:
58633 case X86ISD::VPERMILPI:
58634 if (InVec.getOperand(0).hasOneUse()) {
// Shift the per-lane immediate when the mask encodes 64-bit lanes.
58635 uint64_t M = InVec.getConstantOperandVal(1) & 255;
58636 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
58637 return DAG.getNode(InOpcode, DL, VT,
58638 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58639 DL, SizeInBits),
58640 DAG.getTargetConstant(M, DL, MVT::i8));
58641 }
58642 break;
58643 case X86ISD::PCMPEQ:
58644 case X86ISD::PCMPGT:
58645 case X86ISD::UNPCKH:
58646 case X86ISD::UNPCKL:
58647 if (IsExtractFree(InVec.getOperand(0)) ||
58648 IsExtractFree(InVec.getOperand(1)))
58649 return DAG.getNode(InOpcode, DL, VT,
58650 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58651 DL, SizeInBits),
58652 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
58653 DL, SizeInBits));
58654 break;
58655 case X86ISD::CMPP:
58656 if (IsExtractFree(InVec.getOperand(0)) ||
58657 IsExtractFree(InVec.getOperand(1)))
58658 return DAG.getNode(InOpcode, DL, VT,
58659 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58660 DL, SizeInBits),
58661 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
58662 DL, SizeInBits),
58663 InVec.getOperand(2));
58664 break;
58665 case X86ISD::BLENDI:
58666 if (IsExtractFree(InVec.getOperand(0)) ||
58667 IsExtractFree(InVec.getOperand(1))) {
// i16 blends use a per-128-bit-lane repeated mask; others need shifting.
58668 uint64_t M = InVec.getConstantOperandVal(2) & 255;
58669 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
58670 return DAG.getNode(InOpcode, DL, VT,
58671 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58672 DL, SizeInBits),
58673 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
58674 DL, SizeInBits),
58675 DAG.getTargetConstant(M, DL, MVT::i8));
58676 }
58677 break;
58678 case X86ISD::VPERMV3:
58679 if (IdxVal != 0) {
// Re-run the permute with just the demanded slice of the index mask
// (widened back to the source width), then take the low subvector.
58680 SDValue Src0 = InVec.getOperand(0);
58681 SDValue Mask = InVec.getOperand(1);
58682 SDValue Src1 = InVec.getOperand(2);
58683 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
58684 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
58685 DL, InSizeInBits);
58686 SDValue Shuffle =
58687 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
58688 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
58689 }
58690 break;
58691 }
58692 }
58693 }
58694
58695 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
58696 // as this is very likely to fold into a shuffle/truncation.
58697 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
58698 InVecVT.getScalarSizeInBits() == 64 &&
58699 InVec.getConstantOperandAPInt(1) == 32) {
58700 SDValue Ext =
58701 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
58702 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
58703 }
58704
58705 return SDValue();
58706}
58707
// DAG combine for ISD::SCALAR_TO_VECTOR. Strips redundant masking for
// v1i1, shrinks v2i64/v2f64 inserts of (zero/any-)extended 32-bit values,
// folds bitcast-of-f64/mmx sources to MOVQ forms, reuses existing
// broadcasts of the same scalar, and re-vectorizes scalarized shifts.
// NOTE(review): the opening line of this function's signature is not
// visible in this chunk (fused line 58708 is elided) — presumably it is
// combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, Subtarget);
// confirm against the full file.
58709 const X86Subtarget &Subtarget) {
58710 using namespace SDPatternMatch;
58711 EVT VT = N->getValueType(0);
58712 SDValue Src = N->getOperand(0);
58713 SDLoc DL(N);
58714
58715 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
58716 // This occurs frequently in our masked scalar intrinsic code and our
58717 // floating point select lowering with AVX512.
58718 // TODO: SimplifyDemandedBits instead?
58719 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
58720 isOneConstant(Src.getOperand(1)))
58721 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
58722
58723 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
58724 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
58725 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
58726 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
58727 isNullConstant(Src.getOperand(1)))
58728 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
58729 Src.getOperand(1));
58730
58731 // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
58732 // TODO: Move to DAGCombine/SimplifyDemandedBits?
58733 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
// Helper: recognize an i64 value that is really a (zero- or any-)extended
// <=32-bit value, returning the narrow source (or the op itself for
// extending loads / values with known-zero upper bits).
58734 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
58735 if (Op.getValueType() != MVT::i64)
58736 return SDValue();
58737 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
58738 if (Op.getOpcode() == Opc &&
58739 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
58740 return Op.getOperand(0);
58741 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
58742 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
58743 if (Ld->getExtensionType() == Ext &&
58744 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
58745 return Op;
58746 if (IsZeroExt) {
58747 KnownBits Known = DAG.computeKnownBits(Op);
58748 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
58749 return Op;
58750 }
58751 return SDValue();
58752 };
58753
// Any-extended source: the upper 32 bits are don't-care, insert as v4i32.
58754 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
58755 return DAG.getBitcast(
58756 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
58757 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
58758
// Zero-extended source: use VZEXT_MOVL to guarantee zeroed upper elements.
58759 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
58760 return DAG.getBitcast(
58761 VT,
58762 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
58763 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
58764 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
58765 }
58766
58767 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST) {
58768 SDValue SrcOp = Src.getOperand(0);
58769 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
58770 if (SrcOp.getValueType() == MVT::f64)
58771 return DAG.getBitcast(
58772 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
58773 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
58774 if (SrcOp.getValueType() == MVT::x86mmx)
58775 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
58776 }
58777
58778 if (VT == MVT::v4i32) {
58779 SDValue HalfSrc;
58780 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
58781 // to remove XMM->GPR->XMM moves.
58782 if (sd_match(Src, m_AnyExt(m_BitCast(
58783 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
58784 return DAG.getBitcast(
58785 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
58786 }
58787
58788 // See if we're broadcasting the scalar value, in which case just reuse that.
58789 // Ensure the same SDValue from the SDNode use is being used.
58790 if (VT.getScalarType() == Src.getValueType())
58791 for (SDNode *User : Src->users())
58792 if (User->getOpcode() == X86ISD::VBROADCAST &&
58793 Src == User->getOperand(0)) {
58794 unsigned SizeInBits = VT.getFixedSizeInBits();
58795 unsigned BroadcastSizeInBits =
58796 User->getValueSizeInBits(0).getFixedValue();
// Reuse the broadcast directly if it is the same width, or extract its
// low subvector if the broadcast is wider than we need.
58797 if (BroadcastSizeInBits == SizeInBits)
58798 return SDValue(User, 0);
58799 if (BroadcastSizeInBits > SizeInBits)
58800 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
58801 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
58802 // coverage.
58803 }
58804
58805 // Check for cases where we've ended up with a scalarized shift, typically
58806 // during type legalization.
58807 switch (Src.getOpcode()) {
58808 case ISD::SHL:
58809 case ISD::SRL:
58810 case ISD::SRA:
// Rebuild a constant-amount scalar shift as a vector shift-by-immediate.
58811 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
58812 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
58813 Src.hasOneUse()) {
58814 SDValue SrcVec =
58815 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
58816 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
58817 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
58818 Amt->getZExtValue(), DAG);
58819 }
58820 }
58821 break;
58822 case ISD::FSHL:
58823 case ISD::FSHR:
// Similarly re-vectorize a scalarized funnel shift when both inputs came
// from vector element extracts; the amount is reduced mod the bit width.
58824 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
58825 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
58826 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
58827 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
58828 Src.hasOneUse()) {
58829 uint64_t AmtVal =
58830 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
58831 SDValue SrcVec0 =
58832 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
58833 SDValue SrcVec1 =
58834 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
58835 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
58836 DAG.getConstant(AmtVal, DL, VT));
58837 }
58838 }
58839 break;
58840 }
58841
58842 return SDValue();
58843}
58844
58845// Simplify PMULDQ and PMULUDQ operations.
// Canonicalizes constants to the RHS, folds multiply-by-zero, simplifies
// demanded bits (these ops only read the low 32 bits of each 64-bit
// element), and manually converts *_EXTEND_VECTOR_INREG inputs into
// shuffles so further shuffle combining can fire.
// NOTE(review): the opening lines of this function's signature are not
// visible in this chunk (fused lines 58846-58847 are elided) — presumably
// it is combinePMULDQ(SDNode *N, SelectionDAG &DAG, DCI, Subtarget);
// confirm against the full file.
58848 const X86Subtarget &Subtarget) {
58849 SDValue LHS = N->getOperand(0);
58850 SDValue RHS = N->getOperand(1);
58851
58852 // Canonicalize constant to RHS.
// NOTE(review): fused lines 58853-58854 (the constant-operand checks
// guarding this swap) are elided from this chunk.
58855 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
58856
58857 // Multiply by zero.
58858 // Don't return RHS as it may contain UNDEFs.
58859 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
58860 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
58861
58862 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
58863 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58864 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
58865 return SDValue(N, 0);
58866
58867 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
58868 // convert it to any_extend_invec, due to the LegalOperations check, do the
58869 // conversion directly to a vector shuffle manually. This exposes combine
58870 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
58871 // combineX86ShufflesRecursively on SSE4.1 targets.
58872 // FIXME: This is basically a hack around several other issues related to
58873 // ANY_EXTEND_VECTOR_INREG.
58874 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
58875 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
58876 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
58877 LHS.getOperand(0).getValueType() == MVT::v4i32) {
58878 SDLoc dl(N);
// {0,-1,1,-1}: keep elements 0 and 1 in the even lanes; odd lanes are
// undef since only the low 32 bits of each i64 element are consumed.
58879 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
58880 LHS.getOperand(0), { 0, -1, 1, -1 });
58881 LHS = DAG.getBitcast(MVT::v2i64, LHS);
58882 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58883 }
// Mirror of the LHS case above, applied to the RHS operand.
58884 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
58885 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
58886 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
58887 RHS.getOperand(0).getValueType() == MVT::v4i32) {
58888 SDLoc dl(N);
58889 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
58890 RHS.getOperand(0), { 0, -1, 1, -1 });
58891 RHS = DAG.getBitcast(MVT::v2i64, RHS);
58892 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58893 }
58894
58895 return SDValue();
58896}
58897
58898// Simplify VPMADDUBSW/VPMADDWD operations.
// Folds multiply-by-zero, constant-folds the pairwise multiply-accumulate
// (signed*signed with wrapping add for VPMADDWD; unsigned*signed with
// saturating add for VPMADDUBSW), and simplifies demanded vector elements.
// NOTE(review): the opening lines of this function's signature are not
// visible in this chunk (fused lines 58899-58900 are elided) — presumably
// it is combineVPMADD(SDNode *N, SelectionDAG &DAG, DCI); confirm against
// the full file.
58901 MVT VT = N->getSimpleValueType(0);
58902 SDValue LHS = N->getOperand(0);
58903 SDValue RHS = N->getOperand(1);
58904 unsigned Opc = N->getOpcode();
58905 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
58906 assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) &&
58907 "Unexpected PMADD opcode");
58908
58909 // Multiply by zero.
58910 // Don't return LHS/RHS as it may contain UNDEFs.
// NOTE(review): fused line 58912 (the matching all-zeros check on RHS) is
// elided from this chunk.
58911 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
58913 return DAG.getConstant(0, SDLoc(N), VT);
58914
58915 // Constant folding.
58916 APInt LHSUndefs, RHSUndefs;
58917 SmallVector<APInt> LHSBits, RHSBits;
58918 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
58919 unsigned DstEltBits = VT.getScalarSizeInBits();
58920 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
58921 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
58922 SmallVector<APInt> Result;
// Each result element combines a source element pair: LHS elements are
// sign-extended for VPMADDWD, zero-extended for VPMADDUBSW; RHS elements
// are always sign-extended; the pair sum wraps (PMADDWD) or signed-
// saturates (PMADDUBSW).
58923 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
58924 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
58925 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
58926 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
58927 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
58928 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
58929 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
58930 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
58931 Result.push_back(Res);
58932 }
58933 return getConstVector(Result, VT, DAG, SDLoc(N));
58934 }
58935
58936 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58937 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
58938 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
58939 return SDValue(N, 0);
58940
58941 return SDValue();
58942}
58943
// Combine ANY/SIGN/ZERO_EXTEND_VECTOR_INREG nodes: merge a one-use simple
// vector load into an extending load, collapse nested extends, rewrite a
// zext-in-reg of a BUILD_VECTOR as a zero-interleaved BUILD_VECTOR, and
// finally attempt to resolve the node as a target shuffle on SSE4.1+.
58946                                           const X86Subtarget &Subtarget) {
58947  EVT VT = N->getValueType(0);
58948  SDValue In = N->getOperand(0);
58949  unsigned Opcode = N->getOpcode();
58950  unsigned InOpcode = In.getOpcode();
58951  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58952  SDLoc DL(N);
58953
58954  // Try to merge vector loads and extend_inreg to an extload.
58955  if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
58956      In.hasOneUse()) {
58957    auto *Ld = cast<LoadSDNode>(In);
    // Only simple (non-volatile, non-atomic) loads may be replaced.
58958    if (Ld->isSimple()) {
58959      MVT SVT = In.getSimpleValueType().getVectorElementType();
58962                           : ISD::ZEXTLOAD;
58963      EVT MemVT = VT.changeVectorElementType(SVT);
58964      if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
58965        SDValue Load = DAG.getExtLoad(
58966            Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
58967            MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
        // Re-wire the old load's chain users to the new extload's chain.
58968        DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
58969        return Load;
58970      }
58971    }
58972  }
58973
58974  // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
58975  if (Opcode == InOpcode)
58976    return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
58977
58978  // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
58979  // -> EXTEND_VECTOR_INREG(X).
58980  // TODO: Handle non-zero subvector indices.
58981  if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
58982      In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
58983      In.getOperand(0).getOperand(0).getValueSizeInBits() ==
58984          In.getValueSizeInBits())
58985    return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
58986
58987  // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
58988  // TODO: Move to DAGCombine?
58989  if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
58990      In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
58991      In.getValueSizeInBits() == VT.getSizeInBits()) {
58992    unsigned NumElts = VT.getVectorNumElements();
58993    unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
58994    EVT EltVT = In.getOperand(0).getValueType();
    // Place each source element at a Scale stride, padding with zeros.
58995    SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
58996    for (unsigned I = 0; I != NumElts; ++I)
58997      Elts[I * Scale] = In.getOperand(I);
58998    return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
58999  }
59000
59001  // Attempt to combine as a shuffle on SSE41+ targets.
59002  if (Subtarget.hasSSE41()) {
59003    SDValue Op(N, 0);
59004    if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
59005      if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59006        return Res;
59007  }
59008
59009  return SDValue();
59010}
59011
// Combine X86ISD::KSHIFTL/KSHIFTR (mask register shifts): a shift of an
// all-zeros vector is zero; adjacent KSHIFTR amounts (including one fed by
// an EXTRACT_SUBVECTOR offset) can be merged; finally simplify demanded
// vector elements.
59014  EVT VT = N->getValueType(0);
59015  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59016  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
59017    return DAG.getConstant(0, SDLoc(N), VT);
59018
59019  // Fold kshiftr(extract_subvector(X,C1),C2)
59020  //  --> extract_subvector(kshiftr(X,C1+C2),0)
59021  // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
59022  if (N->getOpcode() == X86ISD::KSHIFTR) {
59023    SDLoc DL(N);
59024    if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
59025        N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
59026      SDValue Src = N->getOperand(0).getOperand(0);
      // Combined shift amount = outer shift + inner offset/shift.
59027      uint64_t Amt = N->getConstantOperandVal(1) +
59028                     N->getOperand(0).getConstantOperandVal(1);
59029      EVT SrcVT = Src.getValueType();
59030      if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
59031        SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
59032                                    DAG.getTargetConstant(Amt, DL, MVT::i8));
59033        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
59034                           DAG.getVectorIdxConstant(0, DL));
59035      }
59036    }
59037  }
59038
59039  APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
59040  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
59041    return SDValue(N, 0);
59042
59043  return SDValue();
59044}
59045
59046// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
59047// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
59048// extra instructions between the conversion due to going to scalar and back.
// Requires F16C and only fires for an f32 -> f16 -> f32 round trip.
59050                                  const X86Subtarget &Subtarget) {
59051  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
59052    return SDValue();
59053
59054  if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
59055    return SDValue();
59056
59057  if (N->getValueType(0) != MVT::f32 ||
59058      N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
59059    return SDValue();
59060
  // Build the scalar value into a vector, round-trip through the packed
  // f16 conversion instructions, and extract element 0 back out.
59061  SDLoc dl(N);
59062  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
59063                            N->getOperand(0).getOperand(0));
  // Rounding-control immediate 4 selects the current MXCSR rounding mode.
59064  Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
59065                    DAG.getTargetConstant(4, dl, MVT::i32));
59066  Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
59067  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
59068                     DAG.getVectorIdxConstant(0, dl));
59069}
59070
// Combine FP_EXTEND / STRICT_FP_EXTEND: lower bf16 extends by shifting the
// bf16 bits into the top half of an i32 lane, and on F16C-only targets
// (no AVX512-FP16) widen vXf16 -> vXf32/f64 extends into CVTPH2PS.
59073                                const X86Subtarget &Subtarget) {
59074  EVT VT = N->getValueType(0);
59075  bool IsStrict = N->isStrictFPOpcode();
  // For strict nodes, operand 0 is the chain and operand 1 is the source.
59076  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59077  EVT SrcVT = Src.getValueType();
59078
59079  SDLoc dl(N);
59080  if (SrcVT.getScalarType() == MVT::bf16) {
    // fpext(fpround(x)) -> x when types round-trip exactly.
59081    if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
59082        !IsStrict && Src.getOperand(0).getValueType() == VT)
59083      return Src.getOperand(0);
59084
59085    if (!SrcVT.isVector())
59086      return SDValue();
59087
59088    assert(!IsStrict && "Strict FP doesn't support BF16");
    // bf16 -> f64 goes via an intermediate f32 extend.
59089    if (VT.getVectorElementType() == MVT::f64) {
59090      EVT TmpVT = VT.changeVectorElementType(MVT::f32);
59091      return DAG.getNode(ISD::FP_EXTEND, dl, VT,
59092                         DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
59093    }
59094    assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
    // bf16 is the top 16 bits of an f32: zext to i32 and shift left 16.
59095    EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
59096    Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
59097    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
59098    Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
59099    return DAG.getBitcast(VT, Src);
59100  }
59101
59102  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
59103    return SDValue();
59104
  // With full FP16 support the generic lowering is already good.
59105  if (Subtarget.hasFP16())
59106    return SDValue();
59107
59108  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
59109    return SDValue();
59110
59111  if (VT.getVectorElementType() != MVT::f32 &&
59112      VT.getVectorElementType() != MVT::f64)
59113    return SDValue();
59114
59115  unsigned NumElts = VT.getVectorNumElements();
59116  if (NumElts == 1 || !isPowerOf2_32(NumElts))
59117    return SDValue();
59118
59119  // Convert the input to vXi16.
59120  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
59121  Src = DAG.getBitcast(IntVT, Src);
59122
59123  // Widen to at least 8 input elements.
59124  if (NumElts < 8) {
59125    unsigned NumConcats = 8 / NumElts;
59126    SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
59127                                : DAG.getConstant(0, dl, IntVT);
59128    SmallVector<SDValue, 4> Ops(NumConcats, Fill);
59129    Ops[0] = Src;
59130    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
59131  }
59132
59133  // Destination is vXf32 with at least 4 elements.
59134  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
59135                               std::max(4U, NumElts));
59136  SDValue Cvt, Chain;
59137  if (IsStrict) {
59138    Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
59139                      {N->getOperand(0), Src});
59140    Chain = Cvt.getValue(1);
59141  } else {
59142    Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
59143  }
59144
  // Narrow back down if we widened a 2-element source.
59145  if (NumElts < 4) {
59146    assert(NumElts == 2 && "Unexpected size");
59147    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
59148                      DAG.getVectorIdxConstant(0, dl));
59149  }
59150
59151  if (IsStrict) {
59152    // Extend to the original VT if necessary.
59153    if (Cvt.getValueType() != VT) {
59154      Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
59155                        {Chain, Cvt});
59156      Chain = Cvt.getValue(1);
59157    }
59158    return DAG.getMergeValues({Cvt, Chain}, dl);
59159  }
59160
59161  // Extend to the original VT if necessary.
59162  return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
59163}
59164
59165// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
59166// from. Limit this to cases where the loads have the same input chain and the
59167// output chains are unused. This avoids any memory ordering issues.
59170  assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
59171          N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
59172         "Unknown broadcast load type");
59173
59174  // Only do this if the chain result is unused.
59175  if (N->hasAnyUseOfValue(1))
59176    return SDValue();
59177
59178  auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
59179
59180  SDValue Ptr = MemIntrin->getBasePtr();
59181  SDValue Chain = MemIntrin->getChain();
59182  EVT VT = N->getSimpleValueType(0);
59183  EVT MemVT = MemIntrin->getMemoryVT();
59184
59185  // Look at other users of our base pointer and try to find a wider broadcast.
59186  // The input chain and the size of the memory VT must match.
59187  for (SDNode *User : Ptr->users())
59188    if (User != N && User->getOpcode() == N->getOpcode() &&
59189        cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
59190        cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
59191        cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
59192            MemVT.getSizeInBits() &&
59193        !User->hasAnyUseOfValue(1) &&
59194        User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
      // Reuse the wider broadcast: take our VT-sized leading subvector and
      // replace both our value and chain results.
59195      SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
59196                                         VT.getSizeInBits());
59197      Extract = DAG.getBitcast(VT, Extract);
59198      return DCI.CombineTo(N, Extract, SDValue(User, 1));
59199    }
59200
59201  return SDValue();
59202}
59203
// Combine FP_ROUND / STRICT_FP_ROUND of vXf32 -> vXf16: on AVX512-FP16
// targets fold concat(xint_to_fp(v4i64)) pairs into CVT[SU]I2P + shuffle;
// otherwise on F16C targets lower the round to CVTPS2PH.
59205                                const X86Subtarget &Subtarget) {
59206  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
59207    return SDValue();
59208
59209  bool IsStrict = N->isStrictFPOpcode();
59210  EVT VT = N->getValueType(0);
  // For strict nodes, operand 0 is the chain and operand 1 is the source.
59211  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59212  EVT SrcVT = Src.getValueType();
59213
59214  if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
59215      SrcVT.getVectorElementType() != MVT::f32)
59216    return SDValue();
59217
59218  SDLoc dl(N);
59219
59220  SDValue Cvt, Chain;
59221  unsigned NumElts = VT.getVectorNumElements();
59222  if (Subtarget.hasFP16()) {
59223    // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
59224    //                                        v4f32 (xint_to_fp v4i64))))
59225    // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
59226    //                            v8f16 (CVTXI2P v4i64)))
59227    if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
59228        Src.getNumOperands() == 2) {
59229      SDValue Cvt0, Cvt1;
59230      SDValue Op0 = Src.getOperand(0);
59231      SDValue Op1 = Src.getOperand(1);
59232      bool IsOp0Strict = Op0->isStrictFPOpcode();
      // Both halves must be the same conversion from v4i64.
59233      if (Op0.getOpcode() != Op1.getOpcode() ||
59234          Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
59235          Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
59236        return SDValue();
59237      }
      // Each CVTXI2P yields its 4 results in the low half of a v8f16;
      // interleave the two low halves into the final v8f16.
59238      int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
59239      if (IsStrict) {
59240        assert(IsOp0Strict && "Op0 must be strict node");
59241        unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
59244        Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
59245                           {Op0.getOperand(0), Op0.getOperand(1)});
59246        Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
59247                           {Op1.getOperand(0), Op1.getOperand(1)});
59248        Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
59249        return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
59250      }
59251      unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
59253      Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
59254      Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
59255      return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
59256    }
59257    return SDValue();
59258  }
59259
59260  if (NumElts == 1 || !isPowerOf2_32(NumElts))
59261    return SDValue();
59262
59263  // Widen to at least 4 input elements.
59264  if (NumElts < 4)
59265    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
59266                      DAG.getConstantFP(0.0, dl, SrcVT));
59267
59268  // Destination is v8i16 with at least 8 elements.
59269  EVT CvtVT =
59270      EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
  // Rounding-control immediate 4 selects the current MXCSR rounding mode.
59271  SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
59272  if (IsStrict) {
59273    Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
59274                      {N->getOperand(0), Src, Rnd});
59275    Chain = Cvt.getValue(1);
59276  } else {
59277    Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
59278  }
59279
59280  // Extract down to real number of elements.
59281  if (NumElts < 8) {
59283    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
59284                      DAG.getVectorIdxConstant(0, dl));
59285  }
59286
59287  Cvt = DAG.getBitcast(VT, Cvt);
59288
59289  if (IsStrict)
59290    return DAG.getMergeValues({Cvt, Chain}, dl);
59291
59292  return Cvt;
59293}
59294
// Combine X86ISD::MOVDQ2Q: fold a one-use simple vector load feeding the
// node into a direct x86mmx load, rewiring the old load's chain users.
59296  SDValue Src = N->getOperand(0);
59297
59298  // Turn MOVDQ2Q+simple_load into an mmx load.
59299  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
59300    LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
59301
    // Only simple (non-volatile, non-atomic) loads may be replaced.
59302    if (LN->isSimple()) {
59303      SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
59304                                  LN->getBasePtr(),
59305                                  LN->getPointerInfo(),
59306                                  LN->getOriginalAlign(),
59307                                  LN->getMemOperand()->getFlags());
59308      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
59309      return NewLd;
59310    }
59311  }
59312
59313  return SDValue();
59314}
59315
// Combine X86ISD::PDEP by simplifying demanded bits of the result.
59318  unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
59319  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59320  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
59321    return SDValue(N, 0);
59322
59323  return SDValue();
59324}
59325
59326// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
59327// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
59328// use x86mmx instead.
// Returns the rebuilt node (with v1i64 operands/results bitcast to x86mmx,
// and the primary result bitcast back to v1i64 for existing users), or an
// empty SDValue if nothing needed changing.
59330  SDLoc dl(N);
59331
59332  bool MadeChange = false, CastReturnVal = false;
59334  for (const SDValue &Arg : N->op_values()) {
59335    if (Arg.getValueType() == MVT::v1i64) {
59336      MadeChange = true;
59337      Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
59338    } else
59339      Args.push_back(Arg);
59340  }
  // Rewrite the first result type as well, if it is v1i64.
59341  SDVTList VTs = N->getVTList();
59342  SDVTList NewVTs = VTs;
59343  if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
59344    SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
59345    NewVTArr[0] = MVT::x86mmx;
59346    NewVTs = DAG.getVTList(NewVTArr);
59347    MadeChange = true;
59348    CastReturnVal = true;
59349  }
59350
59351  if (MadeChange) {
59352    SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
59353    if (CastReturnVal) {
59355      for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
59356        Returns.push_back(Result.getValue(i));
      // Existing users still expect v1i64; bitcast the primary result back.
59357      Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
59358      return DAG.getMergeValues(Returns, dl);
59359    }
59360    return Result;
59361  }
59362  return SDValue();
59363}
// Combine INTRINSIC_WO_CHAIN: before legalization, fix up v1i64-typed MMX
// cast intrinsics to use x86mmx (see FixupMMXIntrinsicTypes).
59366  if (!DCI.isBeforeLegalize())
59367    return SDValue();
59368
  // Operand 0 holds the intrinsic ID for chainless intrinsics.
59369  unsigned IntNo = N->getConstantOperandVal(0);
59370  const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
59371
59372  if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59373    return FixupMMXIntrinsicTypes(N, DAG);
59374
59375  return SDValue();
59376}
59377
// Combine INTRINSIC_W_CHAIN: before legalization, fix up v1i64-typed MMX
// cast intrinsics to use x86mmx (see FixupMMXIntrinsicTypes).
59380  if (!DCI.isBeforeLegalize())
59381    return SDValue();
59382
  // Operand 0 is the chain; operand 1 holds the intrinsic ID.
59383  unsigned IntNo = N->getConstantOperandVal(1);
59384  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
59385
59386  if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59387    return FixupMMXIntrinsicTypes(N, DAG);
59388
59389  return SDValue();
59390}
59391
// Combine INTRINSIC_VOID: before legalization, fix up v1i64-typed MMX
// cast intrinsics to use x86mmx (see FixupMMXIntrinsicTypes).
59394  if (!DCI.isBeforeLegalize())
59395    return SDValue();
59396
  // Operand 0 is the chain; operand 1 holds the intrinsic ID.
59397  unsigned IntNo = N->getConstantOperandVal(1);
59398  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
59399
59400  if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59401    return FixupMMXIntrinsicTypes(N, DAG);
59402
59403  return SDValue();
59404}
59405
// Central DAG-combine dispatch for the X86 backend: routes each (generic or
// X86-specific) opcode to its dedicated combine* helper, returning the
// replacement value or an empty SDValue if no combine applied.
59407                                              DAGCombinerInfo &DCI) const {
59408  SelectionDAG &DAG = DCI.DAG;
59409  switch (N->getOpcode()) {
59410  // clang-format off
59411  default: break;
59413    return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
59415  case X86ISD::PEXTRW:
59416  case X86ISD::PEXTRB:
59417    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
59419    return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
59421    return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
59423    return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
59424  case ISD::VSELECT:
59425  case ISD::SELECT:
59426  case X86ISD::BLENDV:      return combineSelect(N, DAG, DCI, Subtarget);
59427  case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
59428  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
59429  case X86ISD::CMP:         return combineCMP(N, DAG, DCI, Subtarget);
59430  case ISD::ADD:            return combineAdd(N, DAG, DCI, Subtarget);
59431  case ISD::SUB:            return combineSub(N, DAG, DCI, Subtarget);
59432  case X86ISD::ADD:
59433  case X86ISD::SUB:         return combineX86AddSub(N, DAG, DCI, Subtarget);
59434  case X86ISD::CLOAD:
59435  case X86ISD::CSTORE:      return combineX86CloadCstore(N, DAG);
59436  case X86ISD::SBB:         return combineSBB(N, DAG);
59437  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
59438  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
59439  case ISD::SHL:            return combineShiftLeft(N, DAG, Subtarget);
59440  case ISD::SRA:            return combineShiftRightArithmetic(N, DAG, Subtarget);
59441  case ISD::SRL:            return combineShiftRightLogical(N, DAG, DCI, Subtarget);
59442  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
59443  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
59444  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
59445  case ISD::BITREVERSE:     return combineBITREVERSE(N, DAG, DCI, Subtarget);
59446  case ISD::AVGCEILS:
59447  case ISD::AVGCEILU:
59448  case ISD::AVGFLOORS:
59449  case ISD::AVGFLOORU:      return combineAVG(N, DAG, DCI, Subtarget);
59450  case X86ISD::BEXTR:
59451  case X86ISD::BEXTRI:      return combineBEXTR(N, DAG, DCI, Subtarget);
59452  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
59453  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
59454  case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
59455  case ISD::MSTORE:         return combineMaskedStore(N, DAG, DCI, Subtarget);
59457    return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
59458  case ISD::SINT_TO_FP:
59460    return combineSIntToFP(N, DAG, DCI, Subtarget);
59461  case ISD::UINT_TO_FP:
59463    return combineUIntToFP(N, DAG, Subtarget);
59464  case ISD::LRINT:
59465  case ISD::LLRINT:         return combineLRINT_LLRINT(N, DAG, Subtarget);
59466  case ISD::FADD:
59467  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
59468  case X86ISD::VFCMULC:
59469  case X86ISD::VFMULC:      return combineFMulcFCMulc(N, DAG, Subtarget);
59470  case ISD::FNEG:           return combineFneg(N, DAG, DCI, Subtarget);
59471  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
59472  case X86ISD::VTRUNC:      return combineVTRUNC(N, DAG, DCI);
59473  case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
59474  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
59475  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
59476  case X86ISD::FXOR:
59477  case X86ISD::FOR:         return combineFOr(N, DAG, DCI, Subtarget);
59478  case X86ISD::FMIN:
59479  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
59480  case ISD::FMINNUM:
59481  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
59482  case X86ISD::CVTSI2P:
59483  case X86ISD::CVTUI2P:     return combineX86INT_TO_FP(N, DAG, DCI);
59484  case X86ISD::CVTP2SI:
59485  case X86ISD::CVTP2UI:
59487  case X86ISD::CVTTP2SI:
59489  case X86ISD::CVTTP2UI:
59490    return combineCVTP2I_CVTTP2I(N, DAG, DCI);
59492  case X86ISD::CVTPH2PS:    return combineCVTPH2PS(N, DAG, DCI);
59493  case X86ISD::BT:          return combineBT(N, DAG, DCI);
59494  case ISD::ANY_EXTEND:
59495  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
59496  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
59497  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
59501    return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
59502  case ISD::SETCC:          return combineSetCC(N, DAG, DCI, Subtarget);
59503  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
59504  case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
59505  case X86ISD::PACKSS:
59506  case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
59507  case X86ISD::HADD:
59508  case X86ISD::HSUB:
59509  case X86ISD::FHADD:
59510  case X86ISD::FHSUB:       return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
59511  case X86ISD::VSHL:
59512  case X86ISD::VSRA:
59513  case X86ISD::VSRL:
59514    return combineVectorShiftVar(N, DAG, DCI, Subtarget);
59515  case X86ISD::VSHLI:
59516  case X86ISD::VSRAI:
59517  case X86ISD::VSRLI:
59518    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
59520  case X86ISD::PINSRB:
59521  case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
59522  case X86ISD::SHUFP:       // Handle all target specific shuffles
59523  case X86ISD::INSERTPS:
59524  case X86ISD::EXTRQI:
59525  case X86ISD::INSERTQI:
59526  case X86ISD::VALIGN:
59527  case X86ISD::PALIGNR:
59528  case X86ISD::VSHLDQ:
59529  case X86ISD::VSRLDQ:
59530  case X86ISD::BLENDI:
59531  case X86ISD::UNPCKH:
59532  case X86ISD::UNPCKL:
59533  case X86ISD::MOVHLPS:
59534  case X86ISD::MOVLHPS:
59535  case X86ISD::PSHUFB:
59536  case X86ISD::PSHUFD:
59537  case X86ISD::PSHUFHW:
59538  case X86ISD::PSHUFLW:
59539  case X86ISD::MOVSHDUP:
59540  case X86ISD::MOVSLDUP:
59541  case X86ISD::MOVDDUP:
59542  case X86ISD::MOVSS:
59543  case X86ISD::MOVSD:
59544  case X86ISD::MOVSH:
59545  case X86ISD::VBROADCAST:
59546  case X86ISD::VPPERM:
59547  case X86ISD::VPERMI:
59548  case X86ISD::VPERMV:
59549  case X86ISD::VPERMV3:
59550  case X86ISD::VPERMIL2:
59551  case X86ISD::VPERMILPI:
59552  case X86ISD::VPERMILPV:
59553  case X86ISD::VPERM2X128:
59554  case X86ISD::SHUF128:
59555  case X86ISD::VZEXT_MOVL:
59556  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
59557  case X86ISD::FMADD_RND:
59558  case X86ISD::FMSUB:
59560  case X86ISD::FMSUB_RND:
59561  case X86ISD::FNMADD:
59563  case X86ISD::FNMADD_RND:
59564  case X86ISD::FNMSUB:
59566  case X86ISD::FNMSUB_RND:
59567  case ISD::FMA:
59568  case ISD::STRICT_FMA:     return combineFMA(N, DAG, DCI, Subtarget);
59571  case X86ISD::FMADDSUB:
59572  case X86ISD::FMSUBADD:    return combineFMADDSUB(N, DAG, DCI);
59573  case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI, Subtarget);
59574  case X86ISD::TESTP:       return combineTESTP(N, DAG, DCI, Subtarget);
59575  case X86ISD::MGATHER:
59576  case X86ISD::MSCATTER:    return combineX86GatherScatter(N, DAG, DCI);
59577  case ISD::MGATHER:
59578  case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI);
59579  case X86ISD::PCMPEQ:
59580  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
59581  case X86ISD::PMULDQ:
59582  case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI, Subtarget);
59583  case X86ISD::VPMADDUBSW:
59584  case X86ISD::VPMADDWD:    return combineVPMADD(N, DAG, DCI);
59585  case X86ISD::KSHIFTL:
59586  case X86ISD::KSHIFTR:     return combineKSHIFT(N, DAG, DCI);
59587  case ISD::FP16_TO_FP:     return combineFP16_TO_FP(N, DAG, Subtarget);
59589  case ISD::FP_EXTEND:      return combineFP_EXTEND(N, DAG, DCI, Subtarget);
59591  case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
59593  case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
59594  case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
59595  case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
59596  case ISD::INTRINSIC_WO_CHAIN:  return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
59597  case ISD::INTRINSIC_W_CHAIN:  return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
59598  case ISD::INTRINSIC_VOID:  return combineINTRINSIC_VOID(N, DAG, DCI);
59600  case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
59601  // clang-format on
59602  }
59603
59604  return SDValue();
59605}
59606
// NOTE(review): enclosing signature is not visible here — presumably a
// TargetLowering profitability hook; true only when CMOV is available and
// the type is i32 or i64. Confirm against the full source.
59608  return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
59609}
59610
59611// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
// Returns true (prefer sext-in-reg form) for scalars and for any vector
// on AVX512 targets.
59613                                             EVT ExtVT) const {
59614  return Subtarget.hasAVX512() || !VT.isVector();
59615}
59616
59617bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
59618 if (!isTypeLegal(VT))
59619 return false;
59620
59621 // There are no vXi8 shifts.
59622 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
59623 return false;
59624
59625 // TODO: Almost no 8-bit ops are desirable because they have no actual
59626 // size/speed advantages vs. 32-bit ops, but they do have a major
59627 // potential disadvantage by causing partial register stalls.
59628 //
59629 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
59630 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
59631 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
59632 // check for a constant operand to the multiply.
59633 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
59634 return false;
59635
59636 // i16 instruction encodings are longer and some i16 instructions are slow,
59637 // so those are not desirable.
59638 if (VT == MVT::i16) {
59639 switch (Opc) {
59640 default:
59641 break;
59642 case ISD::LOAD:
59643 case ISD::SIGN_EXTEND:
59644 case ISD::ZERO_EXTEND:
59645 case ISD::ANY_EXTEND:
59646 case ISD::MUL:
59647 return false;
59648 case ISD::SHL:
59649 case ISD::SRA:
59650 case ISD::SRL:
59651 case ISD::SUB:
59652 case ISD::ADD:
59653 case ISD::AND:
59654 case ISD::OR:
59655 case ISD::XOR:
59656 // NDD instruction never has "partial register write" issue b/c it has
59657 // destination register's upper bits [63:OSIZE]) zeroed even when
59658 // OSIZE=8/16.
59659 return Subtarget.hasNDD();
59660 }
59661 }
59662
59663 // Any legal type not explicitly accounted for above here is desirable.
59664 return true;
59665}
59666
// Expand an indirect jump-table branch. When the module enables CET
// branch protection, emit X86ISD::NT_BRIND so ISel adds the NOTRACK
// prefix; otherwise defer to the default TargetLowering expansion.
59669                                                  int JTI,
59670                                                  SelectionDAG &DAG) const {
59671  const Module *M = DAG.getMachineFunction().getFunction().getParent();
59672  Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
59673  if (IsCFProtectionSupported) {
59674    // In case control-flow branch protection is enabled, we need to add
59675    // notrack prefix to the indirect branch.
59676    // In order to do that we create NT_BRIND SDNode.
59677    // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
59678    SDValue Chain = Value;
59679    // Jump table debug info is only needed if CodeView is enabled.
59681      Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
59682    return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
59683  }
59684
59685  return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
59686}
59687
// Decide how two SETCC nodes joined by a logic op should be recombined
// (the specific return-value expressions are on lines elided from this
// view; only the dispatch structure is visible here).
59690    const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
59692  EVT VT = LogicOp->getValueType(0);
59693  EVT OpVT = SETCC0->getOperand(0).getValueType();
59694  if (!VT.isInteger())
59696
59697  if (VT.isVector())
59702
59703  // Don't use `NotAnd` as even though `not` is generally shorter code size than
59704  // `add`, `add` can lower to LEA which can save moves / spills. Any case where
59705  // `NotAnd` applies, `AddAnd` does as well.
59706  // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
59707  // if we change that to `andn Y, X` it may be worth prefering `NotAnd` here.
59709}
59710
// Returns true when promoting Op (an i16 op, or an 8-bit multiply by a
// constant) to i32 is desirable, setting PVT to MVT::i32. Declines when
// promotion would break load folding, RMW patterns, or a foldable
// zext+imulzu on ZU targets.
59712  EVT VT = Op.getValueType();
59713  bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
59714                             isa<ConstantSDNode>(Op.getOperand(1));
59715
59716  // i16 is legal, but undesirable since i16 instruction encodings are longer
59717  // and some i16 instructions are slow.
59718  // 8-bit multiply-by-constant can usually be expanded to something cheaper
59719  // using LEA and/or other ALU ops.
59720  if (VT != MVT::i16 && !Is8BitMulByConstant)
59721    return false;

  // True if Op sits between a load and a store of the same address, i.e. it
  // could be selected as a read-modify-write memory instruction.
59723  auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
59724    if (!Op.hasOneUse())
59725      return false;
59726    SDNode *User = *Op->user_begin();
59728      return false;
59729    auto *Ld = cast<LoadSDNode>(Load);
59730    auto *St = cast<StoreSDNode>(User);
59731    return Ld->getBasePtr() == St->getBasePtr();
59732  };

  // Same idea for the atomic load/store RMW pattern.
59734  auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
59735    if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
59736      return false;
59737    if (!Op.hasOneUse())
59738      return false;
59739    SDNode *User = *Op->user_begin();
59740    if (User->getOpcode() != ISD::ATOMIC_STORE)
59741      return false;
59742    auto *Ld = cast<AtomicSDNode>(Load);
59743    auto *St = cast<AtomicSDNode>(User);
59744    return Ld->getBasePtr() == St->getBasePtr();
59745  };

  // True if Op's only user is a zext to i32/i64 (imulzu can fold it).
59747  auto IsFoldableZext = [](SDValue Op) {
59748    if (!Op.hasOneUse())
59749      return false;
59750    SDNode *User = *Op->user_begin();
59751    EVT VT = User->getValueType(0);
59752    return (User->getOpcode() == ISD::ZERO_EXTEND &&
59753            (VT == MVT::i32 || VT == MVT::i64));
59754  };

59756  bool Commute = false;
59757  switch (Op.getOpcode()) {
59758  default: return false;
59759  case ISD::SIGN_EXTEND:
59760  case ISD::ZERO_EXTEND:
59761  case ISD::ANY_EXTEND:
59762    break;
59763  case ISD::SHL:
59764  case ISD::SRA:
59765  case ISD::SRL: {
59766    SDValue N0 = Op.getOperand(0);
59767    // Look out for (store (shl (load), x)).
59768    if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
59769      return false;
59770    break;
59771  }
59772  case ISD::MUL:
59773    // When ZU is enabled, we prefer to not promote for MUL by a constant
59774    // when there is an opportunity to fold a zext with imulzu.
59775    if (Subtarget.hasZU() && IsFoldableZext(Op) &&
59776        (isa<ConstantSDNode>(Op.getOperand(0)) ||
59777         isa<ConstantSDNode>(Op.getOperand(1))))
59778      return false;
59779    [[fallthrough]];
59780  case ISD::ADD:
59781  case ISD::AND:
59782  case ISD::OR:
59783  case ISD::XOR:
59784    Commute = true;
59785    [[fallthrough]];
59786  case ISD::SUB: {
59787    SDValue N0 = Op.getOperand(0);
59788    SDValue N1 = Op.getOperand(1);
59789    // Avoid disabling potential load folding opportunities.
59790    if (X86::mayFoldLoad(N1, Subtarget) &&
59791        (!Commute || !isa<ConstantSDNode>(N0) ||
59792         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
59793      return false;
59794    if (X86::mayFoldLoad(N0, Subtarget) &&
59795        ((Commute && !isa<ConstantSDNode>(N1)) ||
59796         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
59797      return false;
59798    if (IsFoldableAtomicRMW(N0, Op) ||
59799        (Commute && IsFoldableAtomicRMW(N1, Op)))
59800      return false;
59801  }
59802  }

  // Promote to i32.
59804  PVT = MVT::i32;
59805  return true;
59806}
59807
59808//===----------------------------------------------------------------------===//
59809// X86 Inline Assembly Support
59810//===----------------------------------------------------------------------===//
59811
59812// Helper to match a string separated by whitespace.
// Returns true iff S consists exactly of the given Pieces in order, each
// separated by at least one space/tab, with nothing left over.
59814  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
59815
59816  for (StringRef Piece : Pieces) {
59817    if (!S.starts_with(Piece)) // Check if the piece matches.
59818      return false;
59819
59820    S = S.substr(Piece.size());
    // A piece must be followed by whitespace (or end), not more text:
    // otherwise we only matched a prefix of a longer token.
59822    if (Pos == 0) // We matched a prefix.
59823      return false;
59824
59825    S = S.substr(Pos);
59826  }
59827
59828  return S.empty();
59829}
59830
// NOTE(review): signature not visible here — presumably a helper verifying
// that an inline-asm constraint list clobbers the flag registers: the
// 3-piece form must contain ~{cc}, ~{flags} and ~{fpsr}; the 4-piece form
// additionally requires ~{dirflag}. Confirm against the full source.
59832
59833  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
59834    if (llvm::is_contained(AsmPieces, "~{cc}") &&
59835        llvm::is_contained(AsmPieces, "~{flags}") &&
59836        llvm::is_contained(AsmPieces, "~{fpsr}")) {
59837
59838      if (AsmPieces.size() == 3)
59839        return true;
59840      else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
59841        return true;
59842    }
59843  }
59844  return false;
59845}
59846
// Recognize whole inline-asm blobs that implement a byte swap — `bswap`
// variants, `rorw/rolw $8` on i16, the 3-instruction i32 rotate sequence, or
// the i32-pair i64 sequence — so callers can replace them with llvm.bswap.
// NOTE(review): line 59847 — the signature (upstream presumably
// `bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {`) — was
// elided by the doc renderer.
 59848 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
 59849
 59850 const std::string &AsmStr = IA->getAsmString();
 59851
// Only integer results whose width is a multiple of 16 can be a bswap.
 59852 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
 59853 if (!Ty || Ty->getBitWidth() % 16 != 0)
 59854 return false;
 59855
 59856 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
 59857 SmallVector<StringRef, 4> AsmPieces;
 59858 SplitString(AsmStr, AsmPieces, ";\n");
 59859
 59860 switch (AsmPieces.size()) {
 59861 default: return false;
 59862 case 1:
 59863 // FIXME: this should verify that we are targeting a 486 or better. If not,
 59864 // we will turn this bswap into something that will be lowered to logical
 59865 // ops instead of emitting the bswap asm. For now, we don't support 486 or
 59866 // lower so don't worry about this.
 59867 // bswap $0
 59868 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
 59869 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
 59870 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
 59871 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
 59872 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
 59873 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
 59874 // No need to check constraints, nothing other than the equivalent of
 59875 // "=r,0" would be valid here.
// NOTE(review): line 59876 was elided by the doc renderer — upstream it is
// presumably `return IntrinsicLowering::LowerToByteSwap(CI);`. The same
// elision recurs at 59889, 59903 and 59915 below. Verify against upstream.
 59877 }
 59878
 59879 // rorw $$8, ${0:w} --> llvm.bswap.i16
 59880 if (CI->getType()->isIntegerTy(16) &&
 59881 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
 59882 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
 59883 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
 59884 AsmPieces.clear();
 59885 StringRef ConstraintsStr = IA->getConstraintString();
 59886 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
 59887 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
 59888 if (clobbersFlagRegisters(AsmPieces))
 59890 }
 59891 break;
 59892 case 3:
 59893 if (CI->getType()->isIntegerTy(32) &&
 59894 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
 59895 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
 59896 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
 59897 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
 59898 AsmPieces.clear();
 59899 StringRef ConstraintsStr = IA->getConstraintString();
 59900 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
 59901 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
 59902 if (clobbersFlagRegisters(AsmPieces))
 59904 }
 59905
 59906 if (CI->getType()->isIntegerTy(64)) {
 59907 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
// The "A" constraint binds the i64 result to the EAX:EDX pair.
 59908 if (Constraints.size() >= 2 &&
 59909 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
 59910 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
 59911 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
 59912 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
 59913 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
 59914 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
 59916 }
 59917 }
 59918 break;
 59919 }
 59920 return false;
 59921}
59922
// Map an inline-asm flag-output constraint string "{@cc<cond>}" to the X86
// condition code it names. Aliases map to the same code (e.g. @ccc == @ccb,
// @ccz == @cce, @ccn<x> == the negated form of <x>).
// NOTE(review): lines 59923-59924 — the signature and the StringSwitch head
// (upstream presumably `static X86::CondCode parseConstraintCode(StringRef
// Constraint) {` / `X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)`)
// — and line 59953 (presumably `.Default(X86::COND_INVALID);`) were elided by
// the doc renderer; verify against the real source.
 59925 .Case("{@cca}", X86::COND_A)
 59926 .Case("{@ccae}", X86::COND_AE)
 59927 .Case("{@ccb}", X86::COND_B)
 59928 .Case("{@ccbe}", X86::COND_BE)
 59929 .Case("{@ccc}", X86::COND_B)
 59930 .Case("{@cce}", X86::COND_E)
 59931 .Case("{@ccz}", X86::COND_E)
 59932 .Case("{@ccg}", X86::COND_G)
 59933 .Case("{@ccge}", X86::COND_GE)
 59934 .Case("{@ccl}", X86::COND_L)
 59935 .Case("{@ccle}", X86::COND_LE)
 59936 .Case("{@ccna}", X86::COND_BE)
 59937 .Case("{@ccnae}", X86::COND_B)
 59938 .Case("{@ccnb}", X86::COND_AE)
 59939 .Case("{@ccnbe}", X86::COND_A)
 59940 .Case("{@ccnc}", X86::COND_AE)
 59941 .Case("{@ccne}", X86::COND_NE)
 59942 .Case("{@ccnz}", X86::COND_NE)
 59943 .Case("{@ccng}", X86::COND_LE)
 59944 .Case("{@ccnge}", X86::COND_L)
 59945 .Case("{@ccnl}", X86::COND_GE)
 59946 .Case("{@ccnle}", X86::COND_G)
 59947 .Case("{@ccno}", X86::COND_NO)
 59948 .Case("{@ccnp}", X86::COND_NP)
 59949 .Case("{@ccns}", X86::COND_NS)
 59950 .Case("{@cco}", X86::COND_O)
 59951 .Case("{@ccp}", X86::COND_P)
 59952 .Case("{@ccs}", X86::COND_S)
 59954 return Cond;
 59955}
59956
 59957/// Given a constraint letter, return the type of constraint for this target.
// Classifies single-letter GCC constraints, the two-letter 'W','Y','j'
// families, and "{@cc...}" flag outputs; everything else is delegated to
// TargetLowering::getConstraintType.
// NOTE(review): lines 59958-59959 — the signature (upstream presumably
// `X86TargetLowering::ConstraintType` /
// `X86TargetLowering::getConstraintType(StringRef Constraint) const {`) —
// were elided by the doc renderer.
 59960 if (Constraint.size() == 1) {
 59961 switch (Constraint[0]) {
 59962 case 'R':
 59963 case 'q':
 59964 case 'Q':
 59965 case 'f':
 59966 case 't':
 59967 case 'u':
 59968 case 'y':
 59969 case 'x':
 59970 case 'v':
 59971 case 'l':
 59972 case 'k': // AVX512 masking registers.
 59973 return C_RegisterClass;
 59974 case 'a':
 59975 case 'b':
 59976 case 'c':
 59977 case 'd':
 59978 case 'S':
 59979 case 'D':
 59980 case 'A':
 59981 return C_Register;
 59982 case 'I':
 59983 case 'J':
 59984 case 'K':
 59985 case 'N':
 59986 case 'G':
 59987 case 'L':
 59988 case 'M':
 59989 return C_Immediate;
 59990 case 'C':
 59991 case 'e':
 59992 case 'Z':
 59993 return C_Other;
 59994 default:
 59995 break;
 59996 }
 59997 }
 59998 else if (Constraint.size() == 2) {
 59999 switch (Constraint[0]) {
 60000 default:
 60001 break;
 60002 case 'W':
// Only "Ws" (symbol reference with optional offset) is recognized.
 60003 if (Constraint[1] != 's')
 60004 break;
 60005 return C_Other;
 60006 case 'Y':
 60007 switch (Constraint[1]) {
 60008 default:
 60009 break;
 60010 case 'z': // XMM0 — a specific register.
 60011 return C_Register;
 60012 case 'i':
 60013 case 'm':
 60014 case 'k':
 60015 case 't':
 60016 case '2':
 60017 return C_RegisterClass;
 60018 }
 60019 break;
 60020 case 'j':
 60021 switch (Constraint[1]) {
 60022 default:
 60023 break;
 60024 case 'r':
 60025 case 'R':
 60026 return C_RegisterClass;
 60027 }
 60028 }
 60029 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
 60030 return C_Other;
 60031 return TargetLowering::getConstraintType(Constraint);
 60032}
60033
 60034/// Examine constraint type and operand type and determine a weight value.
 60035/// This object must already have been set up with the operand type
 60036/// and the current alternative constraint selected.
// NOTE(review): lines 60037-60038 — the signature (upstream presumably
// `X86TargetLowering::ConstraintWeight` /
// `X86TargetLowering::getSingleConstraintMatchWeight(`) — and line 60040
// (presumably `ConstraintWeight Wt = CW_Invalid;`) were elided by the doc
// renderer; verify against the real source.
 60039 AsmOperandInfo &Info, const char *Constraint) const {
 60041 Value *CallOperandVal = Info.CallOperandVal;
 60042 // If we don't have a value, we can't do a match,
 60043 // but allow it at the lowest weight.
 60044 if (!CallOperandVal)
 60045 return CW_Default;
 60046 Type *Ty = CallOperandVal->getType();
 60047 // Look at the constraint type.
 60048 switch (*Constraint) {
 60049 default:
// NOTE(review): line 60050 is elided — presumably the default weight
// delegation `Wt = TargetLowering::getSingleConstraintMatchWeight(Info,
// Constraint);`. Verify against upstream.
 60051 [[fallthrough]];
 60052 case 'R':
 60053 case 'q':
 60054 case 'Q':
 60055 case 'a':
 60056 case 'b':
 60057 case 'c':
 60058 case 'd':
 60059 case 'S':
 60060 case 'D':
 60061 case 'A':
 60062 if (CallOperandVal->getType()->isIntegerTy())
 60063 Wt = CW_SpecificReg;
 60064 break;
 60065 case 'f':
 60066 case 't':
 60067 case 'u':
 60068 if (Ty->isFloatingPointTy())
 60069 Wt = CW_SpecificReg;
 60070 break;
 60071 case 'y':
 60072 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
 60073 Wt = CW_SpecificReg;
 60074 break;
 60075 case 'Y':
 60076 if (StringRef(Constraint).size() != 2)
 60077 break;
 60078 switch (Constraint[1]) {
 60079 default:
 60080 return CW_Invalid;
 60081 // XMM0
 60082 case 'z':
 60083 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
 60084 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
 60085 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
 60086 return CW_SpecificReg;
 60087 return CW_Invalid;
 60088 // Conditional OpMask regs (AVX512)
 60089 case 'k':
 60090 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
 60091 return CW_Register;
 60092 return CW_Invalid;
 60093 // Any MMX reg
 60094 case 'm':
 60095 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
 60096 return CW_SpecificReg;
 60097 return CW_Invalid;
 60098 // Any SSE reg when ISA >= SSE2, same as 'x'
 60099 case 'i':
 60100 case 't':
 60101 case '2':
 60102 if (!Subtarget.hasSSE2())
 60103 return CW_Invalid;
 60104 break;
 60105 }
 60106 break;
 60107 case 'j':
 60108 if (StringRef(Constraint).size() != 2)
 60109 break;
 60110 switch (Constraint[1]) {
 60111 default:
 60112 return CW_Invalid;
 60113 case 'r':
 60114 case 'R':
 60115 if (CallOperandVal->getType()->isIntegerTy())
 60116 Wt = CW_SpecificReg;
 60117 break;
 60118 }
 60119 break;
 60120 case 'v':
 60121 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
 60122 Wt = CW_Register;
// 'v' also accepts the 128/256-bit sizes handled by 'x' below.
 60123 [[fallthrough]];
 60124 case 'x':
 60125 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
 60126 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
 60127 Wt = CW_Register;
 60128 break;
 60129 case 'k':
 60130 // Enable conditional vector operations using %k<#> registers.
 60131 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
 60132 Wt = CW_Register;
 60133 break;
 60134 case 'I':
 60135 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
 60136 if (C->getZExtValue() <= 31)
 60137 Wt = CW_Constant;
 60138 break;
 60139 case 'J':
 60140 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 60141 if (C->getZExtValue() <= 63)
 60142 Wt = CW_Constant;
 60143 break;
 60144 case 'K':
 60145 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 60146 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
 60147 Wt = CW_Constant;
 60148 break;
 60149 case 'L':
 60150 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 60151 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
 60152 Wt = CW_Constant;
 60153 break;
 60154 case 'M':
 60155 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 60156 if (C->getZExtValue() <= 3)
 60157 Wt = CW_Constant;
 60158 break;
 60159 case 'N':
 60160 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 60161 if (C->getZExtValue() <= 0xff)
 60162 Wt = CW_Constant;
 60163 break;
 60164 case 'G':
 60165 case 'C':
 60166 if (isa<ConstantFP>(CallOperandVal))
 60167 Wt = CW_Constant;
 60168 break;
 60169 case 'e':
 60170 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 60171 if ((C->getSExtValue() >= -0x80000000LL) &&
 60172 (C->getSExtValue() <= 0x7fffffffLL))
 60173 Wt = CW_Constant;
 60174 break;
 60175 case 'Z':
 60176 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 60177 if (C->getZExtValue() <= 0xffffffff)
 60178 Wt = CW_Constant;
 60179 break;
 60180 }
 60181 return Wt;
 60182}
60183
 60184/// Try to replace an X constraint, which matches anything, with another that
 60185/// has more specific requirements based on the type of the corresponding
 60186/// operand.
// NOTE(review): line 60187 — the return type/class qualifier (upstream
// presumably `const char *X86TargetLowering::`) — was elided by the doc
// renderer.
 60188LowerXConstraint(EVT ConstraintVT) const {
 60189 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
 60190 // 'f' like normal targets.
 60191 if (ConstraintVT.isFloatingPoint()) {
 60192 if (Subtarget.hasSSE1())
 60193 return "x";
 60194 }
 60195
 60196 return TargetLowering::LowerXConstraint(ConstraintVT);
 60197}
60198
 60199 // Lower @cc targets via setcc.
// Produces a zero-extended SETcc of EFLAGS for a "{@cc...}" flag-output
// constraint; returns an empty SDValue when the constraint is not a @cc one.
// NOTE(review): line 60200 — the signature (upstream presumably
// `SDValue X86TargetLowering::LowerAsmOutputForConstraint(`) — and line 60203
// (presumably `X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);`)
// were elided by the doc renderer; verify against the real source.
 60201 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
 60202 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
 60204 if (Cond == X86::COND_INVALID)
 60205 return SDValue();
 60206 // Check that return type is valid.
 60207 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
 60208 OpInfo.ConstraintVT.getSizeInBits() < 8)
 60209 report_fatal_error("Glue output operand is of invalid type");
 60210
 60211 // Get EFLAGS register. Only update chain when copyfrom is glued.
 60212 if (Glue.getNode()) {
 60213 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
 60214 Chain = Glue.getValue(1);
 60215 } else
 60216 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
 60217 // Extract CC code.
 60218 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
 60219 // Extend to 32-bits
 60220 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
 60221
 60222 return Result;
 60223}
60224
 60225/// Lower the specified operand into the Ops vector.
 60226/// If it is invalid, don't add anything to Ops.
// Each immediate-letter case range-checks a ConstantSDNode and, on success,
// materializes a TargetConstant in Result; falling out of the switch with a
// valid Result appends it to Ops, otherwise handling is delegated to
// TargetLowering.
// NOTE(review): line 60227 — the signature (upstream presumably
// `void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,`) — was
// elided by the doc renderer.
 60228 StringRef Constraint,
 60229 std::vector<SDValue> &Ops,
 60230 SelectionDAG &DAG) const {
 60231 SDValue Result;
 60232 char ConstraintLetter = Constraint[0];
 60233 switch (ConstraintLetter) {
 60234 default: break;
 60235 case 'I':
 60236 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 60237 if (C->getZExtValue() <= 31) {
 60238 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 60239 Op.getValueType());
 60240 break;
 60241 }
 60242 }
 60243 return;
 60244 case 'J':
 60245 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 60246 if (C->getZExtValue() <= 63) {
 60247 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 60248 Op.getValueType());
 60249 break;
 60250 }
 60251 }
 60252 return;
 60253 case 'K':
 60254 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 60255 if (isInt<8>(C->getSExtValue())) {
 60256 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 60257 Op.getValueType());
 60258 break;
 60259 }
 60260 }
 60261 return;
 60262 case 'L':
 60263 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 60264 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
 60265 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
 60266 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
 60267 Op.getValueType());
 60268 break;
 60269 }
 60270 }
 60271 return;
 60272 case 'M':
 60273 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 60274 if (C->getZExtValue() <= 3) {
 60275 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 60276 Op.getValueType());
 60277 break;
 60278 }
 60279 }
 60280 return;
 60281 case 'N':
 60282 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 60283 if (C->getZExtValue() <= 255) {
 60284 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 60285 Op.getValueType());
 60286 break;
 60287 }
 60288 }
 60289 return;
 60290 case 'O':
 60291 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 60292 if (C->getZExtValue() <= 127) {
 60293 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 60294 Op.getValueType());
 60295 break;
 60296 }
 60297 }
 60298 return;
 60299 case 'e': {
 60300 // 32-bit signed value
 60301 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
// NOTE(review): line 60302 is elided — presumably the range check
// `if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),`
// (the i32-signed validity test). Verify against upstream.
 60303 C->getSExtValue())) {
 60304 // Widen to 64 bits here to get it sign extended.
 60305 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
 60306 break;
 60307 }
 60308 // FIXME gcc accepts some relocatable values here too, but only in certain
 60309 // memory models; it's complicated.
 60310 }
 60311 return;
 60312 }
 60313 case 'W': {
 60314 assert(Constraint[1] == 's');
 60315 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
 60316 // offset.
 60317 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
 60318 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
 60319 BA->getValueType(0)));
 60320 } else {
 60321 int64_t Offset = 0;
 60322 if (Op->getOpcode() == ISD::ADD &&
 60323 isa<ConstantSDNode>(Op->getOperand(1))) {
 60324 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
 60325 Op = Op->getOperand(0);
 60326 }
 60327 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
 60328 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
 60329 GA->getValueType(0), Offset));
 60330 }
 60331 return;
 60332 }
 60333 case 'Z': {
 60334 // 32-bit unsigned value
 60335 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
// NOTE(review): line 60336 is elided — presumably the i32-unsigned validity
// test mirroring the 'e' case above. Verify against upstream.
 60337 C->getZExtValue())) {
 60338 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 60339 Op.getValueType());
 60340 break;
 60341 }
 60342 }
 60343 // FIXME gcc accepts some relocatable values here too, but only in certain
 60344 // memory models; it's complicated.
 60345 return;
 60346 }
 60347 case 'i': {
 60348 // Literal immediates are always ok.
 60349 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
 60350 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
 60351 BooleanContent BCont = getBooleanContents(MVT::i64);
 60352 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
// NOTE(review): line 60353 is elided — presumably the ternary's else arm,
// `: ISD::SIGN_EXTEND;` (or ANY_EXTEND). Verify against upstream.
 60354 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
 60355 : CST->getSExtValue();
 60356 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
 60357 break;
 60358 }
 60359
 60360 // In any sort of PIC mode addresses need to be computed at runtime by
 60361 // adding in a register or some sort of table lookup. These can't
 60362 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
 60363 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
 60364 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
 60365 return;
 60366
 60367 // If we are in non-pic codegen mode, we allow the address of a global (with
 60368 // an optional displacement) to be used with 'i'.
 60369 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
 60370 // If we require an extra load to get this address, as in PIC mode, we
 60371 // can't accept it.
// NOTE(review): line 60372 is elided — presumably
// `if (isGlobalStubReference(` opening the stub-reference check completed on
// the next line. Verify against upstream.
 60373 Subtarget.classifyGlobalReference(GA->getGlobal())))
 60374 return;
 60375 break;
 60376 }
 60377 }
 60378
 60379 if (Result.getNode()) {
 60380 Ops.push_back(Result);
 60381 return;
 60382 }
 60383 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 60384}
60385
60386/// Check if \p RC is a general purpose register class.
60387/// I.e., GR* or one of their variant.
60388static bool isGRClass(const TargetRegisterClass &RC) {
60389 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
60390 RC.hasSuperClassEq(&X86::GR16RegClass) ||
60391 RC.hasSuperClassEq(&X86::GR32RegClass) ||
60392 RC.hasSuperClassEq(&X86::GR64RegClass) ||
60393 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
60394}
60395
60396/// Check if \p RC is a vector register class.
60397/// I.e., FR* / VR* or one of their variant.
60398static bool isFRClass(const TargetRegisterClass &RC) {
60399 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
60400 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
60401 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
60402 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
60403 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
60404 RC.hasSuperClassEq(&X86::VR512RegClass);
60405}
60406
60407/// Check if \p RC is a mask register class.
60408/// I.e., VK* or one of their variant.
60409static bool isVKClass(const TargetRegisterClass &RC) {
60410 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
60411 RC.hasSuperClassEq(&X86::VK2RegClass) ||
60412 RC.hasSuperClassEq(&X86::VK4RegClass) ||
60413 RC.hasSuperClassEq(&X86::VK8RegClass) ||
60414 RC.hasSuperClassEq(&X86::VK16RegClass) ||
60415 RC.hasSuperClassEq(&X86::VK32RegClass) ||
60416 RC.hasSuperClassEq(&X86::VK64RegClass);
60417}
60418
60419static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
60420 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
60421}
60422
60423std::pair<unsigned, const TargetRegisterClass *>
60425 StringRef Constraint,
60426 MVT VT) const {
60427 // First, see if this is a constraint that directly corresponds to an LLVM
60428 // register class.
60429 if (Constraint.size() == 1) {
60430 // GCC Constraint Letters
60431 switch (Constraint[0]) {
60432 default: break;
60433 // 'A' means [ER]AX + [ER]DX.
60434 case 'A':
60435 if (Subtarget.is64Bit())
60436 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
60437 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
60438 "Expecting 64, 32 or 16 bit subtarget");
60439 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
60440
60441 // TODO: Slight differences here in allocation order and leaving
60442 // RIP in the class. Do they matter any more here than they do
60443 // in the normal allocation?
60444 case 'k':
60445 if (Subtarget.hasAVX512()) {
60446 if (VT == MVT::v1i1 || VT == MVT::i1)
60447 return std::make_pair(0U, &X86::VK1RegClass);
60448 if (VT == MVT::v8i1 || VT == MVT::i8)
60449 return std::make_pair(0U, &X86::VK8RegClass);
60450 if (VT == MVT::v16i1 || VT == MVT::i16)
60451 return std::make_pair(0U, &X86::VK16RegClass);
60452 }
60453 if (Subtarget.hasBWI()) {
60454 if (VT == MVT::v32i1 || VT == MVT::i32)
60455 return std::make_pair(0U, &X86::VK32RegClass);
60456 if (VT == MVT::v64i1 || VT == MVT::i64)
60457 return std::make_pair(0U, &X86::VK64RegClass);
60458 }
60459 break;
60460 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
60461 if (Subtarget.is64Bit()) {
60462 if (VT == MVT::i8 || VT == MVT::i1)
60463 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60464 ? &X86::GR8RegClass
60465 : &X86::GR8_NOREX2RegClass);
60466 if (VT == MVT::i16)
60467 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60468 ? &X86::GR16RegClass
60469 : &X86::GR16_NOREX2RegClass);
60470 if (VT == MVT::i32 || VT == MVT::f32)
60471 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60472 ? &X86::GR32RegClass
60473 : &X86::GR32_NOREX2RegClass);
60474 if (VT != MVT::f80 && !VT.isVector())
60475 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60476 ? &X86::GR64RegClass
60477 : &X86::GR64_NOREX2RegClass);
60478 break;
60479 }
60480 [[fallthrough]];
60481 // 32-bit fallthrough
60482 case 'Q': // Q_REGS
60483 if (VT == MVT::i8 || VT == MVT::i1)
60484 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
60485 if (VT == MVT::i16)
60486 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
60487 if (VT == MVT::i32 || VT == MVT::f32 ||
60488 (!VT.isVector() && !Subtarget.is64Bit()))
60489 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
60490 if (VT != MVT::f80 && !VT.isVector())
60491 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
60492 break;
60493 case 'r': // GENERAL_REGS
60494 case 'l': // INDEX_REGS
60495 if (VT == MVT::i8 || VT == MVT::i1)
60496 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60497 ? &X86::GR8RegClass
60498 : &X86::GR8_NOREX2RegClass);
60499 if (VT == MVT::i16)
60500 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60501 ? &X86::GR16RegClass
60502 : &X86::GR16_NOREX2RegClass);
60503 if (VT == MVT::i32 || VT == MVT::f32 ||
60504 (!VT.isVector() && !Subtarget.is64Bit()))
60505 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60506 ? &X86::GR32RegClass
60507 : &X86::GR32_NOREX2RegClass);
60508 if (VT != MVT::f80 && !VT.isVector())
60509 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60510 ? &X86::GR64RegClass
60511 : &X86::GR64_NOREX2RegClass);
60512 break;
60513 case 'R': // LEGACY_REGS
60514 if (VT == MVT::i8 || VT == MVT::i1)
60515 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
60516 if (VT == MVT::i16)
60517 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
60518 if (VT == MVT::i32 || VT == MVT::f32 ||
60519 (!VT.isVector() && !Subtarget.is64Bit()))
60520 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
60521 if (VT != MVT::f80 && !VT.isVector())
60522 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
60523 break;
60524 case 'f': // FP Stack registers.
60525 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
60526 // value to the correct fpstack register class.
60527 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
60528 return std::make_pair(0U, &X86::RFP32RegClass);
60529 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
60530 return std::make_pair(0U, &X86::RFP64RegClass);
60531 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
60532 return std::make_pair(0U, &X86::RFP80RegClass);
60533 break;
60534 case 'y': // MMX_REGS if MMX allowed.
60535 if (!Subtarget.hasMMX()) break;
60536 return std::make_pair(0U, &X86::VR64RegClass);
60537 case 'v':
60538 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
60539 if (!Subtarget.hasSSE1()) break;
60540 bool VConstraint = (Constraint[0] == 'v');
60541
60542 switch (VT.SimpleTy) {
60543 default: break;
60544 // Scalar SSE types.
60545 case MVT::f16:
60546 if (VConstraint && Subtarget.hasFP16())
60547 return std::make_pair(0U, &X86::FR16XRegClass);
60548 break;
60549 case MVT::f32:
60550 case MVT::i32:
60551 if (VConstraint && Subtarget.hasVLX())
60552 return std::make_pair(0U, &X86::FR32XRegClass);
60553 return std::make_pair(0U, &X86::FR32RegClass);
60554 case MVT::f64:
60555 case MVT::i64:
60556 if (VConstraint && Subtarget.hasVLX())
60557 return std::make_pair(0U, &X86::FR64XRegClass);
60558 return std::make_pair(0U, &X86::FR64RegClass);
60559 case MVT::i128:
60560 if (Subtarget.is64Bit()) {
60561 if (VConstraint && Subtarget.hasVLX())
60562 return std::make_pair(0U, &X86::VR128XRegClass);
60563 return std::make_pair(0U, &X86::VR128RegClass);
60564 }
60565 break;
60566 // Vector types and fp128.
60567 case MVT::v8f16:
60568 if (!Subtarget.hasFP16())
60569 break;
60570 if (VConstraint)
60571 return std::make_pair(0U, &X86::VR128XRegClass);
60572 return std::make_pair(0U, &X86::VR128RegClass);
60573 case MVT::v8bf16:
60574 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60575 break;
60576 if (VConstraint)
60577 return std::make_pair(0U, &X86::VR128XRegClass);
60578 return std::make_pair(0U, &X86::VR128RegClass);
60579 case MVT::f128:
60580 case MVT::v16i8:
60581 case MVT::v8i16:
60582 case MVT::v4i32:
60583 case MVT::v2i64:
60584 case MVT::v4f32:
60585 case MVT::v2f64:
60586 if (VConstraint && Subtarget.hasVLX())
60587 return std::make_pair(0U, &X86::VR128XRegClass);
60588 return std::make_pair(0U, &X86::VR128RegClass);
60589 // AVX types.
60590 case MVT::v16f16:
60591 if (!Subtarget.hasFP16())
60592 break;
60593 if (VConstraint)
60594 return std::make_pair(0U, &X86::VR256XRegClass);
60595 return std::make_pair(0U, &X86::VR256RegClass);
60596 case MVT::v16bf16:
60597 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60598 break;
60599 if (VConstraint)
60600 return std::make_pair(0U, &X86::VR256XRegClass);
60601 return std::make_pair(0U, &X86::VR256RegClass);
60602 case MVT::v32i8:
60603 case MVT::v16i16:
60604 case MVT::v8i32:
60605 case MVT::v4i64:
60606 case MVT::v8f32:
60607 case MVT::v4f64:
60608 if (VConstraint && Subtarget.hasVLX())
60609 return std::make_pair(0U, &X86::VR256XRegClass);
60610 if (Subtarget.hasAVX())
60611 return std::make_pair(0U, &X86::VR256RegClass);
60612 break;
60613 case MVT::v32f16:
60614 if (!Subtarget.hasFP16())
60615 break;
60616 if (VConstraint)
60617 return std::make_pair(0U, &X86::VR512RegClass);
60618 return std::make_pair(0U, &X86::VR512_0_15RegClass);
60619 case MVT::v32bf16:
60620 if (!Subtarget.hasBF16())
60621 break;
60622 if (VConstraint)
60623 return std::make_pair(0U, &X86::VR512RegClass);
60624 return std::make_pair(0U, &X86::VR512_0_15RegClass);
60625 case MVT::v64i8:
60626 case MVT::v32i16:
60627 case MVT::v8f64:
60628 case MVT::v16f32:
60629 case MVT::v16i32:
60630 case MVT::v8i64:
60631 if (!Subtarget.hasAVX512()) break;
60632 if (VConstraint)
60633 return std::make_pair(0U, &X86::VR512RegClass);
60634 return std::make_pair(0U, &X86::VR512_0_15RegClass);
60635 }
60636 break;
60637 }
60638 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
60639 switch (Constraint[1]) {
60640 default:
60641 break;
60642 case 'i':
60643 case 't':
60644 case '2':
60645 return getRegForInlineAsmConstraint(TRI, "x", VT);
60646 case 'm':
60647 if (!Subtarget.hasMMX()) break;
60648 return std::make_pair(0U, &X86::VR64RegClass);
60649 case 'z':
60650 if (!Subtarget.hasSSE1()) break;
60651 switch (VT.SimpleTy) {
60652 default: break;
60653 // Scalar SSE types.
60654 case MVT::f16:
60655 if (!Subtarget.hasFP16())
60656 break;
60657 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
60658 case MVT::f32:
60659 case MVT::i32:
60660 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
60661 case MVT::f64:
60662 case MVT::i64:
60663 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
60664 case MVT::v8f16:
60665 if (!Subtarget.hasFP16())
60666 break;
60667 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
60668 case MVT::v8bf16:
60669 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60670 break;
60671 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
60672 case MVT::f128:
60673 case MVT::v16i8:
60674 case MVT::v8i16:
60675 case MVT::v4i32:
60676 case MVT::v2i64:
60677 case MVT::v4f32:
60678 case MVT::v2f64:
60679 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
60680 // AVX types.
60681 case MVT::v16f16:
60682 if (!Subtarget.hasFP16())
60683 break;
60684 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
60685 case MVT::v16bf16:
60686 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60687 break;
60688 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
60689 case MVT::v32i8:
60690 case MVT::v16i16:
60691 case MVT::v8i32:
60692 case MVT::v4i64:
60693 case MVT::v8f32:
60694 case MVT::v4f64:
60695 if (Subtarget.hasAVX())
60696 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
60697 break;
60698 case MVT::v32f16:
60699 if (!Subtarget.hasFP16())
60700 break;
60701 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
60702 case MVT::v32bf16:
60703 if (!Subtarget.hasBF16())
60704 break;
60705 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
60706 case MVT::v64i8:
60707 case MVT::v32i16:
60708 case MVT::v8f64:
60709 case MVT::v16f32:
60710 case MVT::v16i32:
60711 case MVT::v8i64:
60712 if (Subtarget.hasAVX512())
60713 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
60714 break;
60715 }
60716 break;
60717 case 'k':
60718 // This register class doesn't allocate k0 for masked vector operation.
60719 if (Subtarget.hasAVX512()) {
60720 if (VT == MVT::v1i1 || VT == MVT::i1)
60721 return std::make_pair(0U, &X86::VK1WMRegClass);
60722 if (VT == MVT::v8i1 || VT == MVT::i8)
60723 return std::make_pair(0U, &X86::VK8WMRegClass);
60724 if (VT == MVT::v16i1 || VT == MVT::i16)
60725 return std::make_pair(0U, &X86::VK16WMRegClass);
60726 }
60727 if (Subtarget.hasBWI()) {
60728 if (VT == MVT::v32i1 || VT == MVT::i32)
60729 return std::make_pair(0U, &X86::VK32WMRegClass);
60730 if (VT == MVT::v64i1 || VT == MVT::i64)
60731 return std::make_pair(0U, &X86::VK64WMRegClass);
60732 }
60733 break;
60734 }
60735 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
60736 switch (Constraint[1]) {
60737 default:
60738 break;
60739 case 'r':
60740 if (VT == MVT::i8 || VT == MVT::i1)
60741 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
60742 if (VT == MVT::i16)
60743 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
60744 if (VT == MVT::i32 || VT == MVT::f32)
60745 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
60746 if (VT != MVT::f80 && !VT.isVector())
60747 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
60748 break;
60749 case 'R':
60750 if (VT == MVT::i8 || VT == MVT::i1)
60751 return std::make_pair(0U, &X86::GR8RegClass);
60752 if (VT == MVT::i16)
60753 return std::make_pair(0U, &X86::GR16RegClass);
60754 if (VT == MVT::i32 || VT == MVT::f32)
60755 return std::make_pair(0U, &X86::GR32RegClass);
60756 if (VT != MVT::f80 && !VT.isVector())
60757 return std::make_pair(0U, &X86::GR64RegClass);
60758 break;
60759 }
60760 }
60761
60762 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
60763 return std::make_pair(0U, &X86::GR32RegClass);
60764
60765 // Use the default implementation in TargetLowering to convert the register
60766 // constraint into a member of a register class.
60767 std::pair<Register, const TargetRegisterClass*> Res;
60769
60770 // Not found as a standard register?
60771 if (!Res.second) {
60772 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
60773 // to/from f80.
60774 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
60775 // Map st(0) -> st(7) -> ST0
60776 if (Constraint.size() == 7 && Constraint[0] == '{' &&
60777 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
60778 Constraint[3] == '(' &&
60779 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
60780 Constraint[5] == ')' && Constraint[6] == '}') {
60781 // st(7) is not allocatable and thus not a member of RFP80. Return
60782 // singleton class in cases where we have a reference to it.
60783 if (Constraint[4] == '7')
60784 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
60785 return std::make_pair(X86::FP0 + Constraint[4] - '0',
60786 &X86::RFP80RegClass);
60787 }
60788
60789 // GCC allows "st(0)" to be called just plain "st".
60790 if (StringRef("{st}").equals_insensitive(Constraint))
60791 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
60792 }
60793
60794 // flags -> EFLAGS
60795 if (StringRef("{flags}").equals_insensitive(Constraint))
60796 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
60797
60798 // dirflag -> DF
60799 // Only allow for clobber.
60800 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
60801 VT == MVT::Other)
60802 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
60803
60804 // fpsr -> FPSW
60805 // Only allow for clobber.
60806 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
60807 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
60808
60809 return Res;
60810 }
60811
60812 // Make sure it isn't a register that requires 64-bit mode.
60813 if (!Subtarget.is64Bit() &&
60814 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
60815 TRI->getEncodingValue(Res.first) >= 8) {
60816 // Register requires REX prefix, but we're in 32-bit mode.
60817 return std::make_pair(0, nullptr);
60818 }
60819
60820 // Make sure it isn't a register that requires AVX512.
60821 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
60822 TRI->getEncodingValue(Res.first) & 0x10) {
60823 // Register requires EVEX prefix.
60824 return std::make_pair(0, nullptr);
60825 }
60826
60827 // Otherwise, check to see if this is a register class of the wrong value
60828 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
60829 // turn into {ax},{dx}.
60830 // MVT::Other is used to specify clobber names.
60831 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
60832 return Res; // Correct type already, nothing to do.
60833
60834 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
60835 // return "eax". This should even work for things like getting 64bit integer
60836 // registers when given an f64 type.
60837 const TargetRegisterClass *Class = Res.second;
60838 // The generic code will match the first register class that contains the
60839 // given register. Thus, based on the ordering of the tablegened file,
60840 // the "plain" GR classes might not come first.
60841 // Therefore, use a helper method.
60842 if (isGRClass(*Class)) {
60843 unsigned Size = VT.getSizeInBits();
60844 if (Size == 1) Size = 8;
60845 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
60846 return std::make_pair(0, nullptr);
60847 Register DestReg = getX86SubSuperRegister(Res.first, Size);
60848 if (DestReg.isValid()) {
60849 bool is64Bit = Subtarget.is64Bit();
60850 const TargetRegisterClass *RC =
60851 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
60852 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
60853 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
60854 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
60855 if (Size == 64 && !is64Bit) {
60856 // Model GCC's behavior here and select a fixed pair of 32-bit
60857 // registers.
60858 switch (DestReg) {
60859 case X86::RAX:
60860 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
60861 case X86::RDX:
60862 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
60863 case X86::RCX:
60864 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
60865 case X86::RBX:
60866 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
60867 case X86::RSI:
60868 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
60869 case X86::RDI:
60870 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
60871 case X86::RBP:
60872 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
60873 default:
60874 return std::make_pair(0, nullptr);
60875 }
60876 }
60877 if (RC && RC->contains(DestReg))
60878 return std::make_pair(DestReg, RC);
60879 return Res;
60880 }
60881 // No register found/type mismatch.
60882 return std::make_pair(0, nullptr);
60883 } else if (isFRClass(*Class)) {
60884 // Handle references to XMM physical registers that got mapped into the
60885 // wrong class. This can happen with constraints like {xmm0} where the
60886 // target independent register mapper will just pick the first match it can
60887 // find, ignoring the required type.
60888
60889 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
60890 if (VT == MVT::f16)
60891 Res.second = &X86::FR16XRegClass;
60892 else if (VT == MVT::f32 || VT == MVT::i32)
60893 Res.second = &X86::FR32XRegClass;
60894 else if (VT == MVT::f64 || VT == MVT::i64)
60895 Res.second = &X86::FR64XRegClass;
60896 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
60897 Res.second = &X86::VR128XRegClass;
60898 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
60899 Res.second = &X86::VR256XRegClass;
60900 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
60901 Res.second = &X86::VR512RegClass;
60902 else {
60903 // Type mismatch and not a clobber: Return an error;
60904 Res.first = 0;
60905 Res.second = nullptr;
60906 }
60907 } else if (isVKClass(*Class)) {
60908 if (VT == MVT::v1i1 || VT == MVT::i1)
60909 Res.second = &X86::VK1RegClass;
60910 else if (VT == MVT::v8i1 || VT == MVT::i8)
60911 Res.second = &X86::VK8RegClass;
60912 else if (VT == MVT::v16i1 || VT == MVT::i16)
60913 Res.second = &X86::VK16RegClass;
60914 else if (VT == MVT::v32i1 || VT == MVT::i32)
60915 Res.second = &X86::VK32RegClass;
60916 else if (VT == MVT::v64i1 || VT == MVT::i64)
60917 Res.second = &X86::VK64RegClass;
60918 else {
60919 // Type mismatch and not a clobber: Return an error;
60920 Res.first = 0;
60921 Res.second = nullptr;
60922 }
60923 }
60924
60925 return Res;
60926}
60927
60929 // Integer division on x86 is expensive. However, when aggressively optimizing
60930 // for code size, we prefer to use a div instruction, as it is usually smaller
60931 // than the alternative sequence.
60932 // The exception to this is vector division. Since x86 doesn't have vector
60933 // integer division, leaving the division as-is is a loss even in terms of
60934 // size, because it will have to be scalarized, while the alternative code
60935 // sequence can be performed in vector form.
60936 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
60937 return OptSize && !VT.isVector();
60938}
60939
60940void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
60941 if (!Subtarget.is64Bit())
60942 return;
60943
60944 // Update IsSplitCSR in X86MachineFunctionInfo.
60946 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
60947 AFI->setIsSplitCSR(true);
60948}
60949
60950void X86TargetLowering::insertCopiesSplitCSR(
60951 MachineBasicBlock *Entry,
60952 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
60953 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
60954 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
60955 if (!IStart)
60956 return;
60957
60958 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
60959 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
60960 MachineBasicBlock::iterator MBBI = Entry->begin();
60961 for (const MCPhysReg *I = IStart; *I; ++I) {
60962 const TargetRegisterClass *RC = nullptr;
60963 if (X86::GR64RegClass.contains(*I))
60964 RC = &X86::GR64RegClass;
60965 else
60966 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
60967
60968 Register NewVR = MRI->createVirtualRegister(RC);
60969 // Create copy from CSR to a virtual register.
60970 // FIXME: this currently does not emit CFI pseudo-instructions, it works
60971 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
60972 // nounwind. If we want to generalize this later, we may need to emit
60973 // CFI pseudo-instructions.
60974 assert(
60975 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
60976 "Function should be nounwind in insertCopiesSplitCSR!");
60977 Entry->addLiveIn(*I);
60978 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
60979 .addReg(*I);
60980
60981 // Insert the copy-back instructions right before the terminator.
60982 for (auto *Exit : Exits)
60983 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
60984 TII->get(TargetOpcode::COPY), *I)
60985 .addReg(NewVR);
60986 }
60987}
60988
60990 return Subtarget.is64Bit();
60991}
60992
// NOTE(review): this is the body of X86TargetLowering::EmitKCFICheck; the
// opening signature lines (file lines 60993-60995) and the declarations of
// the locals used below as 'OrigCall' (an iterator to the original call) and
// 'NewMIs' (the container receiving unfolded instructions) at file lines
// 61008-61009 were dropped by this doxygen extraction — confirm against the
// upstream X86ISelLowering.cpp before relying on this text.
//
// Purpose: insert an X86::KCFI_CHECK pseudo immediately before an indirect
// call carrying a CFI type, so the KCFI machinery can verify the call target.
60996 const TargetInstrInfo *TII) const {
60997 assert(MBBI->isCall() && MBBI->getCFIType() &&
60998 "Invalid call instruction for a KCFI check");
60999
61000 MachineFunction &MF = *MBB.getParent();
61001 // If the call target is a memory operand, unfold it and use R11 for the
61002 // call, so KCFI_CHECK won't have to recompute the address.
61003 switch (MBBI->getOpcode()) {
61004 case X86::CALL64m:
61005 case X86::CALL64m_NT:
61006 case X86::TAILJMPm64:
61007 case X86::TAILJMPm64_REX: {
// NOTE(review): file lines 61008-61009 (declarations of OrigCall and NewMIs)
// are missing here due to the extraction.
61010 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
61011 /*UnfoldStore=*/false, NewMIs))
61012 report_fatal_error("Failed to unfold memory operand for a KCFI check");
// Splice the unfolded instructions in before the original call; afterwards
// MBBI points at the last inserted instruction, which must be the new
// register-form call.
61013 for (auto *NewMI : NewMIs)
61014 MBBI = MBB.insert(OrigCall, NewMI);
61015 assert(MBBI->isCall() &&
61016 "Unexpected instruction after memory operand unfolding");
// Carry over additional call info and the CFI type from the original call to
// the replacement, then delete the original.
61017 if (OrigCall->shouldUpdateAdditionalCallInfo())
61018 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
61019 MBBI->setCFIType(MF, OrigCall->getCFIType());
61020 OrigCall->eraseFromParent();
61021 break;
61022 }
61023 default:
61024 break;
61025 }
61026
// Determine the register holding the call target. For register-indirect
// calls it is operand 0; direct calls reaching this point must be indirect
// thunk calls, which always use R11 (asserted via the "_r11" symbol suffix).
61027 MachineOperand &Target = MBBI->getOperand(0);
61028 Register TargetReg;
61029 switch (MBBI->getOpcode()) {
61030 case X86::CALL64r:
61031 case X86::CALL64r_NT:
61032 case X86::TAILJMPr64:
61033 case X86::TAILJMPr64_REX:
61034 assert(Target.isReg() && "Unexpected target operand for an indirect call");
61035 Target.setIsRenamable(false);
61036 TargetReg = Target.getReg();
61037 break;
61038 case X86::CALL64pcrel32:
61039 case X86::TAILJMPd64:
61040 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
61041 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
61042 // 64-bit indirect thunk calls.
61043 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
61044 "Unexpected register for an indirect thunk call");
61045 TargetReg = X86::R11;
61046 break;
61047 default:
61048 llvm_unreachable("Unexpected CFI call opcode");
61049 break;
61050 }
61051
// Emit the KCFI_CHECK pseudo with the target register and the expected CFI
// type immediately before the call.
61052 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
61053 .addReg(TargetReg)
61054 .addImm(MBBI->getCFIType())
61055 .getInstr();
61056}
61057
61058/// Returns true if stack probing through a function call is requested.
61060 return !getStackProbeSymbolName(MF).empty();
61061}
61062
61063/// Returns true if stack probing through inline assembly is requested.
61065
61066 // No inline stack probe for Windows, they have their own mechanism.
61067 if (Subtarget.isOSWindows() ||
61068 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
61069 return false;
61070
61071 // If the function specifically requests inline stack probes, emit them.
61072 if (MF.getFunction().hasFnAttribute("probe-stack"))
61073 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
61074 "inline-asm";
61075
61076 return false;
61077}
61078
61079/// Returns the name of the symbol used to emit stack probes or the empty
61080/// string if not applicable.
61083 // Inline Stack probes disable stack probe call
61084 if (hasInlineStackProbe(MF))
61085 return "";
61086
61087 // If the function specifically requests stack probes, emit them.
61088 if (MF.getFunction().hasFnAttribute("probe-stack"))
61089 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
61090
61091 // Generally, if we aren't on Windows, the platform ABI does not include
61092 // support for stack probes, so don't emit them.
61093 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
61094 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
61095 return "";
61096
61097 // We need a stack probe to conform to the Windows ABI. Choose the right
61098 // symbol.
61099 if (Subtarget.is64Bit())
61100 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
61101 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
61102}
61103
61104unsigned
61106 // The default stack probe size is 4096 if the function has no stackprobesize
61107 // attribute.
61108 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
61109 4096);
61110}
61111
61113 if (ML && ML->isInnermost() &&
61114 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
61117}
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
static const LLT S1
static const LLT F64
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
Definition: BitTracker.cpp:73
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Looks at all the uses of the given value Returns the Liveness deduced from the uses of this value Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses If the result is MaybeLiveUses might be modified but its content should be ignored(since it might not be complete). DeadArgumentEliminationPass
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static Value * getMask(Value *WideMask, unsigned Factor, VectorType *LeafValueTy)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:557
Live Register Matrix
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static bool isUndef(const MachineInstr &MI)
unsigned const TargetRegisterInfo * TRI
#define R2(n)
#define T1
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static StringRef substr(StringRef Str, uint64_t Len)
This file implements the SmallBitVector class.
This file defines the SmallSet class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:245
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
@ ShiftBit
@ UndefBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue V)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::P...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as a zero or any extension.
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to map a 128-bit or larger integer comparison to vector instructions before type legalization spl...
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, SDValue And1_L, SDValue And1_R, const SDLoc &DL, SelectionDAG &DAG)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask, is an in-place blend/select mask or is undef.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, unsigned Reg)
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, bool AllowTruncate)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG)
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isLegalConversion(MVT VT, bool IsSigned, const X86Subtarget &Subtarget)
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, bool HasVariableMask, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L, H].
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector of zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)) This undoes the inverse fold performed in InstCom...
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext) zext(add_nuw(x, C)) --> add(zext(x), C_zext) Promoting a...
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget)
Try to lower a vector shuffle as a bit shift (shifts in zeros).
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If a vector select has an operand that is -1 or 0, try to simplify the select to a bitwise logic oper...
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, SDValue Root, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an unary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to lower a vector shuffle as a byte rotation.
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB)
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG)
Fold "masked merge" expressions like (m & x) | (~m & y) into the equivalent ((x ^ y) & m) ^ y patter...
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code? Current x86 isa includes the foll...
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
auto IsFreeTruncation — local lambda helper (guarded by an isa<SExtInst>(LHS) check; declaration garbled in extraction)
static const unsigned FramePtr
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5463
static APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition: APFloat.cpp:5488
void clearSign()
Definition: APFloat.h:1300
opStatus next(bool nextDown)
Definition: APFloat.h:1256
void changeSign()
Definition: APFloat.h:1299
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1081
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:493
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1007
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1492
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:910
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:206
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
APInt abs() const
Get the absolute value.
Definition: APInt.h:1773
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition: APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1079
int32_t exactLogBase2() const
Definition: APInt.h:1761
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1607
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1577
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:624
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1511
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1434
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1594
void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:370
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1417
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:471
bool isMask(unsigned numBits) const
Definition: APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1150
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:959
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1389
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:455
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1635
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:399
APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:947
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:652
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:764
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:768
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:396
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:893
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
Value * getCalledOperand() const
Definition: InstrTypes.h:1334
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition: Constants.cpp:3007
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1597
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:403
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:435
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Tagged union holding either a T or a Error.
Definition: Error.h:481
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:128
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:713
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:778
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:710
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:911
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1048
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
Definition: GlobalValue.h:568
bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:424
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:272
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:121
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
Class to represent integer types.
Definition: DerivedTypes.h:42
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:74
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:176
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:661
MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:241
MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:246
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:307
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
unsigned succ_size() const
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:71
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:354
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:695
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:121
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the node's value type is undefined.
iterator_range< user_iterator > users()
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:371
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:953
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:751
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:983
SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:499
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:802
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:458
SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:761
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:857
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:828
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:505
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:701
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:797
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:874
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
std::optional< uint64_t > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
Definition: SelectionDAG.h:907
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:937
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
void reserve(size_type NumEntries)
Definition: SmallPtrSet.h:112
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
iterator erase(const_iterator CI)
Definition: SmallVector.h:737
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:578
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:286
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:571
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
size_t size_type
Definition: StringRef.h:57
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
static constexpr size_t npos
Definition: StringRef.h:53
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:176
size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C or npos if not found.
Definition: StringRef.cpp:253
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:406
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:81
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:701
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition: Triple.h:758
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:588
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
static IntegerType * getInt1Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:411
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
uint64_t getArrayNumElements() const
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:64
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
use_iterator use_begin()
Definition: Value.h:360
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1094
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const
bool hasBasePointer(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:236
bool hasAnyFMA() const
Definition: X86Subtarget.h:203
bool isOSWindows() const
Definition: X86Subtarget.h:329
bool isTargetMachO() const
Definition: X86Subtarget.h:293
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:221
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool hasBitScanPassThrough() const
Definition: X86Subtarget.h:269
bool isPICStyleGOT() const
Definition: X86Subtarget.h:337
bool hasSSE42() const
Definition: X86Subtarget.h:198
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:118
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:281
bool canUseCMOV() const
Definition: X86Subtarget.h:192
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:340
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:305
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:185
bool isTargetDarwin() const
Definition: X86Subtarget.h:285
bool isTargetWin64() const
Definition: X86Subtarget.h:333
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:178
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:283
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:122
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:346
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:232
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool isTargetELF() const
Definition: X86Subtarget.h:291
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:209
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:186
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasInt256() const
Definition: X86Subtarget.h:202
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:338
bool isTargetCygMing() const
Definition: X86Subtarget.h:325
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:289
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:132
bool hasAVX() const
Definition: X86Subtarget.h:199
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:317
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:321
bool isTargetNaCl64() const
Definition: X86Subtarget.h:301
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:124
bool useBWIRegs() const
Definition: X86Subtarget.h:262
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:200
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y —> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return prefered fold type, Abs if this is a vector, AddAnd if its an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the targets addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition: APInt.cpp:2982
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attemps to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1205
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1201
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:491
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1368
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:153
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:512
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1348
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1234
@ ConstantFP
Definition: ISDOpcodes.h:77
@ STRICT_FATAN2
Definition: ISDOpcodes.h:428
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1350
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1320
@ STRICT_FCEIL
Definition: ISDOpcodes.h:441
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1351
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:130
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1081
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ STRICT_FTANH
Definition: ISDOpcodes.h:431
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1110
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:999
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:157
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1333
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:451
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1307
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1312
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:871
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:492
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ STRICT_FLOG2
Definition: ISDOpcodes.h:436
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1346
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1347
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1278
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1502
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1181
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:141
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ STRICT_FASIN
Definition: ISDOpcodes.h:425
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:685
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:465
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1126
@ STRICT_FATAN
Definition: ISDOpcodes.h:427
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1300
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:788
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1156
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1349
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1135
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1399
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1316
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:642
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1230
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:445
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:931
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:967
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:966
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1344
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:450
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:439
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1290
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:907
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:440
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ STRICT_FSINH
Definition: ISDOpcodes.h:429
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1352
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:120
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1294
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1120
@ ConstantPool
Definition: ISDOpcodes.h:82
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:860
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ STRICT_FROUND
Definition: ISDOpcodes.h:443
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:464
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1380
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:975
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:442
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:444
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1342
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:458
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:457
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1343
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1261
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1287
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1086
@ STRICT_FCOSH
Definition: ISDOpcodes.h:430
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:976
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:680
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:435
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition: ISDOpcodes.h:223
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_FEXP2
Definition: ISDOpcodes.h:433
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1341
@ ExternalSymbol
Definition: ISDOpcodes.h:83
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition: ISDOpcodes.h:669
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:882
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:438
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:906
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:147
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1225
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1149
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1400
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:437
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1091
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:692
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1284
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ STRICT_FACOS
Definition: ISDOpcodes.h:426
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1694
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1689
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1506
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1676
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1651
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1618
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1598
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1657
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:756
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:664
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:214
@ FS
Definition: X86.h:211
@ PTR64
Definition: X86.h:215
@ PTR32_SPTR
Definition: X86.h:213
@ GS
Definition: X86.h:210
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:411
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:391
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:488
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:450
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:432
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:456
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:438
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:476
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:403
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:363
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:472
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:460
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:425
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:480
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:444
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:419
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:387
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true of the given offset can be fit into displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:47
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:121
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
@ Length
Definition: DWP.cpp:480
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:139
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1565
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:360
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either a integer or FP constant.
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg)
Replace the address used in the instruction with the direct memory reference.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:297
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition: STLExtras.h:2055
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1547
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:347
bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
unsigned M1(unsigned Val)
Definition: VE.h:376
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:155
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:160
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to the unary pattern.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it).
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1978
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed elements.
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1866
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ SM_SentinelUndef
@ SM_SentinelZero
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
Definition: STLExtras.h:1938
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1945
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
const char * toString(DWARFSectionKind Kind)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2087
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ TRUNCATE2_TO_REG
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1624
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – that is, a dereference of an address in a register, with no scale, index or displacement. An example is: DWORD PTR [EAX].
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
#define EQ(a, b)
Definition: regexec.c:112
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:257
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:306
static const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:280
static const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:259
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:315
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:258
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:256
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:318
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
Definition: KnownBits.cpp:765
static std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:488
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:178
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:100
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:79
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:234
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition: KnownBits.h:266
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:153
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:281
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:85
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:43
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:164
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:53
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition: KnownBits.h:103
static KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:228
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:217
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:288
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:303
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:172
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:188
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:137
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:97
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:91
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:804
static std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:526
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:82
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:59
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.