1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
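// Illustrative sketch of the transform that this cost threshold gates (an
// exposition-only example, not code from this file): a short-circuited test
//   if (a == b && c > d) { ... }
// can be emitted as two conditional branches, or both comparisons can be
// evaluated branchlessly and merged into a single branch on the combined
// condition; merging removes a branch at the cost of extra ALU instructions,
// which is exactly the trade-off the threshold above controls.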
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
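// Illustrative examples of the rewrites this enables (sketch, not from this
// file):
//   return x * 9;    // x + (x << 3), a single LEA on x86
//   return x * 40;   // (x + (x << 2)) << 3, i.e. LEA + SHL
// Both are typically cheaper than a general-purpose IMUL.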
131
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
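// Illustrative sketch of what the 64->32 bypass amounts to (an assumption
// about the CodeGenPrepare expansion, shown as C for clarity):
//   if (((a | b) >> 32) == 0)          // both operands fit in 32 bits
//     q = (uint32_t)a / (uint32_t)b;   // cheap 32-bit DIV
//   else
//     q = a / b;                       // full-width DIV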
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
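// Illustrative sketch (an assumption about the flag usage, not code from this
// file): after UCOMISS/UCOMISD, "unordered" is reported through PF, so both
// of these predicates need two flag tests:
//   oeq = ZF && !PF;   // equal and not unordered
//   une = !ZF || PF;   // not equal or unordered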
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
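// For reference, an i32 funnel shift with 0 < s < 32 computes
//   fshl(x, y, s) == (x << s) | (y >> (32 - s))
// which is exactly what SHLD produces, hence Legal above except on
// slow-SHLD targets, where the double-shift form is only used for code size.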
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
305
306 if (!Subtarget.is64Bit()) {
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
329 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
330 MVT::v4i64}) {
333 }
334 if (Subtarget.hasAVX10_2_512()) {
337 }
338 if (Subtarget.is64Bit()) {
341 }
342 }
343
344 // Handle address space casts between mixed sized pointers.
347
348 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
349 if (!Subtarget.hasSSE2()) {
354 if (Subtarget.is64Bit()) {
356 // Without SSE, i64->f64 goes through memory.
358 }
359 } else if (!Subtarget.is64Bit())
361
362 // Scalar integer divide and remainder are lowered to use operations that
363 // produce two results, to match the available instructions. This exposes
364 // the two-result form to trivial CSE, which is able to combine x/y and x%y
365 // into a single instruction.
366 //
367 // Scalar integer multiply-high is also lowered to use two-result
368 // operations, to match the available instructions. However, plain multiply
369 // (low) operations are left as Legal, as there are single-result
370 // instructions for this in x86. Using the two-result multiply instructions
371 // when both high and low results are needed must be arranged by dagcombine.
372 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
379 }
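// Illustrative example of the CSE this enables (not from this file):
//   unsigned q = x / y;
//   unsigned r = x % y;
// becomes a single UDIVREM node and selects to one DIV, which already leaves
// the quotient in EAX/RAX and the remainder in EDX/RDX.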
380
381 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
383 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
384 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
387 }
388 if (Subtarget.is64Bit())
393
394 setOperationAction(ISD::FREM , MVT::f32 , Expand);
395 setOperationAction(ISD::FREM , MVT::f64 , Expand);
396 setOperationAction(ISD::FREM , MVT::f80 , Expand);
397 setOperationAction(ISD::FREM , MVT::f128 , Expand);
398
399 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
405 }
406
407 // Promote the i8 variants and force them on up to i32 which has a shorter
408 // encoding.
409 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
411 // Promote i16 as well: tzcntw has a false dependency on Intel CPUs, and for
412 // BSF we emit a REP prefix so that modern CPUs decode it as TZCNT, which
413 // makes promoting that case worthwhile too.
414 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
416
417 if (!Subtarget.hasBMI()) {
418 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
420 if (Subtarget.is64Bit()) {
421 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
423 }
424 }
425
426 if (Subtarget.hasLZCNT()) {
427 // When promoting the i8 variants, force them to i32 for a shorter
428 // encoding.
429 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
431 } else {
432 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
433 if (VT == MVT::i64 && !Subtarget.is64Bit())
434 continue;
437 }
438 }
439
442 // Special handling for half-precision floating point conversions.
443 // If we don't have F16C support, then lower half float conversions
444 // into library calls.
446 Op, MVT::f32,
447 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
448 // There's never any support for operations beyond MVT::f32.
449 setOperationAction(Op, MVT::f64, Expand);
450 setOperationAction(Op, MVT::f80, Expand);
451 setOperationAction(Op, MVT::f128, Expand);
452 }
453
454 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
457 }
458
459 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
460 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
461 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
462 setTruncStoreAction(VT, MVT::f16, Expand);
463 setTruncStoreAction(VT, MVT::bf16, Expand);
464
467 }
468
472 if (Subtarget.is64Bit())
474 if (Subtarget.hasPOPCNT()) {
475 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
476 // popcntw is longer to encode than popcntl and also has a false dependency
477 // on its destination, which popcntl has not had since Cannon Lake.
478 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
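// Illustrative sketch of the promotion's effect (assumed codegen):
//   unsigned cnt16(uint16_t x) { return __builtin_popcount(x); }
// zero-extends to 32 bits and uses popcntl, sidestepping popcntw's extra
// operand-size prefix and its false output dependency.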
479 } else {
484 }
485
487
488 if (!Subtarget.hasMOVBE())
490
491 // X86 wants to expand cmov itself.
492 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
497 }
498 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
499 if (VT == MVT::i64 && !Subtarget.is64Bit())
500 continue;
503 }
504
505 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
508
510 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
511 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
515
516 // Darwin ABI issue.
517 for (auto VT : { MVT::i32, MVT::i64 }) {
518 if (VT == MVT::i64 && !Subtarget.is64Bit())
519 continue;
526 }
527
528 // 64-bit shl, sra, srl (iff 32-bit x86)
529 for (auto VT : { MVT::i32, MVT::i64 }) {
530 if (VT == MVT::i64 && !Subtarget.is64Bit())
531 continue;
535 }
536
537 if (Subtarget.hasSSEPrefetch())
539
541
542 // Expand certain atomics
543 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
551 }
552
553 if (!Subtarget.is64Bit())
555
556 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
557 // All CPUs supporting AVX will atomically load/store aligned 128-bit
558 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
561 }
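// Illustrative example (an assumption about the resulting code): with AVX, an
// aligned sequentially consistent load such as
//   __int128 v = __atomic_load_n(p, __ATOMIC_SEQ_CST);
// can be emitted as a single 16-byte vector load instead of a CMPXCHG16B loop.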
562
563 if (Subtarget.canUseCMPXCHG16B())
565
566 // FIXME - use subtarget debug flags
567 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
568 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
569 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
571 }
572
575
578
579 setOperationAction(ISD::TRAP, MVT::Other, Legal);
581 if (Subtarget.isTargetPS())
583 else
585
586 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
588 setOperationAction(ISD::VAEND , MVT::Other, Expand);
589 bool Is64Bit = Subtarget.is64Bit();
590 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
591 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
592
595
597
598 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
601
603
604 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
605 setOperationAction(ISD::FABS, VT, Action);
606 setOperationAction(ISD::FNEG, VT, Action);
608 setOperationAction(ISD::FREM, VT, Action);
609 setOperationAction(ISD::FMA, VT, Action);
610 setOperationAction(ISD::FMINNUM, VT, Action);
611 setOperationAction(ISD::FMAXNUM, VT, Action);
616 setOperationAction(ISD::FSIN, VT, Action);
617 setOperationAction(ISD::FCOS, VT, Action);
618 setOperationAction(ISD::FSINCOS, VT, Action);
619 setOperationAction(ISD::FTAN, VT, Action);
620 setOperationAction(ISD::FSQRT, VT, Action);
621 setOperationAction(ISD::FPOW, VT, Action);
622 setOperationAction(ISD::FPOWI, VT, Action);
623 setOperationAction(ISD::FLOG, VT, Action);
624 setOperationAction(ISD::FLOG2, VT, Action);
625 setOperationAction(ISD::FLOG10, VT, Action);
626 setOperationAction(ISD::FEXP, VT, Action);
627 setOperationAction(ISD::FEXP2, VT, Action);
628 setOperationAction(ISD::FEXP10, VT, Action);
629 setOperationAction(ISD::FCEIL, VT, Action);
630 setOperationAction(ISD::FFLOOR, VT, Action);
632 setOperationAction(ISD::FRINT, VT, Action);
633 setOperationAction(ISD::BR_CC, VT, Action);
634 setOperationAction(ISD::SETCC, VT, Action);
637 setOperationAction(ISD::FROUND, VT, Action);
639 setOperationAction(ISD::FTRUNC, VT, Action);
640 setOperationAction(ISD::FLDEXP, VT, Action);
641 };
642
643 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
644 // f16, f32 and f64 use SSE.
645 // Set up the FP register classes.
646 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
647 : &X86::FR16RegClass);
648 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
649 : &X86::FR32RegClass);
650 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
651 : &X86::FR64RegClass);
652
653 // Disable f32->f64 extload as we can only generate this in one instruction
654 // under optsize. So it's easier to pattern match (fpext (load)) for that
655 // case instead of needing to emit 2 instructions for extload in the
656 // non-optsize case.
657 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
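// Illustrative example of the pattern in question (assumption):
//   double widen(const float *p) { return *p; }   // (fpext (load f32))
// With the extload disabled this stays a separate load plus CVTSS2SD, and the
// single memory-operand form is only folded back in under optsize via the
// (fpext (load)) pattern mentioned above.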
658
659 for (auto VT : { MVT::f32, MVT::f64 }) {
660 // Use ANDPD to simulate FABS.
662
663 // Use XORP to simulate FNEG.
665
666 // Use ANDPD and ORPD to simulate FCOPYSIGN.
668
669 // These might be better off as horizontal vector ops.
672
673 // We don't support sin/cos/fmod
677 }
678
679 // Half type will be promoted by default.
680 setF16Action(MVT::f16, Promote);
691
724
725 // Lower this to MOVMSK plus an AND.
728
729 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
730 (UseX87 || Is64Bit)) {
731 // Use SSE for f32, x87 for f64.
732 // Set up the FP register classes.
733 addRegisterClass(MVT::f32, &X86::FR32RegClass);
734 if (UseX87)
735 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
736
737 // Use ANDPS to simulate FABS.
739
740 // Use XORP to simulate FNEG.
742
743 if (UseX87)
745
746 // Use ANDPS and ORPS to simulate FCOPYSIGN.
747 if (UseX87)
750
751 // We don't support sin/cos/fmod
755
756 if (UseX87) {
757 // Always expand sin/cos functions even though x87 has an instruction.
761 }
762 } else if (UseX87) {
763 // f32 and f64 in x87.
764 // Set up the FP register classes.
765 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
766 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
767
768 for (auto VT : { MVT::f32, MVT::f64 }) {
771
772 // Always expand sin/cos functions even though x87 has an instruction.
776 }
777 }
778
779 // Expand FP32 immediates into loads from the stack, save special cases.
780 if (isTypeLegal(MVT::f32)) {
781 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
782 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
783 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
784 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
785 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
786 } else // SSE immediates.
787 addLegalFPImmediate(APFloat(+0.0f)); // xorps
788 }
789 // Expand FP64 immediates into loads from the stack, save special cases.
790 if (isTypeLegal(MVT::f64)) {
791 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
792 addLegalFPImmediate(APFloat(+0.0)); // FLD0
793 addLegalFPImmediate(APFloat(+1.0)); // FLD1
794 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
795 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
796 } else // SSE immediates.
797 addLegalFPImmediate(APFloat(+0.0)); // xorpd
798 }
799 // Support fp16 0 immediate.
800 if (isTypeLegal(MVT::f16))
801 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
802
803 // Handle constrained floating-point operations of scalar.
816
817 // We don't support FMA.
820
821 // f80 always uses X87.
822 if (UseX87) {
823 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
826 {
828 addLegalFPImmediate(TmpFlt); // FLD0
829 TmpFlt.changeSign();
830 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
831
832 bool ignored;
833 APFloat TmpFlt2(+1.0);
835 &ignored);
836 addLegalFPImmediate(TmpFlt2); // FLD1
837 TmpFlt2.changeSign();
838 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
839 }
840
841 // Always expand sin/cos functions even though x87 has an instruction.
842 // clang-format off
854 // clang-format on
855
867
868 // Handle constrained floating-point operations of scalar.
875 if (isTypeLegal(MVT::f16)) {
878 } else {
880 }
881 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
882 // as Custom.
884 }
885
886 // f128 uses xmm registers, but most operations require libcalls.
887 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
888 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
889 : &X86::VR128RegClass);
890
891 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
892
903
907
908 // clang-format off
916 // clang-format on
917 // No STRICT_FSINCOS
920
923 // We need to custom handle any FP_ROUND with an f128 input, but
924 // LegalizeDAG uses the result type to know when to run a custom handler.
925 // So we have to list all legal floating point result types here.
926 if (isTypeLegal(MVT::f32)) {
929 }
930 if (isTypeLegal(MVT::f64)) {
933 }
934 if (isTypeLegal(MVT::f80)) {
938 }
939
941
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
943 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
944 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
946 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
947 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
948 }
949
950 // Always use a library call for pow.
951 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
953 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
954 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
955
964
965 // Some FP actions are always expanded for vector types.
966 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
967 MVT::v4f32, MVT::v8f32, MVT::v16f32,
968 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
969 // clang-format off
983 // clang-format on
984 }
985
986 // First set operation action for all vector types to either promote
987 // (for widening) or expand (for scalarization). Then we will selectively
988 // turn on ones that can be effectively codegen'd.
1028 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1029 setTruncStoreAction(InnerVT, VT, Expand);
1030
1031 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1032 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1033
1034 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1035 // types, we have to deal with them whether we ask for Expansion or not.
1036 // Setting Expand causes its own optimisation problems though, so leave
1037 // them legal.
1038 if (VT.getVectorElementType() == MVT::i1)
1039 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1040
1041 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1042 // split/scalarized right now.
1043 if (VT.getVectorElementType() == MVT::f16 ||
1044 VT.getVectorElementType() == MVT::bf16)
1045 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1046 }
1047 }
1048
1049 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1050 // with -msoft-float, disable use of MMX as well.
1051 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1052 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1053 // No operations on x86mmx supported, everything uses intrinsics.
1054 }
1055
1056 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1057 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1058 : &X86::VR128RegClass);
1059
1064
1065 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1066 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1074
1075 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1076 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1078
1084 }
1085
1086 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1087 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1088 : &X86::VR128RegClass);
1089
1090 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1091 // registers cannot be used even for integer operations.
1092 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1101 : &X86::VR128RegClass);
1102
1103 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1108 }
1109
1110 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1111 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1116 }
1117
1118 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1119 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1120 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1121
1122 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1123 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1128 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1129 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1130 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1131 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1134
1135 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1136 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1137 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1138
1139 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1141 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1143
1144 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1145 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1146
1147 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1148 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1149 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1150 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1151 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1152 }
1153
1164
1169
1170 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1176
1177 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1178 // setcc all the way to isel and prefer SETGT in some isel patterns.
1181 }
1182
1183 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1184 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1189
1190 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1196 }
1197
1198 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1202
1203 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1204 continue;
1205
1208 }
1209 setF16Action(MVT::v8f16, Expand);
1210 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1213 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1214 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1215 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1217
1218 // Custom lower v2i64 and v2f64 selects.
1225
1232
1233 // Custom legalize these to avoid over promotion or custom promotion.
1234 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1239 }
1240
1245
1248
1251
1252 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1257
1262
1263 // We want to legalize this to an f64 load rather than an i64 load on
1264 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1265 // store.
1266 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1267 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1268 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1269 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1270 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1272
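// Illustrative sketch (assumed legalization strategy): a small vector load like
//   typedef int v2i32 __attribute__((vector_size(8)));
//   v2i32 get(const v2i32 *p) { return *p; }
// becomes one 64-bit scalar load plus a bitcast to the vector type rather than
// two scalarized i32 loads, and the Custom stores above mirror that.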
1273 // Add 32-bit vector stores to help vectorization opportunities.
1274 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1276
1280 if (!Subtarget.hasAVX512())
1282
1286
1288
1305
1306 // In the customized shift lowering, the legal v4i32/v2i64 cases
1307 // in AVX2 will be recognized.
1308 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1312 if (VT == MVT::v2i64) continue;
1317 }
1318
1324 }
1325
1326 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1331
1332 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1334 }
1335 }
1336
1337 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1338 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1339 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1340 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1341
1342 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1345 }
1346
1347 // These might be better off as horizontal vector ops.
1352 }
1353
1354 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1355 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1358 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1362 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1368
1370 }
1371
1372 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1373 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1375 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1376 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1377 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1378 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1379 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1380
1384
1385 // FIXME: Do we need to handle scalar-to-vector here?
1386 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1387 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1388
1389 // We directly match byte blends in the backend as they match the VSELECT
1390 // condition form.
1392
1393 // SSE41 brings specific instructions for doing vector sign extend even in
1394 // cases where we don't have SRA.
1395 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1398 }
1399
1400 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1401 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1402 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1406 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1407 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1408 }
1409
1410 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1411 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1412 // do the pre- and post-work in the vector domain.
1415 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1416 // so that DAG combine doesn't try to turn it into uint_to_fp.
1419 }
1420 }
1421
1422 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1424 }
1425
1426 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1427 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1428 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1431 }
1432
1433 // XOP can efficiently perform BITREVERSE with VPPERM.
1434 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1436 }
1437
1438 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1439 bool HasInt256 = Subtarget.hasInt256();
1440
1441 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1454 : &X86::VR256RegClass);
1455
1456 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1469
1471
1475
1481 }
1482
1483 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1484 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1485
1486 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1487 // even though v8i16 is a legal type.
1488 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1490 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1491 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1495
1502
1514
1515 if (!Subtarget.hasAVX512())
1517
1518 // In the customized shift lowering, the legal v8i32/v4i64 cases
1519 // in AVX2 will be recognized.
1520 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1526 if (VT == MVT::v4i64) continue;
1531 }
1532
1533 // These types need custom splitting if their input is a 128-bit vector.
1538
1542 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1543 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1546
1547 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1551 }
1552
1557
1558 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1563
1564 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1565 // setcc all the way to isel and prefer SETGT in some isel patterns.
1568 }
1569
1570 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1571 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1576
1577 if (Subtarget.hasAnyFMA()) {
1578 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1579 MVT::v2f64, MVT::v4f64 }) {
1582 }
1583 }
1584
1585 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1586 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1587 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1588 }
1589
1590 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1591 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1592 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1593 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1594
1595 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1596 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1597 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1598 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1599 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1600 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1601 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1602 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1603
1604 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1605 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1606
1607 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1610 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1611 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1612
1613 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1619 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1620 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1625
1626 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1627 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1630 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1631 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1632 }
1633
1634 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1637 }
1638
1639 if (HasInt256) {
1640 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1641 // when we have a 256-bit-wide blend with immediate.
1644
1645 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1646 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1647 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1651 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1652 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1653 }
1654 }
1655
1656 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1657 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1658 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1660 }
1661
1662 // Extract subvector is special because the value type
1663 // (result) is 128-bit but the source is 256-bit wide.
1664 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1665 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1667 }
1668
1669 // Custom lower several nodes for 256-bit types.
1670 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1671 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1681 }
1682 setF16Action(MVT::v16f16, Expand);
1683 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1686 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1688 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1689 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1690
1691 if (HasInt256) {
1693
1694 // Custom legalize 2x32 to get a little better code.
1697
1698 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1699 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1701 }
1702 }
1703
1704 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1705 Subtarget.hasF16C()) {
1706 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1709 }
1710 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1713 }
1714 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1715 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1716 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1717 }
1718 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1719 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1720 }
1721
1722 // This block controls legalization of the mask vector sizes that are
1723 // available with AVX512. 512-bit vectors are in a separate block controlled
1724 // by useAVX512Regs.
1725 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1726 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1727 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1728 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1729 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1730 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1731
1735
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1738 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1739 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1742 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1743 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1751
1752 // There is no byte sized k-register load or store without AVX512DQ.
1753 if (!Subtarget.hasDQI()) {
1754 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1756 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1757 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1758
1763 }
1764
1765 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1766 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1770 }
1771
1772 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1774
1775 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1779
1786 }
1787
1788 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1790 }
1791 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1792 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1795 }
1796 }
1797
1798 // This block controls legalization for 512-bit operations with 8/16/32/64-bit
1799 // elements. 512-bit operations can be disabled based on the prefer-vector-width
1800 // and required-vector-width function attributes.
1801 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1802 bool HasBWI = Subtarget.hasBWI();
1803
1804 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1809 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1810 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1811
1812 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1813 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1816 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1817 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1818 if (HasBWI)
1819 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1820 }
1821
1822 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1833 }
1834 setOperationAction(ISD::LRINT, MVT::v16f32,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 setOperationAction(ISD::LRINT, MVT::v8f64,
1837 Subtarget.hasDQI() ? Legal : Custom);
1838 if (Subtarget.hasDQI())
1839 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1840
1841 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1846 }
1847
1848 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1853 }
1854
1861
1873
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1875 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1876 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1877 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1878 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1879 if (HasBWI)
1880 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1881
1882 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1883 // to 512-bit rather than use the AVX2 instructions so that we can use
1884 // k-masks.
1885 if (!Subtarget.hasVLX()) {
1886 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1887 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1890 }
1891 }
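// Illustrative sketch of the widening strategy (assumption): a masked v8f32
// load is handled roughly as
//   1. extend the v8i1 mask to v16i1 with the upper lanes false,
//   2. perform a 512-bit masked load using a k-register, then
//   3. extract the low 256 bits of the result.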
1892
1894 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1895 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1905
1906 if (HasBWI) {
1907 // Extends from v64i1 masks to 512-bit vectors.
1911 }
1912
1913 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1926
1928 }
1929
1930 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1933 }
1934
1935 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1937 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1938 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1939
1940 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1941 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1942 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1943 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1944
1945 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1946 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1947 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1948 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1949 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1950 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1951 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1952 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1953
1954 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1955 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1956
1957 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1967
1968 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1969 // setcc all the way to isel and prefer SETGT in some isel patterns.
1972 }
1973
1974 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1975 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1980
1981 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1988 }
1989
1990 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1991 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1992 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1994 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1996 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1997 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2002 }
2003
2004 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2008 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2009 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2010
2011 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2015 setOperationAction(Opc, MVT::v8i64, Custom);
2016
2017 if (Subtarget.hasDQI())
2018 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2019
2020 if (Subtarget.hasCDI()) {
2021 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2022 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2024 }
2025 } // Subtarget.hasCDI()
2026
2027 if (Subtarget.hasVPOPCNTDQ()) {
2028 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2030 }
2031
2032 // Extract subvector is special because the value type
2033 // (result) is 256-bit but the source is 512-bit wide.
2034 // 128-bit was made Legal under AVX1.
2035 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2036 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2038
2039 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2040 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2050 }
2051 setF16Action(MVT::v32f16, Expand);
2056 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2057 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2058 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2059
2060 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2065 }
2066 if (HasBWI) {
2067 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2070 }
2071 } else {
2072 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2073 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2074 }
2075
2076 if (Subtarget.hasVBMI2()) {
2077 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2080 }
2081
2082 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2083 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2084 }
2085
2086 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2087 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2089 }// useAVX512Regs
2090
2091 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2092 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2093 MVT::v4i64}) {
2096 }
2097 }
2098
2099 // This block controls legalization for operations that don't have
2100 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2101 // narrower widths.
2102 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2103 // These operations are handled on non-VLX by artificially widening in
2104 // isel patterns.
2105
2109
2110 if (Subtarget.hasDQI()) {
2111 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2112 // v2f32 UINT_TO_FP is already custom under SSE2.
2115 "Unexpected operation action!");
2116 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2121 }
2122
2123 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2129 }
2130
2131 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2134 }
2135
2136 // Custom legalize 2x32 to get a little better code.
2139
2140 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2141 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2143
2144 if (Subtarget.hasDQI()) {
2148 setOperationAction(Opc, MVT::v2i64, Custom);
2149 setOperationAction(Opc, MVT::v4i64, Custom);
2150 }
2151 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2152 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2153 }
2154
2155 if (Subtarget.hasCDI()) {
2156 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2158 }
2159 } // Subtarget.hasCDI()
2160
2161 if (Subtarget.hasVPOPCNTDQ()) {
2162 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2164 }
2165
2166 // We can try to convert vectors to different sizes to leverage legal
2167 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2168 // then specialize to Legal below.
2169 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2170 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2171 MVT::v16i16, MVT::v8i8})
2173
2174 // Legal vpcompress depends on various AVX512 extensions.
2175 // Legal in AVX512F
2176 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2178
2179 // Legal in AVX512F + AVX512VL
2180 if (Subtarget.hasVLX())
2181 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2182 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2184
2185 // Legal in AVX512F + AVX512VBMI2
2186 if (Subtarget.hasVBMI2())
2187 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2189
2190 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2191 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2192 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2194 }
2195
2196 // This block controls legalization of v32i1/v64i1, which are available with
2197 // AVX512BW.
2198 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2199 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2200 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2201
2202 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2213 }
2214
2215 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2217
2218 // Extends from v32i1 masks to 256-bit vectors.
2222
2223 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2224 MVT::v16f16, MVT::v8f16}) {
2225 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2226 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2227 }
2228
2229 // These operations are handled on non-VLX by artificially widening in
2230 // isel patterns.
2231 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2232
2233 if (Subtarget.hasBITALG()) {
2234 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2236 }
2237 }
2238
2239 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2240 auto setGroup = [&] (MVT VT) {
2251
2264
2266
2269
2275
2281
2285 };
2286
2287 // AVX512_FP16 scalar operations
2288 setGroup(MVT::f16);
2306
2309
2310 if (Subtarget.useAVX512Regs()) {
2311 setGroup(MVT::v32f16);
2317 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2324
2329 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2331 MVT::v32i16);
2332 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2334 MVT::v32i16);
2335 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2337 MVT::v32i16);
2338 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2340 MVT::v32i16);
2341
2345
2346 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2347 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2348
2353 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2354 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2355 }
2356
2361
2362 if (Subtarget.hasVLX()) {
2363 setGroup(MVT::v8f16);
2364 setGroup(MVT::v16f16);
2365
2376
2383
2384 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2387
2391
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2394 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2395 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2396
2397 // Need to custom widen these to prevent scalarization.
2398 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2399 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2400
2405
2410 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2411 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2412 }
2413 }
2414
2415 if (!Subtarget.useSoftFloat() &&
2416 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2417 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2418 : &X86::VR128RegClass);
2419 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2420 : &X86::VR256RegClass);
2421 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2422 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2423 // Set the operation action Custom to do the customization later.
2426 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2427 setF16Action(VT, Expand);
2428 if (!Subtarget.hasBF16())
2434 }
2435 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2436 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2437 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2438 }
2439 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2440 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2442 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2443 }
2444
2445 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2446 Subtarget.useAVX512Regs()) {
2447 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2448 setF16Action(MVT::v32bf16, Expand);
2449 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2450 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2451 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2453 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2457 }
2458
2459 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2460 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2472 }
2473 if (Subtarget.hasAVX10_2_512()) {
2474 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2475 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2476 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2477 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2478 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2479 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2480 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2481 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2482 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2485 }
2486 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2489 }
2490 }
2491
2492 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2493 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2494 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2495 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2496 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2497 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2498
2499 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2500 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2501 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2502 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2503 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2504
2505 if (Subtarget.hasBWI()) {
2506 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2507 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2508 }
2509
2510 if (Subtarget.hasFP16()) {
2511 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2520 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2529 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2534 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2539 }
2540 }
2541
2542 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2543 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2544 }
2545
2546 // We want to custom lower some of our intrinsics.
2550 if (!Subtarget.is64Bit()) {
2552 }
2553
2554 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2555 // handle type legalization for these operations here.
2556 //
2557 // FIXME: We really should do custom legalization for addition and
2558 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2559 // than generic legalization for 64-bit multiplication-with-overflow, though.
2560 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2561 if (VT == MVT::i64 && !Subtarget.is64Bit())
2562 continue;
2563 // Add/Sub/Mul with overflow operations are custom lowered.
2570
2571 // Support carry in as value rather than glue.
2577 }
2578
2579 // Combine sin / cos into _sincos_stret if it is available.
2580 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2581 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2584 }
2585
2586 if (Subtarget.isTargetWin64()) {
2587 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2588 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2589 setOperationAction(ISD::SREM, MVT::i128, Custom);
2590 setOperationAction(ISD::UREM, MVT::i128, Custom);
2599 }
2600
2601 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2602 // is. We should promote the value to 64-bits to solve this.
2603 // This is what the CRT headers do - `fmodf` is an inline header
2604 // function casting to f64 and calling `fmod`.
2605 if (Subtarget.is32Bit() &&
2606 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2607 // clang-format off
2608 for (ISD::NodeType Op :
2626               // TODO: Add ISD::STRICT_FMODF too once implemented.
2627 ISD::FMODF})
2628 if (isOperationExpand(Op, MVT::f32))
2629 setOperationAction(Op, MVT::f32, Promote);
2630 // clang-format on
2631
2632 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2633 // it, but it's just a wrapper around ldexp.
2634 if (Subtarget.isOSWindows()) {
2636 if (isOperationExpand(Op, MVT::f32))
2637 setOperationAction(Op, MVT::f32, Promote);
2638 }
2639
2640 // We have target-specific dag combine patterns for the following nodes:
2651 ISD::SHL,
2652 ISD::SRA,
2653 ISD::SRL,
2654 ISD::OR,
2655 ISD::AND,
2661 ISD::ADD,
2662 ISD::FADD,
2663 ISD::FSUB,
2664 ISD::FNEG,
2665 ISD::FMA,
2669 ISD::SUB,
2670 ISD::LOAD,
2671 ISD::LRINT,
2673 ISD::MLOAD,
2674 ISD::STORE,
2691 ISD::SETCC,
2692 ISD::MUL,
2693 ISD::XOR,
2704
2706
2707 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2709 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2711 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2713
2714 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2715  // that needs to be benchmarked and balanced with the potential use of vector
2716 // load/store types (PR33329, PR33914).
2719
2720 // Default loop alignment, which can be overridden by -align-loops.
2722
2723 // An out-of-order CPU can speculatively execute past a predictable branch,
2724 // but a conditional move could be stalled by an expensive earlier operation.
2725 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2726 EnableExtLdPromotion = true;
2728
2730
2731 // Default to having -disable-strictnode-mutation on
2732 IsStrictFPEnabled = true;
2733}
2734
2735// This has so far only been implemented for 64-bit MachO.
2737 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2738}
2739
2741 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2742 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2743}
2744
2746 const SDLoc &DL) const {
2747 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2748 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2749 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2750 return SDValue(Node, 0);
2751}
2752
2755 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2756 !Subtarget.hasBWI())
2757 return TypeSplitVector;
2758
2759 // Since v8f16 is legal, widen anything over v4f16.
2760 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2761 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2762 VT.getVectorElementType() == MVT::f16)
2763 return TypeSplitVector;
2764
2765 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2766 VT.getVectorElementType() != MVT::i1)
2767 return TypeWidenVector;
2768
2770}
2771
2772FastISel *
2774 const TargetLibraryInfo *libInfo) const {
2775 return X86::createFastISel(funcInfo, libInfo);
2776}
2777
2778//===----------------------------------------------------------------------===//
2779// Other Lowering Hooks
2780//===----------------------------------------------------------------------===//
2781
2783 bool AssumeSingleUse) {
2784 if (!AssumeSingleUse && !Op.hasOneUse())
2785 return false;
2786 if (!ISD::isNormalLoad(Op.getNode()))
2787 return false;
2788
2789 // If this is an unaligned vector, make sure the target supports folding it.
2790 auto *Ld = cast<LoadSDNode>(Op.getNode());
2791 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2792 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2793 return false;
2794
2795 // TODO: If this is a non-temporal load and the target has an instruction
2796 // for it, it should not be folded. See "useNonTemporalLoad()".
2797
2798 return true;
2799}
2800
2802 const X86Subtarget &Subtarget,
2803 bool AssumeSingleUse) {
2804 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2805 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2806 return false;
2807
2808 // We can not replace a wide volatile load with a broadcast-from-memory,
2809 // because that would narrow the load, which isn't legal for volatiles.
2810 auto *Ld = cast<LoadSDNode>(Op.getNode());
2811 return !Ld->isVolatile() ||
2812 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2813}
2814
2816 if (!Op.hasOneUse())
2817 return false;
2818 // Peek through (oneuse) bitcast users
2819 SDNode *User = *Op->user_begin();
2820 while (User->getOpcode() == ISD::BITCAST) {
2821 if (!User->hasOneUse())
2822 return false;
2823 User = *User->user_begin();
2824 }
2825 return ISD::isNormalStore(User);
2826}
2827
2829 if (Op.hasOneUse()) {
2830 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2831 return (ISD::ZERO_EXTEND == Opcode);
2832 }
2833 return false;
2834}
2835
2836static bool isLogicOp(unsigned Opcode) {
2837 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2838 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2839}
2840
2841static bool isTargetShuffle(unsigned Opcode) {
2842 switch(Opcode) {
2843 default: return false;
2844 case X86ISD::BLENDI:
2845 case X86ISD::PSHUFB:
2846 case X86ISD::PSHUFD:
2847 case X86ISD::PSHUFHW:
2848 case X86ISD::PSHUFLW:
2849 case X86ISD::SHUFP:
2850 case X86ISD::INSERTPS:
2851 case X86ISD::EXTRQI:
2852 case X86ISD::INSERTQI:
2853 case X86ISD::VALIGN:
2854 case X86ISD::PALIGNR:
2855 case X86ISD::VSHLDQ:
2856 case X86ISD::VSRLDQ:
2857 case X86ISD::MOVLHPS:
2858 case X86ISD::MOVHLPS:
2859 case X86ISD::MOVSHDUP:
2860 case X86ISD::MOVSLDUP:
2861 case X86ISD::MOVDDUP:
2862 case X86ISD::MOVSS:
2863 case X86ISD::MOVSD:
2864 case X86ISD::MOVSH:
2865 case X86ISD::UNPCKL:
2866 case X86ISD::UNPCKH:
2867 case X86ISD::VBROADCAST:
2868 case X86ISD::VPERMILPI:
2869 case X86ISD::VPERMILPV:
2870 case X86ISD::VPERM2X128:
2871 case X86ISD::SHUF128:
2872 case X86ISD::VPERMIL2:
2873 case X86ISD::VPERMI:
2874 case X86ISD::VPPERM:
2875 case X86ISD::VPERMV:
2876 case X86ISD::VPERMV3:
2877 case X86ISD::VZEXT_MOVL:
2878 return true;
2879 }
2880}
2881
2882static bool isTargetShuffleVariableMask(unsigned Opcode) {
2883 switch (Opcode) {
2884 default: return false;
2885 // Target Shuffles.
2886 case X86ISD::PSHUFB:
2887 case X86ISD::VPERMILPV:
2888 case X86ISD::VPERMIL2:
2889 case X86ISD::VPPERM:
2890 case X86ISD::VPERMV:
2891 case X86ISD::VPERMV3:
2892 return true;
2893 // 'Faux' Target Shuffles.
2894 case ISD::OR:
2895 case ISD::AND:
2896 case X86ISD::ANDNP:
2897 return true;
2898 }
2899}
2900
2903 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2905 int ReturnAddrIndex = FuncInfo->getRAIndex();
2906
2907 if (ReturnAddrIndex == 0) {
2908 // Set up a frame object for the return address.
2909 unsigned SlotSize = RegInfo->getSlotSize();
2910 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2911 -(int64_t)SlotSize,
2912 false);
2913 FuncInfo->setRAIndex(ReturnAddrIndex);
2914 }
2915
2916 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2917}
2918
2920 bool HasSymbolicDisplacement) {
2921 // Offset should fit into 32 bit immediate field.
2922 if (!isInt<32>(Offset))
2923 return false;
2924
2925 // If we don't have a symbolic displacement - we don't have any extra
2926 // restrictions.
2927 if (!HasSymbolicDisplacement)
2928 return true;
2929
2930 // We can fold large offsets in the large code model because we always use
2931 // 64-bit offsets.
2932 if (CM == CodeModel::Large)
2933 return true;
2934
2935  // For the kernel code model we know that all objects reside in the negative
2936  // half of the 32-bit address space. We must not accept negative offsets, since
2937  // they may be just out of range, but we may accept pretty large positive ones.
2938 if (CM == CodeModel::Kernel)
2939 return Offset >= 0;
2940
2941  // For other non-large code models we assume that the highest small object
2942  // ends at least 16MB below the 31-bit boundary. We may also accept pretty
2943  // large negative constants, knowing that all objects are in the positive half
2944  // of the address space.
2945 return Offset < 16 * 1024 * 1024;
2946}
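// For illustration: with a symbolic displacement, CodeModel::Kernel rejects
// any negative offset (e.g. -16) but accepts large positive ones, while the
// other non-large code models accept offsets below 16MB (e.g. 1 << 20) and
// reject anything at or above 16 * 1024 * 1024.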
2947
2948/// Return true if the condition is a signed comparison operation.
2949static bool isX86CCSigned(X86::CondCode X86CC) {
2950 switch (X86CC) {
2951 default:
2952 llvm_unreachable("Invalid integer condition!");
2953 case X86::COND_E:
2954 case X86::COND_NE:
2955 case X86::COND_B:
2956 case X86::COND_A:
2957 case X86::COND_BE:
2958 case X86::COND_AE:
2959 return false;
2960 case X86::COND_G:
2961 case X86::COND_GE:
2962 case X86::COND_L:
2963 case X86::COND_LE:
2964 return true;
2965 }
2966}
2967
2969 switch (SetCCOpcode) {
2970 // clang-format off
2971 default: llvm_unreachable("Invalid integer condition!");
2972 case ISD::SETEQ: return X86::COND_E;
2973 case ISD::SETGT: return X86::COND_G;
2974 case ISD::SETGE: return X86::COND_GE;
2975 case ISD::SETLT: return X86::COND_L;
2976 case ISD::SETLE: return X86::COND_LE;
2977 case ISD::SETNE: return X86::COND_NE;
2978 case ISD::SETULT: return X86::COND_B;
2979 case ISD::SETUGT: return X86::COND_A;
2980 case ISD::SETULE: return X86::COND_BE;
2981 case ISD::SETUGE: return X86::COND_AE;
2982 // clang-format on
2983 }
2984}
2985
2986/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2987/// condition code, returning the condition code and the LHS/RHS of the
2988/// comparison to make.
2990 bool isFP, SDValue &LHS, SDValue &RHS,
2991 SelectionDAG &DAG) {
2992 if (!isFP) {
2993 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2994 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2995 // X > -1 -> X == 0, jump !sign.
2996 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2997 return X86::COND_NS;
2998 }
2999 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
3000 // X < 0 -> X == 0, jump on sign.
3001 return X86::COND_S;
3002 }
3003 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3004 // X >= 0 -> X == 0, jump on !sign.
3005 return X86::COND_NS;
3006 }
3007 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3008 // X < 1 -> X <= 0
3009 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3010 return X86::COND_LE;
3011 }
3012 }
3013
3014 return TranslateIntegerX86CC(SetCCOpcode);
3015 }
3016
3017 // First determine if it is required or is profitable to flip the operands.
3018
3019 // If LHS is a foldable load, but RHS is not, flip the condition.
3020 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3021 !ISD::isNON_EXTLoad(RHS.getNode())) {
3022 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3023 std::swap(LHS, RHS);
3024 }
3025
3026 switch (SetCCOpcode) {
3027 default: break;
3028 case ISD::SETOLT:
3029 case ISD::SETOLE:
3030 case ISD::SETUGT:
3031 case ISD::SETUGE:
3032 std::swap(LHS, RHS);
3033 break;
3034 }
3035
3036 // On a floating point condition, the flags are set as follows:
3037 // ZF PF CF op
3038 // 0 | 0 | 0 | X > Y
3039 // 0 | 0 | 1 | X < Y
3040 // 1 | 0 | 0 | X == Y
3041 // 1 | 1 | 1 | unordered
3042 switch (SetCCOpcode) {
3043 // clang-format off
3044 default: llvm_unreachable("Condcode should be pre-legalized away");
3045 case ISD::SETUEQ:
3046 case ISD::SETEQ: return X86::COND_E;
3047 case ISD::SETOLT: // flipped
3048 case ISD::SETOGT:
3049 case ISD::SETGT: return X86::COND_A;
3050 case ISD::SETOLE: // flipped
3051 case ISD::SETOGE:
3052 case ISD::SETGE: return X86::COND_AE;
3053 case ISD::SETUGT: // flipped
3054 case ISD::SETULT:
3055 case ISD::SETLT: return X86::COND_B;
3056 case ISD::SETUGE: // flipped
3057 case ISD::SETULE:
3058 case ISD::SETLE: return X86::COND_BE;
3059 case ISD::SETONE:
3060 case ISD::SETNE: return X86::COND_NE;
3061 case ISD::SETUO: return X86::COND_P;
3062 case ISD::SETO: return X86::COND_NP;
3063 case ISD::SETOEQ:
3064 case ISD::SETUNE: return X86::COND_INVALID;
3065 // clang-format on
3066 }
3067}
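// For illustration: a SETOLT compare of (X, Y) first goes through the
// operand-swap switch above, so the comparison becomes (Y, X) and maps to
// COND_A; per the flag table, "above" holds only for an ordered result with
// Y > X, which is exactly the original ordered X < Y.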
3068
3069/// Is there a floating point cmov for the specific X86 condition code?
3070/// Current x86 ISA includes the following FP cmov instructions:
3071/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3072static bool hasFPCMov(unsigned X86CC) {
3073 switch (X86CC) {
3074 default:
3075 return false;
3076 case X86::COND_B:
3077 case X86::COND_BE:
3078 case X86::COND_E:
3079 case X86::COND_P:
3080 case X86::COND_A:
3081 case X86::COND_AE:
3082 case X86::COND_NE:
3083 case X86::COND_NP:
3084 return true;
3085 }
3086}
3087
3088static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3089 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3090 VT.is512BitVector();
3091}
3092
3094 const CallInst &I,
3095 MachineFunction &MF,
3096 unsigned Intrinsic) const {
3098 Info.offset = 0;
3099
3100 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
3101 if (!IntrData) {
3102 switch (Intrinsic) {
3103 case Intrinsic::x86_aesenc128kl:
3104 case Intrinsic::x86_aesdec128kl:
3106 Info.ptrVal = I.getArgOperand(1);
3107 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3108 Info.align = Align(1);
3110 return true;
3111 case Intrinsic::x86_aesenc256kl:
3112 case Intrinsic::x86_aesdec256kl:
3114 Info.ptrVal = I.getArgOperand(1);
3115 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3116 Info.align = Align(1);
3118 return true;
3119 case Intrinsic::x86_aesencwide128kl:
3120 case Intrinsic::x86_aesdecwide128kl:
3122 Info.ptrVal = I.getArgOperand(0);
3123 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3124 Info.align = Align(1);
3126 return true;
3127 case Intrinsic::x86_aesencwide256kl:
3128 case Intrinsic::x86_aesdecwide256kl:
3130 Info.ptrVal = I.getArgOperand(0);
3131 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3132 Info.align = Align(1);
3134 return true;
3135 case Intrinsic::x86_cmpccxadd32:
3136 case Intrinsic::x86_cmpccxadd64:
3137 case Intrinsic::x86_atomic_bts:
3138 case Intrinsic::x86_atomic_btc:
3139 case Intrinsic::x86_atomic_btr: {
3141 Info.ptrVal = I.getArgOperand(0);
3142 unsigned Size = I.getType()->getScalarSizeInBits();
3143 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3144 Info.align = Align(Size);
3147 return true;
3148 }
3149 case Intrinsic::x86_atomic_bts_rm:
3150 case Intrinsic::x86_atomic_btc_rm:
3151 case Intrinsic::x86_atomic_btr_rm: {
3153 Info.ptrVal = I.getArgOperand(0);
3154 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3155 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3156 Info.align = Align(Size);
3159 return true;
3160 }
3161 case Intrinsic::x86_aadd32:
3162 case Intrinsic::x86_aadd64:
3163 case Intrinsic::x86_aand32:
3164 case Intrinsic::x86_aand64:
3165 case Intrinsic::x86_aor32:
3166 case Intrinsic::x86_aor64:
3167 case Intrinsic::x86_axor32:
3168 case Intrinsic::x86_axor64:
3169 case Intrinsic::x86_atomic_add_cc:
3170 case Intrinsic::x86_atomic_sub_cc:
3171 case Intrinsic::x86_atomic_or_cc:
3172 case Intrinsic::x86_atomic_and_cc:
3173 case Intrinsic::x86_atomic_xor_cc: {
3175 Info.ptrVal = I.getArgOperand(0);
3176 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3177 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3178 Info.align = Align(Size);
3181 return true;
3182 }
3183 }
3184 return false;
3185 }
3186
3187 switch (IntrData->Type) {
3190 case TRUNCATE_TO_MEM_VI32: {
3192 Info.ptrVal = I.getArgOperand(0);
3193 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3195 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3196 ScalarVT = MVT::i8;
3197 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3198 ScalarVT = MVT::i16;
3199 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3200 ScalarVT = MVT::i32;
3201
3202 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3203 Info.align = Align(1);
3205 break;
3206 }
3207 case GATHER:
3208 case GATHER_AVX2: {
3210 Info.ptrVal = nullptr;
3211 MVT DataVT = MVT::getVT(I.getType());
3212 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3213 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3214 IndexVT.getVectorNumElements());
3215 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3216 Info.align = Align(1);
3218 break;
3219 }
3220 case SCATTER: {
3222 Info.ptrVal = nullptr;
3223 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3224 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3225 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3226 IndexVT.getVectorNumElements());
3227 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3228 Info.align = Align(1);
3230 break;
3231 }
3232 default:
3233 return false;
3234 }
3235
3236 return true;
3237}
3238
3239/// Returns true if the target can instruction select the
3240/// specified FP immediate natively. If false, the legalizer will
3241/// materialize the FP immediate as a load from a constant pool.
3243 bool ForCodeSize) const {
3244 for (const APFloat &FPImm : LegalFPImmediates)
3245 if (Imm.bitwiseIsEqual(FPImm))
3246 return true;
3247 return false;
3248}
3249
3251 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3252 std::optional<unsigned> ByteOffset) const {
3253 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3254
3255 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3256 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3257 N = *N->user_begin();
3258 return N;
3259 };
3260
3261  // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
3262  // relocation must target a movq or addq instruction: don't let the load shrink.
3263 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3264 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3265 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3266 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3267
3268  // If this is (1) an AVX vector load with (2) multiple uses and either (3) all
3269  // of those uses are extracted directly into a store, so the extract + store
3270  // can be store-folded, or (4) some use will be a legal full width
3271  // instruction, then it's probably not worth splitting the load.
3272 EVT VT = Load->getValueType(0);
3273 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3274 !SDValue(Load, 0).hasOneUse()) {
3275 bool FullWidthUse = false;
3276 bool AllExtractStores = true;
3277 for (SDUse &Use : Load->uses()) {
3278 // Skip uses of the chain value. Result 0 of the node is the load value.
3279 if (Use.getResNo() != 0)
3280 continue;
3281
3282 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3283
3284 // If this use is an extract + store, it's probably not worth splitting.
3285 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3286 all_of(User->uses(), [&](const SDUse &U) {
3287 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3288 return Inner->getOpcode() == ISD::STORE;
3289 }))
3290 continue;
3291
3292 AllExtractStores = false;
3293
3294 // If any use is a full width legal/target bin op, then assume its legal
3295 // and won't split.
3296 if (isBinOp(User->getOpcode()) &&
3297 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3298 User->getOpcode() > ISD::BUILTIN_OP_END))
3299 FullWidthUse = true;
3300 }
3301
3302 if (AllExtractStores)
3303 return false;
3304
3305    // If we have a user that uses the full vector width, then the load is
3306 // only worth splitting if the offset isn't 0 (to avoid an
3307 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3308 if (FullWidthUse)
3309 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3310 }
3311
3312 return true;
3313}
3314
3315/// Returns true if it is beneficial to convert a load of a constant
3316/// to just the constant itself.
3318 Type *Ty) const {
3319 assert(Ty->isIntegerTy());
3320
3321 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3322 if (BitSize == 0 || BitSize > 64)
3323 return false;
3324 return true;
3325}
3326
3328 // If we are using XMM registers in the ABI and the condition of the select is
3329 // a floating-point compare and we have blendv or conditional move, then it is
3330 // cheaper to select instead of doing a cross-register move and creating a
3331 // load that depends on the compare result.
3332 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3333 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3334}
3335
3337 // TODO: It might be a win to ease or lift this restriction, but the generic
3338 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3339 if (VT.isVector() && Subtarget.hasAVX512())
3340 return false;
3341
3342 return true;
3343}
3344
3346 SDValue C) const {
3347 // TODO: We handle scalars using custom code, but generic combining could make
3348 // that unnecessary.
3349 APInt MulC;
3350 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3351 return false;
3352
3353  // Find the type this will be legalized to. Otherwise we might prematurely
3354 // convert this to shl+add/sub and then still have to type legalize those ops.
3355 // Another choice would be to defer the decision for illegal types until
3356 // after type legalization. But constant splat vectors of i64 can't make it
3357 // through type legalization on 32-bit targets so we would need to special
3358 // case vXi64.
3359 while (getTypeAction(Context, VT) != TypeLegal)
3360 VT = getTypeToTransformTo(Context, VT);
3361
3362 // If vector multiply is legal, assume that's faster than shl + add/sub.
3363 // Multiply is a complex op with higher latency and lower throughput in
3364 // most implementations, sub-vXi32 vector multiplies are always fast,
3365 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3366 // is always going to be slow.
3367 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3368 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3369 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3370 return false;
3371
3372 // shl+add, shl+sub, shl+add+neg
3373 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3374 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3375}
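// For illustration, the constants accepted above decompose as follows:
// x * 5 == (x << 2) + x, x * 7 == (x << 3) - x, and x * -3 == x - (x << 2);
// plain powers of two are already handled as a single shift by generic
// combines.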
3376
3378 unsigned Index) const {
3380 return false;
3381
3382 // Mask vectors support all subregister combinations and operations that
3383 // extract half of vector.
3384 if (ResVT.getVectorElementType() == MVT::i1)
3385 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3386 (Index == ResVT.getVectorNumElements()));
3387
3388 return (Index % ResVT.getVectorNumElements()) == 0;
3389}
3390
3392 unsigned Opc = VecOp.getOpcode();
3393
3394 // Assume target opcodes can't be scalarized.
3395 // TODO - do we have any exceptions?
3396 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3397 return false;
3398
3399 // If the vector op is not supported, try to convert to scalar.
3400 EVT VecVT = VecOp.getValueType();
3402 return true;
3403
3404 // If the vector op is supported, but the scalar op is not, the transform may
3405 // not be worthwhile.
3406 EVT ScalarVT = VecVT.getScalarType();
3407 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3408}
3409
3411 bool) const {
3412 // TODO: Allow vectors?
3413 if (VT.isVector())
3414 return false;
3415 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3416}
3417
3419 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3420 // i32/i64 or can rely on BSF passthrough value.
3421 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3422 Subtarget.hasBitScanPassThrough() ||
3423 (!Ty->isVectorTy() &&
3424 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3425}
3426
3428 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3429 // passthrough value.
3430 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3431 Subtarget.hasBitScanPassThrough();
3432}
3433
3435 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3436 // expensive than a straight movsd. On the other hand, it's important to
3437 // shrink long double fp constant since fldt is very slow.
3438 return !Subtarget.hasSSE2() || VT == MVT::f80;
3439}
3440
3442 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3443 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3444}
3445
3447 const SelectionDAG &DAG,
3448 const MachineMemOperand &MMO) const {
3449 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3450 BitcastVT.getVectorElementType() == MVT::i1)
3451 return false;
3452
3453 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3454 return false;
3455
3456 // If both types are legal vectors, it's always ok to convert them.
3457 if (LoadVT.isVector() && BitcastVT.isVector() &&
3458 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3459 return true;
3460
3461 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3462}
3463
3465 const MachineFunction &MF) const {
3466  // Do not merge to float value size (128 bits) if no implicit
3467 // float attribute is set.
3468 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3469
3470 if (NoFloat) {
3471 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3472 return (MemVT.getSizeInBits() <= MaxIntSize);
3473 }
3474 // Make sure we don't merge greater than our preferred vector
3475 // width.
3476 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3477 return false;
3478
3479 return true;
3480}
3481
3483 return Subtarget.hasFastLZCNT();
3484}
3485
3487 const Instruction &AndI) const {
3488 return true;
3489}
3490
3492 EVT VT = Y.getValueType();
3493
3494 if (VT.isVector())
3495 return false;
3496
3497 if (!Subtarget.hasBMI())
3498 return false;
3499
3500 // There are only 32-bit and 64-bit forms for 'andn'.
3501 if (VT != MVT::i32 && VT != MVT::i64)
3502 return false;
3503
3504 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3505}
3506
3508 EVT VT = Y.getValueType();
3509
3510 if (!VT.isVector())
3511 return hasAndNotCompare(Y);
3512
3513 // Vector.
3514
3515 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3516 return false;
3517
3518 if (VT == MVT::v4i32)
3519 return true;
3520
3521 return Subtarget.hasSSE2();
3522}
3523
3525 return X.getValueType().isScalarInteger(); // 'bt'
3526}
3527
3531 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3532 SelectionDAG &DAG) const {
3533 // Does baseline recommend not to perform the fold by default?
3535 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3536 return false;
3537 // For scalars this transform is always beneficial.
3538 if (X.getValueType().isScalarInteger())
3539 return true;
3540 // If all the shift amounts are identical, then transform is beneficial even
3541 // with rudimentary SSE2 shifts.
3542 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3543 return true;
3544  // If we have AVX2 with its powerful shift operations, then it's also good.
3545 if (Subtarget.hasAVX2())
3546 return true;
3547 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3548 return NewShiftOpcode == ISD::SHL;
3549}
3550
3552 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3553 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3554 if (!VT.isInteger())
3555 return ShiftOpc;
3556
3557 bool PreferRotate = false;
3558 if (VT.isVector()) {
3559    // For vectors, if we have rotate instruction support, then it's definitely
3560    // best. Otherwise it's not clear what's best, so just don't make changes.
3561 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3562 VT.getScalarType() == MVT::i64);
3563 } else {
3564    // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3565    // rotate unless we have a zext mask+shr.
3566 PreferRotate = Subtarget.hasBMI2();
3567 if (!PreferRotate) {
3568 unsigned MaskBits =
3569 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3570 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3571 }
3572 }
3573
3574 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3575 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3576
3577 if (PreferRotate && MayTransformRotate)
3578 return ISD::ROTL;
3579
3580    // For vectors we don't really get much benefit from swapping around
3581    // constants. Maybe in the future we could check whether the DAG already has
3582    // the flipped node.
3583 if (VT.isVector())
3584 return ShiftOpc;
3585
3586    // See if it's beneficial to swap the shift type.
3587 if (ShiftOpc == ISD::SHL) {
3588      // If the current setup has an imm64 mask, then the inverse will have
3589      // at least an imm32 mask (or be a zext i32 -> i64).
3590 if (VT == MVT::i64)
3591 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3592 : ShiftOpc;
3593
3594      // We only benefit if the mask requires at least 7 bits. We don't want to
3595      // replace a shl by 1, 2, or 3, as those can be implemented with
3596      // lea/add.
3597 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3598 }
3599
3600 if (VT == MVT::i64)
3601 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3602 // extremely efficient.
3603 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3604
3605 // Keep small shifts as shl so we can generate add/lea.
3606 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3607 }
3608
3609  // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3610 // (PreferRotate will be set in the latter case).
3611 if (PreferRotate || !MayTransformRotate || VT.isVector())
3612 return ShiftOpc;
3613
3614 // Non-vector type and we have a zext mask with SRL.
3615 return ISD::SRL;
3616}
3617
3620 const Value *Lhs,
3621 const Value *Rhs) const {
3622 using namespace llvm::PatternMatch;
3623 int BaseCost = BrMergingBaseCostThresh.getValue();
3624 // With CCMP, branches can be merged in a more efficient way.
3625 if (BaseCost >= 0 && Subtarget.hasCCMP())
3626 BaseCost += BrMergingCcmpBias;
3627 // a == b && a == c is a fast pattern on x86.
3628 if (BaseCost >= 0 && Opc == Instruction::And &&
3631 BaseCost += 1;
3632 return {BaseCost, BrMergingLikelyBias.getValue(),
3633 BrMergingUnlikelyBias.getValue()};
3634}
3635
3637 return N->getOpcode() != ISD::FP_EXTEND;
3638}
3639
3641 const SDNode *N, CombineLevel Level) const {
3642 assert(((N->getOpcode() == ISD::SHL &&
3643 N->getOperand(0).getOpcode() == ISD::SRL) ||
3644 (N->getOpcode() == ISD::SRL &&
3645 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3646 "Expected shift-shift mask");
3647 // TODO: Should we always create i64 masks? Or only folded immediates?
3648 EVT VT = N->getValueType(0);
3649 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3650 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3651 // Only fold if the shift values are equal - so it folds to AND.
3652 // TODO - we should fold if either is a non-uniform vector but we don't do
3653 // the fold for non-splats yet.
3654 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3655 }
3657}
3658
3660 EVT VT = Y.getValueType();
3661
3662 // For vectors, we don't have a preference, but we probably want a mask.
3663 if (VT.isVector())
3664 return false;
3665
3666 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3667 if (VT == MVT::i64 && !Subtarget.is64Bit())
3668 return false;
3669
3670 return true;
3671}
3672
3675 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3677 !Subtarget.isOSWindows())
3680 ExpansionFactor);
3681}
3682
3684 // Any legal vector type can be splatted more efficiently than
3685 // loading/spilling from memory.
3686 return isTypeLegal(VT);
3687}
3688
3690 MVT VT = MVT::getIntegerVT(NumBits);
3691 if (isTypeLegal(VT))
3692 return VT;
3693
3694 // PMOVMSKB can handle this.
3695 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3696 return MVT::v16i8;
3697
3698 // VPMOVMSKB can handle this.
3699 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3700 return MVT::v32i8;
3701
3702 // TODO: Allow 64-bit type for 32-bit target.
3703 // TODO: 512-bit types should be allowed, but make sure that those
3704 // cases are handled in combineVectorSizedSetCCEquality().
3705
3707}
3708
3709/// Val is the undef sentinel value or equal to the specified value.
3710static bool isUndefOrEqual(int Val, int CmpVal) {
3711 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3712}
3713
3714/// Return true if every element in Mask is the undef sentinel value or equal to
3715/// the specified value.
3716static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3717 return llvm::all_of(Mask, [CmpVal](int M) {
3718 return (M == SM_SentinelUndef) || (M == CmpVal);
3719 });
3720}
3721
3722/// Return true if every element in Mask, beginning from position Pos and ending
3723/// in Pos+Size is the undef sentinel value or equal to the specified value.
3724static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3725 unsigned Size) {
3726 return llvm::all_of(Mask.slice(Pos, Size),
3727 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3728}
3729
3730/// Val is either the undef or zero sentinel value.
3731static bool isUndefOrZero(int Val) {
3732 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3733}
3734
3735/// Return true if every element in Mask, beginning from position Pos and ending
3736/// in Pos+Size is the undef sentinel value.
3737static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3738 return llvm::all_of(Mask.slice(Pos, Size),
3739 [](int M) { return M == SM_SentinelUndef; });
3740}
3741
3742/// Return true if the mask creates a vector whose lower half is undefined.
3744 unsigned NumElts = Mask.size();
3745 return isUndefInRange(Mask, 0, NumElts / 2);
3746}
3747
3748/// Return true if the mask creates a vector whose upper half is undefined.
3750 unsigned NumElts = Mask.size();
3751 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3752}
3753
3754/// Return true if Val falls within the specified range [Low, Hi).
3755static bool isInRange(int Val, int Low, int Hi) {
3756 return (Val >= Low && Val < Hi);
3757}
3758
3759/// Return true if the value of any element in Mask falls within the specified
3760/// range [Low, Hi).
3761static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3762 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3763}
3764
3765/// Return true if the value of any element in Mask is the zero sentinel value.
3766static bool isAnyZero(ArrayRef<int> Mask) {
3767 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3768}
3769
3770/// Return true if Val is undef or if its value falls within the
3771/// specified range [Low, Hi).
3772static bool isUndefOrInRange(int Val, int Low, int Hi) {
3773 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3774}
3775
3776/// Return true if every element in Mask is undef or if its value
3777/// falls within the specified range [Low, Hi).
3778static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3779 return llvm::all_of(
3780 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3781}
3782
3783/// Return true if Val is undef, zero or if its value falls within the
3784/// specified range [Low, Hi).
3785static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3786 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3787}
3788
3789/// Return true if every element in Mask is undef, zero or if its value
3790/// falls within the specified range [Low, Hi).
3791static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3792 return llvm::all_of(
3793 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3794}
3795
3796/// Return true if every element in Mask is an in-place blend/select mask or is
3797/// undef.
3799 unsigned NumElts = Mask.size();
3800 for (auto [I, M] : enumerate(Mask))
3801 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3802 return false;
3803 return true;
3804}
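// For example, with NumElts == 4 the mask {0, 5, -1, 7} is an in-place blend
// (lane 0 from the first operand, lanes 1 and 3 from the second, lane 2
// undef), whereas {1, 5, 2, 7} is not, because lane 0 selects element 1.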
3805
3806/// Return true if every element in Mask, beginning
3807/// from position Pos and ending in Pos + Size, falls within the specified
3808/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3809static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3810 unsigned Size, int Low, int Step = 1) {
3811 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3812 if (!isUndefOrEqual(Mask[i], Low))
3813 return false;
3814 return true;
3815}
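// For example, isSequentialOrUndefInRange({4, -1, 6, 7}, 0, 4, 4) is true:
// element 1 is undef and the rest follow the sequence 4, 5, 6, 7. Replacing
// the 6 with a 5 would break the sequence and yield false.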
3816
3817/// Return true if every element in Mask, beginning
3818/// from position Pos and ending in Pos+Size, falls within the specified
3819/// sequential range [Low, Low+Size), or is undef or is zero.
3821 unsigned Size, int Low,
3822 int Step = 1) {
3823 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3824 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3825 return false;
3826 return true;
3827}
3828
3829/// Return true if every element in Mask, beginning
3830/// from position Pos and ending in Pos+Size is undef or is zero.
3831static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3832 unsigned Size) {
3833 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3834}
3835
3836/// Return true if every element of a single input is referenced by the shuffle
3837/// mask. i.e. it just permutes them all.
3839 unsigned NumElts = Mask.size();
3840 APInt DemandedElts = APInt::getZero(NumElts);
3841 for (int M : Mask)
3842 if (isInRange(M, 0, NumElts))
3843 DemandedElts.setBit(M);
3844 return DemandedElts.isAllOnes();
3845}
3846
3847/// Helper function to test whether a shuffle mask could be
3848/// simplified by widening the elements being shuffled.
3849///
3850/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3851/// leaves it in an unspecified state.
3852///
3853/// NOTE: This must handle normal vector shuffle masks and *target* vector
3854/// shuffle masks. The latter have the special property of a '-2' representing
3855/// a zero-ed lane of a vector.
3857 SmallVectorImpl<int> &WidenedMask) {
3858 WidenedMask.assign(Mask.size() / 2, 0);
3859 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3860 int M0 = Mask[i];
3861 int M1 = Mask[i + 1];
3862
3863    // If both elements are undef, it's trivial.
3864 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3865 WidenedMask[i / 2] = SM_SentinelUndef;
3866 continue;
3867 }
3868
3869 // Check for an undef mask and a mask value properly aligned to fit with
3870 // a pair of values. If we find such a case, use the non-undef mask's value.
3871 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3872 WidenedMask[i / 2] = M1 / 2;
3873 continue;
3874 }
3875 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3876 WidenedMask[i / 2] = M0 / 2;
3877 continue;
3878 }
3879
3880 // When zeroing, we need to spread the zeroing across both lanes to widen.
3881 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3882 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3884 WidenedMask[i / 2] = SM_SentinelZero;
3885 continue;
3886 }
3887 return false;
3888 }
3889
3890 // Finally check if the two mask values are adjacent and aligned with
3891 // a pair.
3892 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3893 WidenedMask[i / 2] = M0 / 2;
3894 continue;
3895 }
3896
3897 // Otherwise we can't safely widen the elements used in this shuffle.
3898 return false;
3899 }
3900 assert(WidenedMask.size() == Mask.size() / 2 &&
3901 "Incorrect size of mask after widening the elements!");
3902
3903 return true;
3904}
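// For example, {0, 1, 6, 7} widens to {0, 3} and {-1, 3, 4, 5} widens to
// {1, 2}, but {1, 2, 6, 7} fails because its first pair does not start on an
// even element boundary.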
3905
3907 const APInt &Zeroable,
3908 bool V2IsZero,
3909 SmallVectorImpl<int> &WidenedMask) {
3910 // Create an alternative mask with info about zeroable elements.
3911 // Here we do not set undef elements as zeroable.
3912 SmallVector<int, 64> ZeroableMask(Mask);
3913 if (V2IsZero) {
3914 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3915 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3916 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3917 ZeroableMask[i] = SM_SentinelZero;
3918 }
3919 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3920}
3921
3923 SmallVector<int, 32> WidenedMask;
3924 return canWidenShuffleElements(Mask, WidenedMask);
3925}
3926
3927// Attempt to narrow/widen shuffle mask until it matches the target number of
3928// elements.
3929static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3930 SmallVectorImpl<int> &ScaledMask) {
3931 unsigned NumSrcElts = Mask.size();
3932 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3933 "Illegal shuffle scale factor");
3934
3935 // Narrowing is guaranteed to work.
3936 if (NumDstElts >= NumSrcElts) {
3937 int Scale = NumDstElts / NumSrcElts;
3938 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3939 return true;
3940 }
3941
3942 // We have to repeat the widening until we reach the target size, but we can
3943 // split out the first widening as it sets up ScaledMask for us.
3944 if (canWidenShuffleElements(Mask, ScaledMask)) {
3945 while (ScaledMask.size() > NumDstElts) {
3946 SmallVector<int, 16> WidenedMask;
3947 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3948 return false;
3949 ScaledMask = std::move(WidenedMask);
3950 }
3951 return true;
3952 }
3953
3954 return false;
3955}
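// For example, with NumDstElts == 4 the 2-element mask {0, 3} becomes
// {0, 1, 6, 7} (narrower elements), and with NumDstElts == 2 the mask
// {0, 1, 6, 7} becomes {0, 3} (wider elements); {1, 2, 6, 7} cannot be scaled
// down to 2 elements because its pairs are misaligned.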
3956
3957static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3958 SmallVector<int, 32> ScaledMask;
3959 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3960}
3961
3962// Helper to grow the shuffle mask for a larger value type.
3963// NOTE: This is different from scaleShuffleElements, which keeps the same
// overall value type size.
3964static void growShuffleMask(ArrayRef<int> SrcMask,
3965 SmallVectorImpl<int> &DstMask,
3966 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3967  assert(DstMask.empty() && "Expected an empty shuffle mask");
3968 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3969 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3970 unsigned NumSrcElts = SrcMask.size();
3971 DstMask.assign(SrcMask.begin(), SrcMask.end());
3972 for (int &M : DstMask) {
3973 if (M < 0)
3974 continue;
3975 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3976 }
3977 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3978}
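// For example, growing the 4-element 128-bit mask {0, 3, 5, 2} to 256 bits
// (Scale == 2) rebases the second-operand index 5 to 9 and appends undefs,
// producing {0, 3, 9, 2, -1, -1, -1, -1}.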
3979
3980/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3982 return isNullConstant(Elt) || isNullFPConstant(Elt);
3983}
3984
3985// Build a vector of constants.
3986// Use an UNDEF node if MaskElt == -1.
3987// Split 64-bit constants in the 32-bit mode.
3989 const SDLoc &dl, bool IsMask = false) {
3990
3992 bool Split = false;
3993
3994 MVT ConstVecVT = VT;
3995 unsigned NumElts = VT.getVectorNumElements();
3996 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3997 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3998 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3999 Split = true;
4000 }
4001
4002 MVT EltVT = ConstVecVT.getVectorElementType();
4003 for (unsigned i = 0; i < NumElts; ++i) {
4004 bool IsUndef = Values[i] < 0 && IsMask;
4005 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(Values[i], dl, EltVT);
4007 Ops.push_back(OpNode);
4008 if (Split)
4009 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4010 DAG.getConstant(0, dl, EltVT));
4011 }
4012 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4013 if (Split)
4014 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4015 return ConstsNode;
4016}
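// For illustration, in 32-bit mode (where i64 is not legal) a v2i64 request
// with Values == {1, -1} and IsMask == true is built as the v4i32 vector
// {1, 0, undef, undef} and bitcast back to v2i64, so element 0 holds the
// 64-bit value 1 and element 1 is undef (little-endian lane order).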
4017
4018static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4019 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4020 assert(Bits.size() == Undefs.getBitWidth() &&
4021 "Unequal constant and undef arrays");
4023 bool Split = false;
4024
4025 MVT ConstVecVT = VT;
4026 unsigned NumElts = VT.getVectorNumElements();
4027 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4028 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4029 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4030 Split = true;
4031 }
4032
4033 MVT EltVT = ConstVecVT.getVectorElementType();
4034 MVT EltIntVT = EltVT.changeTypeToInteger();
4035 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4036 if (Undefs[i]) {
4037 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4038 continue;
4039 }
4040 const APInt &V = Bits[i];
4041 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4042 if (Split) {
4043 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4044 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4045 } else {
4046 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4047 }
4048 }
4049
4050 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4051 return DAG.getBitcast(VT, ConstsNode);
4052}
4053
4055 SelectionDAG &DAG, const SDLoc &dl) {
4056 APInt Undefs = APInt::getZero(Bits.size());
4057 return getConstVector(Bits, Undefs, VT, DAG, dl);
4058}
4059
4060/// Returns a vector of specified type with all zero elements.
4061static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4062 SelectionDAG &DAG, const SDLoc &dl) {
4063 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4064 VT.getVectorElementType() == MVT::i1) &&
4065 "Unexpected vector type");
4066
4067 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4068 // type. This ensures they get CSE'd. But if the integer type is not
4069 // available, use a floating-point +0.0 instead.
4070 SDValue Vec;
4071 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4072 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4073 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4074 } else if (VT.isFloatingPoint() &&
4076 Vec = DAG.getConstantFP(+0.0, dl, VT);
4077 } else if (VT.getVectorElementType() == MVT::i1) {
4078 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4079 "Unexpected vector type");
4080 Vec = DAG.getConstant(0, dl, VT);
4081 } else {
4082 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4083 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4084 }
4085 return DAG.getBitcast(VT, Vec);
4086}
4087
4088// Helper to determine if the ops are all extracted subvectors that come from a
4089// single source. If we allow commute they don't have to be in order (Lo/Hi).
4090static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4091 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4092 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4093 LHS.getValueType() != RHS.getValueType() ||
4094 LHS.getOperand(0) != RHS.getOperand(0))
4095 return SDValue();
4096
4097 SDValue Src = LHS.getOperand(0);
4098 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4099 return SDValue();
4100
4101 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4102 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4103 RHS.getConstantOperandAPInt(1) == NumElts) ||
4104 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4105 LHS.getConstantOperandAPInt(1) == NumElts))
4106 return Src;
4107
4108 return SDValue();
4109}
4110
4111static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4112 const SDLoc &dl, unsigned vectorWidth) {
4113 EVT VT = Vec.getValueType();
4114 EVT ElVT = VT.getVectorElementType();
4115 unsigned ResultNumElts =
4116 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4117 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4118
4119 assert(ResultVT.getSizeInBits() == vectorWidth &&
4120 "Illegal subvector extraction");
4121
4122 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4123 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4124 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4125
4126 // This is the index of the first element of the vectorWidth-bit chunk
4127 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4128 IdxVal &= ~(ElemsPerChunk - 1);
4129
4130 // If the input is a buildvector just emit a smaller one.
4131 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4132 return DAG.getBuildVector(ResultVT, dl,
4133 Vec->ops().slice(IdxVal, ElemsPerChunk));
4134
4135 // Check if we're extracting the upper undef of a widening pattern.
4136 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4137 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4138 isNullConstant(Vec.getOperand(2)))
4139 return DAG.getUNDEF(ResultVT);
4140
4141 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4142}
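// For example, extracting 128 bits from a v8f32 at IdxVal == 6 rounds the
// index down to the chunk boundary (ElemsPerChunk == 4, so IdxVal becomes 4)
// and returns the v4f32 holding elements 4-7.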
4143
4144/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4145/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4146/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4147/// instructions or a simple subregister reference. Idx is an index in the
4148/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4149/// lowering EXTRACT_VECTOR_ELT operations easier.
4150static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4151 SelectionDAG &DAG, const SDLoc &dl) {
4153 Vec.getValueType().is512BitVector()) &&
4154 "Unexpected vector size!");
4155 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4156}
4157
4158/// Generate a DAG to grab 256-bits from a 512-bit vector.
4159static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4160 SelectionDAG &DAG, const SDLoc &dl) {
4161 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4162 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4163}
4164
4165static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4166 SelectionDAG &DAG, const SDLoc &dl,
4167 unsigned vectorWidth) {
4168 assert((vectorWidth == 128 || vectorWidth == 256) &&
4169 "Unsupported vector width");
4170  // Inserting UNDEF just returns Result.
4171 if (Vec.isUndef())
4172 return Result;
4173
4174 // Insert the relevant vectorWidth bits.
4175 EVT VT = Vec.getValueType();
4176 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4177 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4178
4179 // This is the index of the first element of the vectorWidth-bit chunk
4180 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4181 IdxVal &= ~(ElemsPerChunk - 1);
4182 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4183}
4184
4185/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4186/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4187/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4188/// simple superregister reference. Idx is an index in the 128 bits
4189/// we want. It need not be aligned to a 128-bit boundary. That makes
4190/// lowering INSERT_VECTOR_ELT operations easier.
4191static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4192 SelectionDAG &DAG, const SDLoc &dl) {
4193 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4194 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4195}
4196
4197/// Widen a vector to a larger size with the same scalar type, with the new
4198/// elements either zero or undef.
4199static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4200 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4201 const SDLoc &dl) {
4202 EVT VecVT = Vec.getValueType();
4204 VecVT.getScalarType() == VT.getScalarType() &&
4205 "Unsupported vector widening type");
4206 // If the upper 128-bits of a build vector are already undef/zero, then try to
4207 // widen from the lower 128-bits.
4208 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4209 unsigned NumSrcElts = VecVT.getVectorNumElements();
4210 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4211 if (all_of(Hi, [&](SDValue V) {
4212 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4213 }))
4214 Vec = extract128BitVector(Vec, 0, DAG, dl);
4215 }
4216 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4217 : DAG.getUNDEF(VT);
4218 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4219}
4220
4221/// Widen a vector to a larger size with the same scalar type, with the new
4222/// elements either zero or undef.
4223static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4224 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4225 const SDLoc &dl, unsigned WideSizeInBits) {
4226 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4227 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4228 "Unsupported vector widening type");
4229 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4230 MVT SVT = Vec.getSimpleValueType().getScalarType();
4231 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4232 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4233}
4234
4235/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4236/// and bitcast with integer types.
4237static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4238 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4239 unsigned NumElts = VT.getVectorNumElements();
4240 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4241 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4242 return VT;
4243}
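// For example, v4i1 widens to v8i1 when the subtarget has DQI and to v16i1
// otherwise; v8i1 widens to v16i1 only when DQI is unavailable; v16i1 and
// wider mask types are returned unchanged.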
4244
4245/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4246/// bitcast with integer types.
4247static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4248 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4249 const SDLoc &dl) {
4250 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4251 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4252}
4253
4254// Helper function to collect subvector ops that are concatenated together,
4255// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4256// The subvectors in Ops are guaranteed to be the same type.
4258 SelectionDAG &DAG) {
4259 assert(Ops.empty() && "Expected an empty ops vector");
4260
4261 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4262 Ops.append(N->op_begin(), N->op_end());
4263 return true;
4264 }
4265
4266 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4267 SDValue Src = N->getOperand(0);
4268 SDValue Sub = N->getOperand(1);
4269 const APInt &Idx = N->getConstantOperandAPInt(2);
4270 EVT VT = Src.getValueType();
4271 EVT SubVT = Sub.getValueType();
4272
4273 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4274 // insert_subvector(undef, x, lo)
4275 if (Idx == 0 && Src.isUndef()) {
4276 Ops.push_back(Sub);
4277 Ops.push_back(DAG.getUNDEF(SubVT));
4278 return true;
4279 }
4280 if (Idx == (VT.getVectorNumElements() / 2)) {
4281 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4282 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4283 Src.getOperand(1).getValueType() == SubVT &&
4284 isNullConstant(Src.getOperand(2))) {
4285 // Attempt to recurse into inner (matching) concats.
4286 SDValue Lo = Src.getOperand(1);
4287 SDValue Hi = Sub;
4288 SmallVector<SDValue, 2> LoOps, HiOps;
4289 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4290 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4291 LoOps.size() == HiOps.size()) {
4292 Ops.append(LoOps);
4293 Ops.append(HiOps);
4294 return true;
4295 }
4296 Ops.push_back(Lo);
4297 Ops.push_back(Hi);
4298 return true;
4299 }
4300 // insert_subvector(x, extract_subvector(x, lo), hi)
4301 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4302 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4303 Ops.append(2, Sub);
4304 return true;
4305 }
4306 // insert_subvector(undef, x, hi)
4307 if (Src.isUndef()) {
4308 Ops.push_back(DAG.getUNDEF(SubVT));
4309 Ops.push_back(Sub);
4310 return true;
4311 }
4312 }
4313 }
4314 }
4315
4316 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4317 EVT VT = N->getValueType(0);
4318 SDValue Src = N->getOperand(0);
4319 uint64_t Idx = N->getConstantOperandVal(1);
4320
4321 // Collect all the subvectors from the source vector and slice off the
4322 // extraction.
4323 SmallVector<SDValue> SrcOps;
4324 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4325 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4326 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4327 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4328 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4329 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4330 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4331 return true;
4332 }
4333 }
4334
4335 assert(Ops.empty() && "Expected an empty ops vector");
4336 return false;
4337}
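// Illustrative example: for a v8i32 node built as
// insert_subvector(insert_subvector(undef, X, 0), Y, 4) with v4i32 X and Y,
// collectConcatOps returns true with Ops = { X, Y }; for a lone
// insert_subvector(undef, X, 0) it returns Ops = { X, undef }.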
4338
4339 // Helper to check if \p V can be split into subvectors and the upper
4340 // subvectors are all undef, in which case return the lower subvector.
4341 static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4342 SelectionDAG &DAG) {
4343 SmallVector<SDValue> SubOps;
4344 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4345 return SDValue();
4346
4347 unsigned NumSubOps = SubOps.size();
4348 unsigned HalfNumSubOps = NumSubOps / 2;
4349 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4350
4351 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4352 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4353 return SDValue();
4354
4355 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4356 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4357 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4358}
4359
4360// Helper to check if we can access all the constituent subvectors without any
4361// extract ops.
4362 static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4363 SmallVector<SDValue> Ops;
4364 return collectConcatOps(V.getNode(), Ops, DAG);
4365}
4366
4367static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4368 const SDLoc &dl) {
4369 EVT VT = Op.getValueType();
4370 unsigned NumElems = VT.getVectorNumElements();
4371 unsigned SizeInBits = VT.getSizeInBits();
4372 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4373 "Can't split odd sized vector");
4374
4375 SmallVector<SDValue> SubOps;
4376 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4377 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4378 unsigned HalfOps = SubOps.size() / 2;
4379 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4380 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4381 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4382 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4383 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4384 return std::make_pair(Lo, Hi);
4385 }
4386
4387 // If this is a splat value (with no-undefs) then use the lower subvector,
4388 // which should be a free extraction.
4389 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4390 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4391 return std::make_pair(Lo, Lo);
4392
4393 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4394 return std::make_pair(Lo, Hi);
4395}
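// Worked example (illustrative): splitting a v16i32 value produces two v8i32
// halves. If the value is already a concat of four v4i32 subvectors
// {A, B, C, D}, the halves are rebuilt as concat(A, B) and concat(C, D); if it
// is a no-undef splat, the cheap lower-half extraction is returned twice.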
4396
4397/// Break an operation into 2 half sized ops and then concatenate the results.
4398 static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4399 unsigned NumOps = Op.getNumOperands();
4400 EVT VT = Op.getValueType();
4401
4402 // Extract the LHS Lo/Hi vectors
4403 SmallVector<SDValue> LoOps(NumOps, SDValue());
4404 SmallVector<SDValue> HiOps(NumOps, SDValue());
4405 for (unsigned I = 0; I != NumOps; ++I) {
4406 SDValue SrcOp = Op.getOperand(I);
4407 if (!SrcOp.getValueType().isVector()) {
4408 LoOps[I] = HiOps[I] = SrcOp;
4409 continue;
4410 }
4411 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4412 }
4413
4414 EVT LoVT, HiVT;
4415 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4416 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4417 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4418 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4419}
4420
4421 /// Break a unary integer operation into 2 half sized ops and then
4422/// concatenate the result back.
4423 static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4424 const SDLoc &dl) {
4425 // Make sure we only try to split 256/512-bit types to avoid creating
4426 // narrow vectors.
4427 [[maybe_unused]] EVT VT = Op.getValueType();
4428 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4429 Op.getOperand(0).getValueType().is512BitVector()) &&
4430 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4431 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4432 VT.getVectorNumElements() &&
4433 "Unexpected VTs!");
4434 return splitVectorOp(Op, DAG, dl);
4435}
4436
4437/// Break a binary integer operation into 2 half sized ops and then
4438/// concatenate the result back.
4439 static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4440 const SDLoc &dl) {
4441 // Assert that all the types match.
4442 [[maybe_unused]] EVT VT = Op.getValueType();
4443 assert(Op.getOperand(0).getValueType() == VT &&
4444 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4445 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4446 return splitVectorOp(Op, DAG, dl);
4447}
4448
4449// Helper for splitting operands of an operation to legal target size and
4450 // applying a function on each part.
4451// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4452// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4453// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4454// The argument Builder is a function that will be applied on each split part:
4455 // SDValue Builder(SelectionDAG &G, SDLoc, ArrayRef<SDValue>)
4456template <typename F>
4458 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4459 F Builder, bool CheckBWI = true) {
4460 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4461 unsigned NumSubs = 1;
4462 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4463 (!CheckBWI && Subtarget.useAVX512Regs())) {
4464 if (VT.getSizeInBits() > 512) {
4465 NumSubs = VT.getSizeInBits() / 512;
4466 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4467 }
4468 } else if (Subtarget.hasAVX2()) {
4469 if (VT.getSizeInBits() > 256) {
4470 NumSubs = VT.getSizeInBits() / 256;
4471 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4472 }
4473 } else {
4474 if (VT.getSizeInBits() > 128) {
4475 NumSubs = VT.getSizeInBits() / 128;
4476 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4477 }
4478 }
4479
4480 if (NumSubs == 1)
4481 return Builder(DAG, DL, Ops);
4482
4483 SmallVector<SDValue, 4> Subs;
4484 for (unsigned i = 0; i != NumSubs; ++i) {
4485 SmallVector<SDValue, 2> SubOps;
4486 for (SDValue Op : Ops) {
4487 EVT OpVT = Op.getValueType();
4488 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4489 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4490 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4491 }
4492 Subs.push_back(Builder(DAG, DL, SubOps));
4493 }
4494 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4495}
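// Illustrative example: for VT = MVT::v64i8 on an AVX2 target without BWI
// registers, NumSubs = 512 / 256 = 2, so every operand is split into two
// 256-bit halves, Builder runs once per half, and the two results are
// concatenated back into the original 512-bit type.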
4496
4497// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4498// targets.
4499static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4500 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4501 const X86Subtarget &Subtarget) {
4502 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4503 MVT SVT = VT.getScalarType();
4504
4505 // If we have a 32/64 splatted constant, splat it to DstTy to
4506 // encourage a foldable broadcast'd operand.
4507 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4508 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4509 // AVX512 broadcasts 32/64-bit operands.
4510 // TODO: Support float once getAVX512Node is used by fp-ops.
4511 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4512 !DAG.getTargetLoweringInfo().isTypeLegal(OpVT.getScalarType()))
4513 return SDValue();
4514 // If we're not widening, don't bother if we're not bitcasting.
4515 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4516 return SDValue();
4517 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4518 APInt SplatValue, SplatUndef;
4519 unsigned SplatBitSize;
4520 bool HasAnyUndefs;
4521 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4522 HasAnyUndefs, OpEltSizeInBits) &&
4523 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4524 return DAG.getConstant(SplatValue, DL, DstVT);
4525 }
4526 return SDValue();
4527 };
4528
4529 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4530
4531 MVT DstVT = VT;
4532 if (Widen)
4533 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4534
4535 // Canonicalize src operands.
4536 SmallVector<SDValue> SrcOps(Ops);
4537 for (SDValue &Op : SrcOps) {
4538 MVT OpVT = Op.getSimpleValueType();
4539 // Just pass through scalar operands.
4540 if (!OpVT.isVector())
4541 continue;
4542 assert(OpVT == VT && "Vector type mismatch");
4543
4544 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4545 Op = BroadcastOp;
4546 continue;
4547 }
4548
4549 // Just widen the subvector by inserting into an undef wide vector.
4550 if (Widen)
4551 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4552 }
4553
4554 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4555
4556 // Perform the 512-bit op then extract the bottom subvector.
4557 if (Widen)
4558 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4559 return Res;
4560}
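// Illustrative example: emitting a v8i32 node on an AVX512F target without VLX
// sets Widen = true, so vector operands are widened (or splatted constants are
// rebuilt as broadcastable v16i32 constants), the node is created at 512 bits,
// and the low 256 bits are extracted to recover the requested type.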
4561
4562/// Insert i1-subvector to i1-vector.
4563 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4564 const X86Subtarget &Subtarget) {
4565
4566 SDLoc dl(Op);
4567 SDValue Vec = Op.getOperand(0);
4568 SDValue SubVec = Op.getOperand(1);
4569 SDValue Idx = Op.getOperand(2);
4570 unsigned IdxVal = Op.getConstantOperandVal(2);
4571
4572 // Inserting undef is a nop. We can just return the original vector.
4573 if (SubVec.isUndef())
4574 return Vec;
4575
4576 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4577 return Op;
4578
4579 MVT OpVT = Op.getSimpleValueType();
4580 unsigned NumElems = OpVT.getVectorNumElements();
4581 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4582
4583 // Extend to natively supported kshift.
4584 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4585
4586 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4587 // if necessary.
4588 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4589 // May need to promote to a legal type.
4590 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4591 DAG.getConstant(0, dl, WideOpVT),
4592 SubVec, Idx);
4593 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4594 }
4595
4596 MVT SubVecVT = SubVec.getSimpleValueType();
4597 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4598 assert(IdxVal + SubVecNumElems <= NumElems &&
4599 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4600 "Unexpected index value in INSERT_SUBVECTOR");
4601
4602 SDValue Undef = DAG.getUNDEF(WideOpVT);
4603
4604 if (IdxVal == 0) {
4605 // Zero lower bits of the Vec
4606 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4607 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4608 ZeroIdx);
4609 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4610 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4611 // Merge them together, SubVec should be zero extended.
4612 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4613 DAG.getConstant(0, dl, WideOpVT),
4614 SubVec, ZeroIdx);
4615 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4616 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4617 }
4618
4619 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4620 Undef, SubVec, ZeroIdx);
4621
4622 if (Vec.isUndef()) {
4623 assert(IdxVal != 0 && "Unexpected index");
4624 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4625 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4626 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4627 }
4628
4629 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4630 assert(IdxVal != 0 && "Unexpected index");
4631 // If upper elements of Vec are known undef, then just shift into place.
4632 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4633 [](SDValue V) { return V.isUndef(); })) {
4634 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4635 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4636 } else {
4637 NumElems = WideOpVT.getVectorNumElements();
4638 unsigned ShiftLeft = NumElems - SubVecNumElems;
4639 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4640 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4641 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4642 if (ShiftRight != 0)
4643 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4644 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4645 }
4646 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4647 }
4648
4649 // Simple case when we put the subvector in the upper part
4650 if (IdxVal + SubVecNumElems == NumElems) {
4651 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4652 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4653 if (SubVecNumElems * 2 == NumElems) {
4654 // Special case, use legal zero extending insert_subvector. This allows
4655 // isel to optimize when bits are known zero.
4656 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4657 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4658 DAG.getConstant(0, dl, WideOpVT),
4659 Vec, ZeroIdx);
4660 } else {
4661 // Otherwise use explicit shifts to zero the bits.
4662 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4663 Undef, Vec, ZeroIdx);
4664 NumElems = WideOpVT.getVectorNumElements();
4665 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4666 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4667 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4668 }
4669 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4670 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4671 }
4672
4673 // Inserting into the middle is more complicated.
4674
4675 NumElems = WideOpVT.getVectorNumElements();
4676
4677 // Widen the vector if needed.
4678 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4679
4680 unsigned ShiftLeft = NumElems - SubVecNumElems;
4681 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4682
4683 // Do an optimization for the most frequently used types.
4684 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4685 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4686 Mask0.flipAllBits();
4687 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4688 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4689 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4690 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4691 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4692 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4693 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4694 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4695
4696 // Reduce to original width if needed.
4697 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4698 }
4699
4700 // Clear the upper bits of the subvector and move it to its insert position.
4701 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4702 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4703 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4704 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4705
4706 // Isolate the bits below the insertion point.
4707 unsigned LowShift = NumElems - IdxVal;
4708 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4709 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4710 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4711 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4712
4713 // Isolate the bits after the last inserted bit.
4714 unsigned HighShift = IdxVal + SubVecNumElems;
4715 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4716 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4717 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4718 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4719
4720 // Now OR all 3 pieces together.
4721 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4722 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4723
4724 // Reduce to original width if needed.
4725 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4726}
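// Worked example (illustrative, assuming DQI so v8i1 is not widened):
// inserting a v2i1 subvector into a v8i1 vector at index 2 takes the middle
// path above with ShiftLeft = 8 - 2 = 6 and ShiftRight = 8 - 2 - 2 = 4, so the
// subvector lands in bits [2,3] while the inverted mask 0b11110011 preserves
// the remaining lanes of the original vector before the final OR.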
4727
4728 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4729 const SDLoc &dl) {
4730 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4731 EVT SubVT = V1.getValueType();
4732 EVT SubSVT = SubVT.getScalarType();
4733 unsigned SubNumElts = SubVT.getVectorNumElements();
4734 unsigned SubVectorWidth = SubVT.getSizeInBits();
4735 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4736 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4737 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4738}
4739
4740/// Returns a vector of specified type with all bits set.
4741/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4742/// Then bitcast to their original type, ensuring they get CSE'd.
4743static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4744 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4745 "Expected a 128/256/512-bit vector type");
4746 unsigned NumElts = VT.getSizeInBits() / 32;
4747 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4748 return DAG.getBitcast(VT, Vec);
4749}
4750
4751static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4752 SDValue In, SelectionDAG &DAG) {
4753 EVT InVT = In.getValueType();
4754 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4755
4756 // Canonicalize Opcode to general extension version.
4757 switch (Opcode) {
4758 case ISD::ANY_EXTEND:
4759 case ISD::ANY_EXTEND_VECTOR_INREG:
4760 Opcode = ISD::ANY_EXTEND;
4761 break;
4762 case ISD::SIGN_EXTEND:
4763 case ISD::SIGN_EXTEND_VECTOR_INREG:
4764 Opcode = ISD::SIGN_EXTEND;
4765 break;
4766 case ISD::ZERO_EXTEND:
4767 case ISD::ZERO_EXTEND_VECTOR_INREG:
4768 Opcode = ISD::ZERO_EXTEND;
4769 break;
4770 default:
4771 llvm_unreachable("Unknown extension opcode");
4772 }
4773
4774 // For 256-bit vectors, we only need the lower (128-bit) input half.
4775 // For 512-bit vectors, we only need the lower input half or quarter.
4776 if (InVT.getSizeInBits() > 128) {
4777 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4778 "Expected VTs to be the same size!");
4779 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4780 In = extractSubVector(In, 0, DAG, DL,
4781 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4782 InVT = In.getValueType();
4783 }
4784
4785 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4786 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4787
4788 return DAG.getNode(Opcode, DL, VT, In);
4789}
4790
4791// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4792static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4793 SDValue Mask, SelectionDAG &DAG) {
4794 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4795 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4796 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4797}
4798
4799 static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4800 bool Lo, bool Unary) {
4801 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4802 "Illegal vector type to unpack");
4803 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4804 int NumElts = VT.getVectorNumElements();
4805 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4806 for (int i = 0; i < NumElts; ++i) {
4807 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4808 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4809 Pos += (Unary ? 0 : NumElts * (i % 2));
4810 Pos += (Lo ? 0 : NumEltsInLane / 2);
4811 Mask.push_back(Pos);
4812 }
4813}
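// Worked example (illustrative): for VT = v8i16 with Lo = true, Unary = false
// the loop produces <0, 8, 1, 9, 2, 10, 3, 11>, the PUNPCKLWD interleave of
// the two inputs; with Unary = true the second-source indices fold back onto
// the first input, giving <0, 0, 1, 1, 2, 2, 3, 3>.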
4814
4815/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4816/// imposed by AVX and specific to the unary pattern. Example:
4817/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4818/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4819 static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4820 bool Lo) {
4821 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4822 int NumElts = VT.getVectorNumElements();
4823 for (int i = 0; i < NumElts; ++i) {
4824 int Pos = i / 2;
4825 Pos += (Lo ? 0 : NumElts / 2);
4826 Mask.push_back(Pos);
4827 }
4828}
4829
4830// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4831static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4832 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4833 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4834 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4835 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4836 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4837 int M = Mask[I];
4838 if (M < 0)
4839 continue;
4840 SDValue V = (M < NumElts) ? V1 : V2;
4841 if (V.isUndef())
4842 continue;
4843 Ops[I] = V.getOperand(M % NumElts);
4844 }
4845 return DAG.getBuildVector(VT, dl, Ops);
4846 }
4847
4848 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4849}
4850
4851/// Returns a vector_shuffle node for an unpackl operation.
4852static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4853 SDValue V1, SDValue V2) {
4854 SmallVector<int, 8> Mask;
4855 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4856 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4857}
4858
4859/// Returns a vector_shuffle node for an unpackh operation.
4860static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4861 SDValue V1, SDValue V2) {
4862 SmallVector<int, 8> Mask;
4863 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4864 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4865}
4866
4867/// Returns a node that packs the LHS + RHS nodes together at half width.
4868/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4869/// TODO: Add subvector splitting if/when we have a need for it.
4870static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4871 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4872 bool PackHiHalf = false) {
4873 MVT OpVT = LHS.getSimpleValueType();
4874 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4875 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4876 assert(OpVT == RHS.getSimpleValueType() &&
4877 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4878 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4879 "Unexpected PACK operand types");
4880 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4881 "Unexpected PACK result type");
4882
4883 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4884 if (EltSizeInBits == 32) {
4885 SmallVector<int> PackMask;
4886 int Offset = PackHiHalf ? 1 : 0;
4887 int NumElts = VT.getVectorNumElements();
4888 for (int I = 0; I != NumElts; I += 4) {
4889 PackMask.push_back(I + Offset);
4890 PackMask.push_back(I + Offset + 2);
4891 PackMask.push_back(I + Offset + NumElts);
4892 PackMask.push_back(I + Offset + NumElts + 2);
4893 }
4894 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4895 DAG.getBitcast(VT, RHS), PackMask);
4896 }
4897
4898 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4899 if (!PackHiHalf) {
4900 if (UsePackUS &&
4901 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4902 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4903 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4904
4905 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4906 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4907 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4908 }
4909
4910 // Fallback to sign/zero extending the requested half and pack.
4911 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4912 if (UsePackUS) {
4913 if (PackHiHalf) {
4914 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4915 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4916 } else {
4917 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4918 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4919 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4920 }
4921 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4922 }
4923
4924 if (!PackHiHalf) {
4925 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4926 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4927 }
4928 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4929 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4930 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4931}
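// Worked example (illustrative): packing v4i64 operands into VT = v8i32 with
// PackHiHalf = false takes the shuffle path above and builds the mask
// <0, 2, 8, 10, 4, 6, 12, 14>, selecting the low 32-bit half of every 64-bit
// element, interleaved per 128-bit lane exactly like the PACK instructions do
// for narrower elements.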
4932
4933 /// Return a vector_shuffle of the specified vector against a zero or undef vector.
4934/// This produces a shuffle where the low element of V2 is swizzled into the
4935/// zero/undef vector, landing at element Idx.
4936/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4937 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4938 bool IsZero,
4939 const X86Subtarget &Subtarget,
4940 SelectionDAG &DAG) {
4941 MVT VT = V2.getSimpleValueType();
4942 SDValue V1 = IsZero
4943 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4944 int NumElems = VT.getVectorNumElements();
4945 SmallVector<int, 16> MaskVec(NumElems);
4946 for (int i = 0; i != NumElems; ++i)
4947 // If this is the insertion idx, put the low elt of V2 here.
4948 MaskVec[i] = (i == Idx) ? NumElems : i;
4949 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4950}
4951
4952 static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4953 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4954 Ptr.getOpcode() == X86ISD::WrapperRIP)
4955 Ptr = Ptr.getOperand(0);
4956 return dyn_cast<ConstantPoolSDNode>(Ptr);
4957}
4958
4959// TODO: Add support for non-zero offsets.
4960 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4961 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4962 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4963 return nullptr;
4964 return CNode->getConstVal();
4965}
4966
4967 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4968 if (!Load || !ISD::isNormalLoad(Load))
4969 return nullptr;
4970 return getTargetConstantFromBasePtr(Load->getBasePtr());
4971}
4972
4973 static const Constant *getTargetConstantFromNode(SDValue Op) {
4974 Op = peekThroughBitcasts(Op);
4975 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4976}
4977
4978const Constant *
4979 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4980 assert(LD && "Unexpected null LoadSDNode");
4981 return getTargetConstantFromNode(LD);
4982}
4983
4985 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
4986 SDValue Cond = N->getOperand(0);
4987 SDValue RHS = N->getOperand(2);
4988 EVT CondVT = Cond.getValueType();
4989 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4990 CondVT.getVectorElementType() == MVT::i1 &&
4991 ISD::isBuildVectorAllZeros(RHS.getNode());
4992}
4993
4994// Extract raw constant bits from constant pools.
4995static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4996 APInt &UndefElts,
4997 SmallVectorImpl<APInt> &EltBits,
4998 bool AllowWholeUndefs = true,
4999 bool AllowPartialUndefs = false) {
5000 assert(EltBits.empty() && "Expected an empty EltBits vector");
5001
5002 Op = peekThroughBitcasts(Op);
5003
5004 EVT VT = Op.getValueType();
5005 unsigned SizeInBits = VT.getSizeInBits();
5006 unsigned NumElts = SizeInBits / EltSizeInBits;
5007
5008 // Can't split constant.
5009 if ((SizeInBits % EltSizeInBits) != 0)
5010 return false;
5011
5012 // Bitcast a source array of element bits to the target size.
5013 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5014 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5015 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5016 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5017 "Constant bit sizes don't match");
5018
5019 // Don't split if we don't allow undef bits.
5020 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5021 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5022 return false;
5023
5024 // If we're already the right size, don't bother bitcasting.
5025 if (NumSrcElts == NumElts) {
5026 UndefElts = UndefSrcElts;
5027 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5028 return true;
5029 }
5030
5031 // Extract all the undef/constant element data and pack into single bitsets.
5032 APInt UndefBits(SizeInBits, 0);
5033 APInt MaskBits(SizeInBits, 0);
5034
5035 for (unsigned i = 0; i != NumSrcElts; ++i) {
5036 unsigned BitOffset = i * SrcEltSizeInBits;
5037 if (UndefSrcElts[i])
5038 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5039 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5040 }
5041
5042 // Split the undef/constant single bitset data into the target elements.
5043 UndefElts = APInt(NumElts, 0);
5044 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5045
5046 for (unsigned i = 0; i != NumElts; ++i) {
5047 unsigned BitOffset = i * EltSizeInBits;
5048 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5049
5050 // Only treat an element as UNDEF if all bits are UNDEF.
5051 if (UndefEltBits.isAllOnes()) {
5052 if (!AllowWholeUndefs)
5053 return false;
5054 UndefElts.setBit(i);
5055 continue;
5056 }
5057
5058 // If only some bits are UNDEF then treat them as zero (or bail if not
5059 // supported).
5060 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5061 return false;
5062
5063 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5064 }
5065 return true;
5066 };
5067
5068 // Collect constant bits and insert into mask/undef bit masks.
5069 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5070 unsigned UndefBitIndex) {
5071 if (!Cst)
5072 return false;
5073 if (isa<UndefValue>(Cst)) {
5074 Undefs.setBit(UndefBitIndex);
5075 return true;
5076 }
5077 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5078 Mask = CInt->getValue();
5079 return true;
5080 }
5081 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5082 Mask = CFP->getValueAPF().bitcastToAPInt();
5083 return true;
5084 }
5085 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5086 Type *Ty = CDS->getType();
5087 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5088 Type *EltTy = CDS->getElementType();
5089 bool IsInteger = EltTy->isIntegerTy();
5090 bool IsFP =
5091 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5092 if (!IsInteger && !IsFP)
5093 return false;
5094 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5095 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5096 if (IsInteger)
5097 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5098 else
5099 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5100 I * EltBits);
5101 return true;
5102 }
5103 return false;
5104 };
5105
5106 // Handle UNDEFs.
5107 if (Op.isUndef()) {
5108 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5109 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5110 return CastBitData(UndefSrcElts, SrcEltBits);
5111 }
5112
5113 // Extract scalar constant bits.
5114 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5115 APInt UndefSrcElts = APInt::getZero(1);
5116 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5117 return CastBitData(UndefSrcElts, SrcEltBits);
5118 }
5119 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5120 APInt UndefSrcElts = APInt::getZero(1);
5121 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5122 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5123 return CastBitData(UndefSrcElts, SrcEltBits);
5124 }
5125
5126 // Extract constant bits from build vector.
5127 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5128 BitVector Undefs;
5129 SmallVector<APInt> SrcEltBits;
5130 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5131 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5132 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5133 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5134 if (Undefs[I])
5135 UndefSrcElts.setBit(I);
5136 return CastBitData(UndefSrcElts, SrcEltBits);
5137 }
5138 }
5139
5140 // Extract constant bits from constant pool vector.
5141 if (auto *Cst = getTargetConstantFromNode(Op)) {
5142 Type *CstTy = Cst->getType();
5143 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5144 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5145 return false;
5146
5147 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5148 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5149 if ((SizeInBits % SrcEltSizeInBits) != 0)
5150 return false;
5151
5152 APInt UndefSrcElts(NumSrcElts, 0);
5153 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5154 for (unsigned i = 0; i != NumSrcElts; ++i)
5155 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5156 UndefSrcElts, i))
5157 return false;
5158
5159 return CastBitData(UndefSrcElts, SrcEltBits);
5160 }
5161
5162 // Extract constant bits from a broadcasted constant pool scalar.
5163 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5164 EltSizeInBits <= VT.getScalarSizeInBits()) {
5165 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5166 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5167 return false;
5168
5169 SDValue Ptr = MemIntr->getBasePtr();
5170 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5171 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5172 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5173
5174 APInt UndefSrcElts(NumSrcElts, 0);
5175 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5176 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5177 if (UndefSrcElts[0])
5178 UndefSrcElts.setBits(0, NumSrcElts);
5179 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5180 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5181 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5182 return CastBitData(UndefSrcElts, SrcEltBits);
5183 }
5184 }
5185 }
5186
5187 // Extract constant bits from a subvector broadcast.
5188 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5189 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5190 SDValue Ptr = MemIntr->getBasePtr();
5191 // The source constant may be larger than the subvector broadcast,
5192 // ensure we extract the correct subvector constants.
5193 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5194 Type *CstTy = Cst->getType();
5195 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5196 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5197 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5198 (SizeInBits % SubVecSizeInBits) != 0)
5199 return false;
5200 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5201 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5202 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5203 APInt UndefSubElts(NumSubElts, 0);
5204 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5205 APInt(CstEltSizeInBits, 0));
5206 for (unsigned i = 0; i != NumSubElts; ++i) {
5207 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5208 UndefSubElts, i))
5209 return false;
5210 for (unsigned j = 1; j != NumSubVecs; ++j)
5211 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5212 }
5213 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5214 UndefSubElts);
5215 return CastBitData(UndefSubElts, SubEltBits);
5216 }
5217 }
5218
5219 // Extract a rematerialized scalar constant insertion.
5220 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5221 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5222 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5223 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5224 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5225
5226 APInt UndefSrcElts(NumSrcElts, 0);
5227 SmallVector<APInt, 64> SrcEltBits;
5228 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5229 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5230 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5231 return CastBitData(UndefSrcElts, SrcEltBits);
5232 }
5233
5234 // Insert constant bits from base and sub-vector sources.
5235 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5236 // If we bitcast to larger elements we might lose track of undefs, so don't
5237 // allow any to be safe.
5238 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5239 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5240
5241 APInt UndefSrcElts, UndefSubElts;
5242 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5243 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5244 UndefSubElts, EltSubBits,
5245 AllowWholeUndefs && AllowUndefs,
5246 AllowPartialUndefs && AllowUndefs) &&
5247 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5248 UndefSrcElts, EltSrcBits,
5249 AllowWholeUndefs && AllowUndefs,
5250 AllowPartialUndefs && AllowUndefs)) {
5251 unsigned BaseIdx = Op.getConstantOperandVal(2);
5252 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5253 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5254 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5255 return CastBitData(UndefSrcElts, EltSrcBits);
5256 }
5257 }
5258
5259 // Extract constant bits from a subvector's source.
5260 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5261 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5262 EltBits, AllowWholeUndefs,
5263 AllowPartialUndefs)) {
5264 EVT SrcVT = Op.getOperand(0).getValueType();
5265 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5266 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5267 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5268 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5269 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5270 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5271 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5272
5273 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5274 if ((BaseIdx + NumSubElts) != NumSrcElts)
5275 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5276 if (BaseIdx != 0)
5277 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5278 return true;
5279 }
5280
5281 // Extract constant bits from shuffle node sources.
5282 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5283 // TODO - support shuffle through bitcasts.
5284 if (EltSizeInBits != VT.getScalarSizeInBits())
5285 return false;
5286
5287 ArrayRef<int> Mask = SVN->getMask();
5288 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5289 llvm::any_of(Mask, [](int M) { return M < 0; }))
5290 return false;
5291
5292 APInt UndefElts0, UndefElts1;
5293 SmallVector<APInt, 32> EltBits0, EltBits1;
5294 if (isAnyInRange(Mask, 0, NumElts) &&
5295 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5296 UndefElts0, EltBits0, AllowWholeUndefs,
5297 AllowPartialUndefs))
5298 return false;
5299 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5300 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5301 UndefElts1, EltBits1, AllowWholeUndefs,
5302 AllowPartialUndefs))
5303 return false;
5304
5305 UndefElts = APInt::getZero(NumElts);
5306 for (int i = 0; i != (int)NumElts; ++i) {
5307 int M = Mask[i];
5308 if (M < 0) {
5309 UndefElts.setBit(i);
5310 EltBits.push_back(APInt::getZero(EltSizeInBits));
5311 } else if (M < (int)NumElts) {
5312 if (UndefElts0[M])
5313 UndefElts.setBit(i);
5314 EltBits.push_back(EltBits0[M]);
5315 } else {
5316 if (UndefElts1[M - NumElts])
5317 UndefElts.setBit(i);
5318 EltBits.push_back(EltBits1[M - NumElts]);
5319 }
5320 }
5321 return true;
5322 }
5323
5324 return false;
5325}
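// Worked example (illustrative): requesting 32-bit elements from a v2i64
// constant <0x0000000100000002, 0x0000000300000004> goes through CastBitData,
// which repacks the raw bits into four i32 elements {0x2, 0x1, 0x4, 0x3};
// each 64-bit source element contributes its low half first.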
5326
5327namespace llvm {
5328namespace X86 {
5329bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5330 APInt UndefElts;
5331 SmallVector<APInt, 16> EltBits;
5332 if (getTargetConstantBitsFromNode(
5333 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5334 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5335 int SplatIndex = -1;
5336 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5337 if (UndefElts[i])
5338 continue;
5339 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5340 SplatIndex = -1;
5341 break;
5342 }
5343 SplatIndex = i;
5344 }
5345 if (0 <= SplatIndex) {
5346 SplatVal = EltBits[SplatIndex];
5347 return true;
5348 }
5349 }
5350
5351 return false;
5352}
5353} // namespace X86
5354} // namespace llvm
5355
5356 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5357 unsigned MaskEltSizeInBits,
5358 SmallVectorImpl<uint64_t> &RawMask,
5359 APInt &UndefElts) {
5360 // Extract the raw target constant bits.
5361 SmallVector<APInt, 64> EltBits;
5362 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5363 EltBits, /* AllowWholeUndefs */ true,
5364 /* AllowPartialUndefs */ false))
5365 return false;
5366
5367 // Insert the extracted elements into the mask.
5368 for (const APInt &Elt : EltBits)
5369 RawMask.push_back(Elt.getZExtValue());
5370
5371 return true;
5372}
5373
5374 static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5375 bool AllowUndefs) {
5376 APInt UndefElts;
5377 SmallVector<APInt, 64> EltBits;
5378 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5379 /*AllowWholeUndefs*/ AllowUndefs,
5380 /*AllowPartialUndefs*/ false))
5381 return false;
5382
5383 bool IsPow2OrUndef = true;
5384 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5385 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5386 return IsPow2OrUndef;
5387}
5388
5389// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5390 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5391 // TODO: don't always ignore oneuse constraints.
5392 V = peekThroughBitcasts(V);
5393 EVT VT = V.getValueType();
5394
5395 // Match not(xor X, -1) -> X.
5396 if (V.getOpcode() == ISD::XOR &&
5397 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5398 isAllOnesConstant(V.getOperand(1))))
5399 return V.getOperand(0);
5400
5401 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5402 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5403 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5404 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5405 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5406 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5407 V.getOperand(1));
5408 }
5409 }
5410
5411 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5412 if (V.getOpcode() == X86ISD::PCMPGT &&
5413 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5414 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5415 V.getOperand(0).hasOneUse()) {
5416 APInt UndefElts;
5417 SmallVector<APInt> EltBits;
5418 if (getTargetConstantBitsFromNode(V.getOperand(0),
5419 V.getScalarValueSizeInBits(), UndefElts,
5420 EltBits) &&
5421 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5422 // Don't fold min_signed_value -> (min_signed_value - 1)
5423 bool MinSigned = false;
5424 for (APInt &Elt : EltBits) {
5425 MinSigned |= Elt.isMinSignedValue();
5426 Elt -= 1;
5427 }
5428 if (!MinSigned) {
5429 SDLoc DL(V);
5430 MVT VT = V.getSimpleValueType();
5431 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5432 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5433 }
5434 }
5435 }
5436
5437 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5438 SmallVector<SDValue, 2> CatOps;
5439 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5440 for (SDValue &CatOp : CatOps) {
5441 SDValue NotCat = IsNOT(CatOp, DAG);
5442 if (!NotCat)
5443 return SDValue();
5444 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5445 }
5446 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5447 }
5448
5449 // Match not(or(not(X),not(Y))) -> and(X, Y).
5450 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5451 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5452 // TODO: Handle cases with single NOT operand -> ANDNP
5453 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5454 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5455 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5456 DAG.getBitcast(VT, Op1));
5457 }
5458
5459 return SDValue();
5460}
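// Worked example (illustrative): for V = pcmpgt(<5,5,5,5>, X) the PCMPGT fold
// above returns pcmpgt(X, <4,4,4,4>), since NOT(5 > x) == (x >= 5) == (x > 4);
// the fold is skipped when any constant element is the minimum signed value,
// where the decrement would wrap.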
5461
5462/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5463/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5464/// Note: This ignores saturation, so inputs must be checked first.
5465 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5466 bool Unary, unsigned NumStages = 1) {
5467 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5468 unsigned NumElts = VT.getVectorNumElements();
5469 unsigned NumLanes = VT.getSizeInBits() / 128;
5470 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5471 unsigned Offset = Unary ? 0 : NumElts;
5472 unsigned Repetitions = 1u << (NumStages - 1);
5473 unsigned Increment = 1u << NumStages;
5474 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5475
5476 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5477 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5478 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5479 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5480 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5481 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5482 }
5483 }
5484}
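// Worked example (illustrative): for VT = v16i8 with Unary = false and
// NumStages = 1 the mask is <0, 2, 4, ..., 14, 16, 18, ..., 30>: the even
// elements of the first operand followed by the even elements of the second,
// i.e. a single PACKSSWB/PACKUSWB truncation stage within one 128-bit lane.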
5485
5486// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5487static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5488 APInt &DemandedLHS, APInt &DemandedRHS) {
5489 int NumLanes = VT.getSizeInBits() / 128;
5490 int NumElts = DemandedElts.getBitWidth();
5491 int NumInnerElts = NumElts / 2;
5492 int NumEltsPerLane = NumElts / NumLanes;
5493 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5494
5495 DemandedLHS = APInt::getZero(NumInnerElts);
5496 DemandedRHS = APInt::getZero(NumInnerElts);
5497
5498 // Map DemandedElts to the packed operands.
5499 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5500 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5501 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5502 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5503 if (DemandedElts[OuterIdx])
5504 DemandedLHS.setBit(InnerIdx);
5505 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5506 DemandedRHS.setBit(InnerIdx);
5507 }
5508 }
5509}
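// Worked example (illustrative): for a 128-bit pack producing v16i8, demanding
// result elements 3 and 11 sets bit 3 of DemandedLHS and bit 3 of DemandedRHS
// respectively, because the first eight outputs come from the v8i16 LHS
// operand and the next eight from the RHS operand.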
5510
5511// Split the demanded elts of a HADD/HSUB node between its operands.
5512static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5513 APInt &DemandedLHS, APInt &DemandedRHS) {
5514 getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5515 DemandedLHS, DemandedRHS);
5516 DemandedLHS |= DemandedLHS << 1;
5517 DemandedRHS |= DemandedRHS << 1;
5518}
5519
5520/// Calculates the shuffle mask corresponding to the target-specific opcode.
5521/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5522/// operands in \p Ops, and returns true.
5523/// Sets \p IsUnary to true if only one source is used. Note that this will set
5524/// IsUnary for shuffles which use a single input multiple times, and in those
5525/// cases it will adjust the mask to only have indices within that single input.
5526/// It is an error to call this with non-empty Mask/Ops vectors.
5527static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5528 SmallVectorImpl<SDValue> &Ops,
5529 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5530 if (!isTargetShuffle(N.getOpcode()))
5531 return false;
5532
5533 MVT VT = N.getSimpleValueType();
5534 unsigned NumElems = VT.getVectorNumElements();
5535 unsigned MaskEltSize = VT.getScalarSizeInBits();
5536 SmallVector<uint64_t, 32> RawMask;
5537 APInt RawUndefs;
5538 uint64_t ImmN;
5539
5540 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5541 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5542
5543 IsUnary = false;
5544 bool IsFakeUnary = false;
5545 switch (N.getOpcode()) {
5546 case X86ISD::BLENDI:
5547 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5548 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5549 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5550 DecodeBLENDMask(NumElems, ImmN, Mask);
5551 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5552 break;
5553 case X86ISD::SHUFP:
5554 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5555 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5556 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5557 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5558 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5559 break;
5560 case X86ISD::INSERTPS:
5561 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5562 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5563 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5564 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5565 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5566 break;
5567 case X86ISD::EXTRQI:
5568 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5569 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5570 isa<ConstantSDNode>(N.getOperand(2))) {
5571 int BitLen = N.getConstantOperandVal(1);
5572 int BitIdx = N.getConstantOperandVal(2);
5573 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5574 IsUnary = true;
5575 }
5576 break;
5577 case X86ISD::INSERTQI:
5578 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5579 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5580 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5581 isa<ConstantSDNode>(N.getOperand(3))) {
5582 int BitLen = N.getConstantOperandVal(2);
5583 int BitIdx = N.getConstantOperandVal(3);
5584 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5585 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5586 }
5587 break;
5588 case X86ISD::UNPCKH:
5589 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5590 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5591 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5592 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5593 break;
5594 case X86ISD::UNPCKL:
5595 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5596 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5597 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5598 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5599 break;
5600 case X86ISD::MOVHLPS:
5601 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5602 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5603 DecodeMOVHLPSMask(NumElems, Mask);
5604 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5605 break;
5606 case X86ISD::MOVLHPS:
5607 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5608 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5609 DecodeMOVLHPSMask(NumElems, Mask);
5610 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5611 break;
5612 case X86ISD::VALIGN:
5613 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5614 "Only 32-bit and 64-bit elements are supported!");
5615 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5616 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5617 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5618 DecodeVALIGNMask(NumElems, ImmN, Mask);
5619 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5620 Ops.push_back(N.getOperand(1));
5621 Ops.push_back(N.getOperand(0));
5622 break;
5623 case X86ISD::PALIGNR:
5624 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5625 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5626 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5627 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5628 DecodePALIGNRMask(NumElems, ImmN, Mask);
5629 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5630 Ops.push_back(N.getOperand(1));
5631 Ops.push_back(N.getOperand(0));
5632 break;
5633 case X86ISD::VSHLDQ:
5634 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5635 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5636 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5637 DecodePSLLDQMask(NumElems, ImmN, Mask);
5638 IsUnary = true;
5639 break;
5640 case X86ISD::VSRLDQ:
5641 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5642 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5643 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5644 DecodePSRLDQMask(NumElems, ImmN, Mask);
5645 IsUnary = true;
5646 break;
5647 case X86ISD::PSHUFD:
5648 case X86ISD::VPERMILPI:
5649 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5650 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5651 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5652 IsUnary = true;
5653 break;
5654 case X86ISD::PSHUFHW:
5655 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5656 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5657 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5658 IsUnary = true;
5659 break;
5660 case X86ISD::PSHUFLW:
5661 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5662 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5663 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5664 IsUnary = true;
5665 break;
5666 case X86ISD::VZEXT_MOVL:
5667 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5668 DecodeZeroMoveLowMask(NumElems, Mask);
5669 IsUnary = true;
5670 break;
5671 case X86ISD::VBROADCAST:
5672 // We only decode broadcasts of same-sized vectors; peeking through to
5673 // extracted subvectors is likely to cause hasOneUse issues with
5674 // SimplifyDemandedBits etc.
5675 if (N.getOperand(0).getValueType() == VT) {
5676 DecodeVectorBroadcast(NumElems, Mask);
5677 IsUnary = true;
5678 break;
5679 }
5680 return false;
5681 case X86ISD::VPERMILPV: {
5682 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5683 IsUnary = true;
5684 SDValue MaskNode = N.getOperand(1);
5685 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5686 RawUndefs)) {
5687 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5688 break;
5689 }
5690 return false;
5691 }
5692 case X86ISD::PSHUFB: {
5693 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5694 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5695 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5696 IsUnary = true;
5697 SDValue MaskNode = N.getOperand(1);
5698 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5699 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5700 break;
5701 }
5702 return false;
5703 }
5704 case X86ISD::VPERMI:
5705 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5706 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5707 DecodeVPERMMask(NumElems, ImmN, Mask);
5708 IsUnary = true;
5709 break;
5710 case X86ISD::MOVSS:
5711 case X86ISD::MOVSD:
5712 case X86ISD::MOVSH:
5713 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5714 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5715 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5716 break;
5717 case X86ISD::VPERM2X128:
5718 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5719 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5720 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5721 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5722 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5723 break;
5724 case X86ISD::SHUF128:
5725 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5726 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5727 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5728 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5729 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5730 break;
5731 case X86ISD::MOVSLDUP:
5732 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5733 DecodeMOVSLDUPMask(NumElems, Mask);
5734 IsUnary = true;
5735 break;
5736 case X86ISD::MOVSHDUP:
5737 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5738 DecodeMOVSHDUPMask(NumElems, Mask);
5739 IsUnary = true;
5740 break;
5741 case X86ISD::MOVDDUP:
5742 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5743 DecodeMOVDDUPMask(NumElems, Mask);
5744 IsUnary = true;
5745 break;
5746 case X86ISD::VPERMIL2: {
5747 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5748 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5749 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5750 SDValue MaskNode = N.getOperand(2);
5751 SDValue CtrlNode = N.getOperand(3);
5752 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5753 unsigned CtrlImm = CtrlOp->getZExtValue();
5754 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5755 RawUndefs)) {
5756 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5757 Mask);
5758 break;
5759 }
5760 }
5761 return false;
5762 }
5763 case X86ISD::VPPERM: {
5764 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5765 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5766 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5767 SDValue MaskNode = N.getOperand(2);
5768 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5769 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5770 break;
5771 }
5772 return false;
5773 }
5774 case X86ISD::VPERMV: {
5775 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5776 IsUnary = true;
5777 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5778 Ops.push_back(N.getOperand(1));
5779 SDValue MaskNode = N.getOperand(0);
5780 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5781 RawUndefs)) {
5782 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5783 break;
5784 }
5785 return false;
5786 }
5787 case X86ISD::VPERMV3: {
5788 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5789 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5790 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5791 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5792 Ops.push_back(N.getOperand(0));
5793 Ops.push_back(N.getOperand(2));
5794 SDValue MaskNode = N.getOperand(1);
5795 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5796 RawUndefs)) {
5797 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5798 break;
5799 }
5800 return false;
5801 }
5802 default:
5803 llvm_unreachable("unknown target shuffle node");
5804 }
5805
5806 // Empty mask indicates the decode failed.
5807 if (Mask.empty())
5808 return false;
5809
5810 // Check if we're getting a shuffle mask with zero'd elements.
5811 if (!AllowSentinelZero && isAnyZero(Mask))
5812 return false;
5813
5814 // If we have a fake unary shuffle, the shuffle mask is spread across two
5815 // inputs that are actually the same node. Re-map the mask to always point
5816 // into the first input.
5817 if (IsFakeUnary)
5818 for (int &M : Mask)
5819 if (M >= (int)Mask.size())
5820 M -= Mask.size();
5821
5822 // If we didn't already add operands in the opcode-specific code, default to
5823 // adding 1 or 2 operands starting at 0.
5824 if (Ops.empty()) {
5825 Ops.push_back(N.getOperand(0));
5826 if (!IsUnary || IsFakeUnary)
5827 Ops.push_back(N.getOperand(1));
5828 }
5829
5830 return true;
5831}
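// For illustration, the decodes above behave roughly as follows (worked
// examples, not exhaustive): an X86ISD::MOVDDUP on v2f64 yields the unary
// mask {0,0}, while an X86ISD::VPERM2X128 on v8f32 with immediate 0x31
// selects the high 128-bit half of each source and yields the mask
// {4,5,6,7, 12,13,14,15} across the two operands.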
5832
5833// Wrapper for getTargetShuffleMask that discards the IsUnary result.
5834static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5835 SmallVectorImpl<SDValue> &Ops,
5836 SmallVectorImpl<int> &Mask) {
5837 bool IsUnary;
5838 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5839}
5840
5841/// Compute whether each element of a shuffle is zeroable.
5842///
5843/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5844/// Either it is an undef element in the shuffle mask, the element of the input
5845/// referenced is undef, or the element of the input referenced is known to be
5846/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5847/// as many lanes with this technique as possible to simplify the remaining
5848/// shuffle.
5849static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5850 SDValue V1, SDValue V2,
5851 APInt &KnownUndef, APInt &KnownZero) {
5852 int Size = Mask.size();
5853 KnownUndef = KnownZero = APInt::getZero(Size);
5854
5855 V1 = peekThroughBitcasts(V1);
5856 V2 = peekThroughBitcasts(V2);
5857
5858 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5859 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5860
5861 int VectorSizeInBits = V1.getValueSizeInBits();
5862 int ScalarSizeInBits = VectorSizeInBits / Size;
5863 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5864
5865 for (int i = 0; i < Size; ++i) {
5866 int M = Mask[i];
5867 // Handle the easy cases.
5868 if (M < 0) {
5869 KnownUndef.setBit(i);
5870 continue;
5871 }
5872 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5873 KnownZero.setBit(i);
5874 continue;
5875 }
5876
5877 // Determine shuffle input and normalize the mask.
5878 SDValue V = M < Size ? V1 : V2;
5879 M %= Size;
5880
5881 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5882 if (V.getOpcode() != ISD::BUILD_VECTOR)
5883 continue;
5884
5885 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5886 // the (larger) source element must be UNDEF/ZERO.
5887 if ((Size % V.getNumOperands()) == 0) {
5888 int Scale = Size / V->getNumOperands();
5889 SDValue Op = V.getOperand(M / Scale);
5890 if (Op.isUndef())
5891 KnownUndef.setBit(i);
5892 if (X86::isZeroNode(Op))
5893 KnownZero.setBit(i);
5894 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5895 APInt Val = Cst->getAPIntValue();
5896 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5897 if (Val == 0)
5898 KnownZero.setBit(i);
5899 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5900 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5901 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5902 if (Val == 0)
5903 KnownZero.setBit(i);
5904 }
5905 continue;
5906 }
5907
5908 // If the BUILD_VECTOR has more elements, then all of the (smaller) source
5909 // elements must be UNDEF or ZERO.
5910 if ((V.getNumOperands() % Size) == 0) {
5911 int Scale = V->getNumOperands() / Size;
5912 bool AllUndef = true;
5913 bool AllZero = true;
5914 for (int j = 0; j < Scale; ++j) {
5915 SDValue Op = V.getOperand((M * Scale) + j);
5916 AllUndef &= Op.isUndef();
5917 AllZero &= X86::isZeroNode(Op);
5918 }
5919 if (AllUndef)
5920 KnownUndef.setBit(i);
5921 if (AllZero)
5922 KnownZero.setBit(i);
5923 continue;
5924 }
5925 }
5926}
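// Worked example: shuffling v4i32 V1 = build_vector(X, undef, 0, Y) against an
// all-zeros V2 with Mask = {1, 2, 4, 7} should mark element 0 as KnownUndef
// (V1[1] is undef) and elements 1..3 as KnownZero (V1[2] is a zero constant,
// and indices 4 and 7 reference the all-zeros V2).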
5927
5928/// Decode a target shuffle mask and inputs and see if any values are
5929/// known to be undef or zero from their inputs.
5930/// Returns true if the target shuffle mask was decoded.
5931/// FIXME: Merge this with computeZeroableShuffleElements?
5932static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5933 SmallVectorImpl<SDValue> &Ops,
5934 APInt &KnownUndef, APInt &KnownZero) {
5935 bool IsUnary;
5936 if (!isTargetShuffle(N.getOpcode()))
5937 return false;
5938
5939 MVT VT = N.getSimpleValueType();
5940 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5941 return false;
5942
5943 int Size = Mask.size();
5944 SDValue V1 = Ops[0];
5945 SDValue V2 = IsUnary ? V1 : Ops[1];
5946 KnownUndef = KnownZero = APInt::getZero(Size);
5947
5948 V1 = peekThroughBitcasts(V1);
5949 V2 = peekThroughBitcasts(V2);
5950
5951 assert((VT.getSizeInBits() % Size) == 0 &&
5952 "Illegal split of shuffle value type");
5953 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5954
5955 // Extract known constant input data.
5956 APInt UndefSrcElts[2];
5957 SmallVector<APInt, 32> SrcEltBits[2];
5958 bool IsSrcConstant[2] = {
5959 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5960 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5961 /*AllowPartialUndefs*/ false),
5962 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5963 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5964 /*AllowPartialUndefs*/ false)};
5965
5966 for (int i = 0; i < Size; ++i) {
5967 int M = Mask[i];
5968
5969 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5970 if (M < 0) {
5971 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5972 if (SM_SentinelUndef == M)
5973 KnownUndef.setBit(i);
5974 if (SM_SentinelZero == M)
5975 KnownZero.setBit(i);
5976 continue;
5977 }
5978
5979 // Determine shuffle input and normalize the mask.
5980 unsigned SrcIdx = M / Size;
5981 SDValue V = M < Size ? V1 : V2;
5982 M %= Size;
5983
5984 // We are referencing an UNDEF input.
5985 if (V.isUndef()) {
5986 KnownUndef.setBit(i);
5987 continue;
5988 }
5989
5990 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5991 // TODO: We currently only set UNDEF for integer types - floats use the same
5992 // registers as vectors and many of the scalar folded loads rely on the
5993 // SCALAR_TO_VECTOR pattern.
5994 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5995 (Size % V.getValueType().getVectorNumElements()) == 0) {
5996 int Scale = Size / V.getValueType().getVectorNumElements();
5997 int Idx = M / Scale;
5998 if (Idx != 0 && !VT.isFloatingPoint())
5999 KnownUndef.setBit(i);
6000 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6001 KnownZero.setBit(i);
6002 continue;
6003 }
6004
6005 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6006 // base vectors.
6007 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6008 SDValue Vec = V.getOperand(0);
6009 int NumVecElts = Vec.getValueType().getVectorNumElements();
6010 if (Vec.isUndef() && Size == NumVecElts) {
6011 int Idx = V.getConstantOperandVal(2);
6012 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6013 if (M < Idx || (Idx + NumSubElts) <= M)
6014 KnownUndef.setBit(i);
6015 }
6016 continue;
6017 }
6018
6019 // Attempt to extract from the source's constant bits.
6020 if (IsSrcConstant[SrcIdx]) {
6021 if (UndefSrcElts[SrcIdx][M])
6022 KnownUndef.setBit(i);
6023 else if (SrcEltBits[SrcIdx][M] == 0)
6024 KnownZero.setBit(i);
6025 }
6026 }
6027
6028 assert(VT.getVectorNumElements() == (unsigned)Size &&
6029 "Different mask size from vector size!");
6030 return true;
6031}
6032
6033// Replace target shuffle mask elements with known undef/zero sentinels.
6034static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6035 const APInt &KnownUndef,
6036 const APInt &KnownZero,
6037 bool ResolveKnownZeros= true) {
6038 unsigned NumElts = Mask.size();
6039 assert(KnownUndef.getBitWidth() == NumElts &&
6040 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6041
6042 for (unsigned i = 0; i != NumElts; ++i) {
6043 if (KnownUndef[i])
6044 Mask[i] = SM_SentinelUndef;
6045 else if (ResolveKnownZeros && KnownZero[i])
6046 Mask[i] = SM_SentinelZero;
6047 }
6048}
6049
6050// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6051static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6052 APInt &KnownUndef,
6053 APInt &KnownZero) {
6054 unsigned NumElts = Mask.size();
6055 KnownUndef = KnownZero = APInt::getZero(NumElts);
6056
6057 for (unsigned i = 0; i != NumElts; ++i) {
6058 int M = Mask[i];
6059 if (SM_SentinelUndef == M)
6060 KnownUndef.setBit(i);
6061 if (SM_SentinelZero == M)
6062 KnownZero.setBit(i);
6063 }
6064}
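// Worked example: Mask = {0, SM_SentinelUndef, SM_SentinelZero, 3} produces
// KnownUndef with bit 1 set and KnownZero with bit 2 set; feeding those back
// through resolveTargetShuffleFromZeroables reconstructs the same sentinels.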
6065
6066// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6067static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6068 SDValue Cond, bool IsBLENDV = false) {
6069 EVT CondVT = Cond.getValueType();
6070 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6071 unsigned NumElts = CondVT.getVectorNumElements();
6072
6073 APInt UndefElts;
6074 SmallVector<APInt, 32> EltBits;
6075 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6076 /*AllowWholeUndefs*/ true,
6077 /*AllowPartialUndefs*/ false))
6078 return false;
6079
6080 Mask.resize(NumElts, SM_SentinelUndef);
6081
6082 for (int i = 0; i != (int)NumElts; ++i) {
6083 Mask[i] = i;
6084 // Arbitrarily choose from the 2nd operand if the select condition element
6085 // is undef.
6086 // TODO: Can we do better by matching patterns such as even/odd?
6087 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6088 (IsBLENDV && EltBits[i].isNonNegative()))
6089 Mask[i] += NumElts;
6090 }
6091
6092 return true;
6093}
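// Worked example: for a VSELECT whose condition is the constant vector
// <-1, 0, -1, 0>, the resulting blend mask is {0, 5, 2, 7}; indices below
// NumElts pick from the "true" operand and indices at or above NumElts pick
// from the "false" operand.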
6094
6095// Forward declaration (for getFauxShuffleMask recursive check).
6096static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6097 SmallVectorImpl<SDValue> &Inputs,
6098 SmallVectorImpl<int> &Mask,
6099 const SelectionDAG &DAG, unsigned Depth,
6100 bool ResolveKnownElts);
6101
6102// Attempt to decode ops that could be represented as a shuffle mask.
6103// The decoded shuffle mask may contain a different number of elements to the
6104// destination value type.
6105// TODO: Merge into getTargetShuffleInputs()
6106static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6107 SmallVectorImpl<int> &Mask,
6108 SmallVectorImpl<SDValue> &Ops,
6109 const SelectionDAG &DAG, unsigned Depth,
6110 bool ResolveKnownElts) {
6111 Mask.clear();
6112 Ops.clear();
6113
6114 MVT VT = N.getSimpleValueType();
6115 unsigned NumElts = VT.getVectorNumElements();
6116 unsigned NumSizeInBits = VT.getSizeInBits();
6117 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6118 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6119 return false;
6120 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6121 unsigned NumSizeInBytes = NumSizeInBits / 8;
6122 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6123
6124 unsigned Opcode = N.getOpcode();
6125 switch (Opcode) {
6126 case ISD::VECTOR_SHUFFLE: {
6127 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6128 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6129 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6130 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6131 Ops.push_back(N.getOperand(0));
6132 Ops.push_back(N.getOperand(1));
6133 return true;
6134 }
6135 return false;
6136 }
6137 case ISD::AND:
6138 case X86ISD::ANDNP: {
6139 // Attempt to decode as a per-byte mask.
6140 APInt UndefElts;
6141 SmallVector<APInt, 32> EltBits;
6142 SDValue N0 = N.getOperand(0);
6143 SDValue N1 = N.getOperand(1);
6144 bool IsAndN = (X86ISD::ANDNP == Opcode);
6145 uint64_t ZeroMask = IsAndN ? 255 : 0;
6146 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6147 /*AllowWholeUndefs*/ false,
6148 /*AllowPartialUndefs*/ false))
6149 return false;
6150 // We can't assume an undef src element gives an undef dst - the other src
6151 // might be zero.
6152 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6153 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6154 const APInt &ByteBits = EltBits[i];
6155 if (ByteBits != 0 && ByteBits != 255)
6156 return false;
6157 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6158 }
6159 Ops.push_back(IsAndN ? N1 : N0);
6160 return true;
6161 }
6162 case ISD::OR: {
6163 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6164 // is a valid shuffle index.
6165 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6166 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6167 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6168 return false;
6169
6170 SmallVector<int, 64> SrcMask0, SrcMask1;
6171 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6172 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6173 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6174 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6175 Depth + 1, true) ||
6176 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6177 Depth + 1, true))
6178 return false;
6179
6180 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6181 SmallVector<int, 64> Mask0, Mask1;
6182 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6183 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6184 for (int i = 0; i != (int)MaskSize; ++i) {
6185 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6186 // loops converting between OR and BLEND shuffles due to
6187 // canWidenShuffleElements merging away undef elements, meaning we
6188 // fail to recognise the OR as the undef element isn't known zero.
6189 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6190 Mask.push_back(SM_SentinelZero);
6191 else if (Mask1[i] == SM_SentinelZero)
6192 Mask.push_back(i);
6193 else if (Mask0[i] == SM_SentinelZero)
6194 Mask.push_back(i + MaskSize);
6195 else
6196 return false;
6197 }
6198 Ops.push_back(N.getOperand(0));
6199 Ops.push_back(N.getOperand(1));
6200 return true;
6201 }
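// Worked example (sketch): if the two OR operands decode to the byte-level
// masks {0, Zero, 2, Zero} and {Zero, 1, Zero, 3}, the combined mask becomes
// {0, 5, 2, 7}, i.e. a blend of the original (un-peeked) OR operands.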
6202 case ISD::CONCAT_VECTORS: {
6203 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6204 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6205 if (NumBitsPerElt == 64) {
6206 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6207 for (unsigned M = 0; M != NumSubElts; ++M)
6208 Mask.push_back((I * NumElts) + M);
6209 Ops.push_back(N.getOperand(I));
6210 }
6211 return true;
6212 }
6213 return false;
6214 }
6215 case ISD::INSERT_SUBVECTOR: {
6216 SDValue Src = N.getOperand(0);
6217 SDValue Sub = N.getOperand(1);
6218 EVT SubVT = Sub.getValueType();
6219 unsigned NumSubElts = SubVT.getVectorNumElements();
6220 uint64_t InsertIdx = N.getConstantOperandVal(2);
6221 // Subvector isn't demanded - just return the base vector.
6222 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6223 Mask.resize(NumElts);
6224 std::iota(Mask.begin(), Mask.end(), 0);
6225 Ops.push_back(Src);
6226 return true;
6227 }
6228 // Handle CONCAT(SUB0, SUB1).
6229 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6230 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6231 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6232 Src.getOperand(0).isUndef() &&
6233 Src.getOperand(1).getValueType() == SubVT &&
6234 Src.getConstantOperandVal(2) == 0 &&
6235 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6236 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6237 Mask.resize(NumElts);
6238 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6239 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6240 Ops.push_back(Src.getOperand(1));
6241 Ops.push_back(Sub);
6242 return true;
6243 }
6244 if (!N->isOnlyUserOf(Sub.getNode()))
6245 return false;
6246
6247 SmallVector<int, 64> SubMask;
6248 SmallVector<SDValue, 2> SubInputs;
6249 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6250 EVT SubSrcVT = SubSrc.getValueType();
6251 if (!SubSrcVT.isVector())
6252 return false;
6253
6254 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6255 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6256 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6257 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6258 SDValue SubSrcSrc = SubSrc.getOperand(0);
6259 unsigned NumSubSrcSrcElts =
6260 SubSrcSrc.getValueType().getVectorNumElements();
6261 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6262 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6263 "Subvector valuetype mismatch");
6264 InsertIdx *= (MaxElts / NumElts);
6265 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6266 NumSubElts *= (MaxElts / NumElts);
6267 bool SrcIsUndef = Src.isUndef();
6268 for (int i = 0; i != (int)MaxElts; ++i)
6269 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6270 for (int i = 0; i != (int)NumSubElts; ++i)
6271 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6272 if (!SrcIsUndef)
6273 Ops.push_back(Src);
6274 Ops.push_back(SubSrcSrc);
6275 return true;
6276 }
6277
6278 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6279 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6280 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6281 Depth + 1, ResolveKnownElts))
6282 return false;
6283
6284 // Subvector shuffle inputs must not be larger than the subvector.
6285 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6286 return SubVT.getFixedSizeInBits() <
6287 SubInput.getValueSizeInBits().getFixedValue();
6288 }))
6289 return false;
6290
6291 if (SubMask.size() != NumSubElts) {
6292 assert(((SubMask.size() % NumSubElts) == 0 ||
6293 (NumSubElts % SubMask.size()) == 0) &&
6294 "Illegal submask scale");
6295 if ((NumSubElts % SubMask.size()) == 0) {
6296 int Scale = NumSubElts / SubMask.size();
6297 SmallVector<int, 64> ScaledSubMask;
6298 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6299 SubMask = ScaledSubMask;
6300 } else {
6301 int Scale = SubMask.size() / NumSubElts;
6302 NumSubElts = SubMask.size();
6303 NumElts *= Scale;
6304 InsertIdx *= Scale;
6305 }
6306 }
6307 Ops.push_back(Src);
6308 Ops.append(SubInputs.begin(), SubInputs.end());
6309 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6310 Mask.append(NumElts, SM_SentinelZero);
6311 else
6312 for (int i = 0; i != (int)NumElts; ++i)
6313 Mask.push_back(i);
6314 for (int i = 0; i != (int)NumSubElts; ++i) {
6315 int M = SubMask[i];
6316 if (0 <= M) {
6317 int InputIdx = M / NumSubElts;
6318 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6319 }
6320 Mask[i + InsertIdx] = M;
6321 }
6322 return true;
6323 }
6324 case X86ISD::PINSRB:
6325 case X86ISD::PINSRW:
6326 case ISD::SCALAR_TO_VECTOR:
6327 case ISD::INSERT_VECTOR_ELT: {
6328 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6329 // vector, for matching src/dst vector types.
6330 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6331
6332 unsigned DstIdx = 0;
6333 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6334 // Check we have an in-range constant insertion index.
6335 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6336 N.getConstantOperandAPInt(2).uge(NumElts))
6337 return false;
6338 DstIdx = N.getConstantOperandVal(2);
6339
6340 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6341 if (X86::isZeroNode(Scl)) {
6342 Ops.push_back(N.getOperand(0));
6343 for (unsigned i = 0; i != NumElts; ++i)
6344 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6345 return true;
6346 }
6347 }
6348
6349 // Peek through trunc/aext/zext/bitcast.
6350 // TODO: aext shouldn't require SM_SentinelZero padding.
6351 // TODO: handle shift of scalars.
6352 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6353 while (Scl.getOpcode() == ISD::TRUNCATE ||
6354 Scl.getOpcode() == ISD::ANY_EXTEND ||
6355 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6356 (Scl.getOpcode() == ISD::BITCAST &&
6357 Scl.getScalarValueSizeInBits() ==
6358 Scl.getOperand(0).getScalarValueSizeInBits())) {
6359 Scl = Scl.getOperand(0);
6360 MinBitsPerElt =
6361 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6362 }
6363 if ((MinBitsPerElt % 8) != 0)
6364 return false;
6365
6366 // Attempt to find the source vector the scalar was extracted from.
6367 SDValue SrcExtract;
6368 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6369 Scl.getOpcode() == X86ISD::PEXTRW ||
6370 Scl.getOpcode() == X86ISD::PEXTRB) &&
6371 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6372 SrcExtract = Scl;
6373 }
6374 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6375 return false;
6376
6377 SDValue SrcVec = SrcExtract.getOperand(0);
6378 EVT SrcVT = SrcVec.getValueType();
6379 if (!SrcVT.getScalarType().isByteSized())
6380 return false;
6381 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6382 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6383 unsigned DstByte = DstIdx * NumBytesPerElt;
6384 MinBitsPerElt =
6385 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6386
6387 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6388 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6389 Ops.push_back(SrcVec);
6390 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6391 } else {
6392 Ops.push_back(SrcVec);
6393 Ops.push_back(N.getOperand(0));
6394 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6395 Mask.push_back(NumSizeInBytes + i);
6396 }
6397
6398 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6399 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6400 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6401 Mask[DstByte + i] = SrcByte + i;
6402 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6403 Mask[DstByte + i] = SM_SentinelZero;
6404 return true;
6405 }
6406 case X86ISD::PACKSS:
6407 case X86ISD::PACKUS: {
6408 SDValue N0 = N.getOperand(0);
6409 SDValue N1 = N.getOperand(1);
6410 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6411 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6412 "Unexpected input value type");
6413
6414 APInt EltsLHS, EltsRHS;
6415 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6416
6417 // If we know input saturation won't happen (or we don't care for particular
6418 // lanes), we can treat this as a truncation shuffle.
6419 bool Offset0 = false, Offset1 = false;
6420 if (Opcode == X86ISD::PACKSS) {
6421 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6422 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6423 (!(N1.isUndef() || EltsRHS.isZero()) &&
6424 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6425 return false;
6426 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6427 // PACKSS then it was likely being used for sign-extension for a
6428 // truncation, so just peek through and adjust the mask accordingly.
6429 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6430 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6431 Offset0 = true;
6432 N0 = N0.getOperand(0);
6433 }
6434 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6435 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6436 Offset1 = true;
6437 N1 = N1.getOperand(0);
6438 }
6439 } else {
6440 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6441 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6442 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6443 (!(N1.isUndef() || EltsRHS.isZero()) &&
6444 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6445 return false;
6446 }
6447
6448 bool IsUnary = (N0 == N1);
6449
6450 Ops.push_back(N0);
6451 if (!IsUnary)
6452 Ops.push_back(N1);
6453
6454 createPackShuffleMask(VT, Mask, IsUnary);
6455
6456 if (Offset0 || Offset1) {
6457 for (int &M : Mask)
6458 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6459 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6460 ++M;
6461 }
6462 return true;
6463 }
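// Worked example: a two-operand v16i8 PACKUS of v8i16 inputs whose upper bytes
// are known zero decodes to the truncation mask
// {0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30} with both inputs viewed as
// v16i8; the Offset0/Offset1 adjustment above bumps the affected indices by
// one so that the high bytes are selected when a matching VSRAI was peeked
// through.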
6464 case ISD::VSELECT:
6465 case X86ISD::BLENDV: {
6466 SDValue Cond = N.getOperand(0);
6467 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6468 Ops.push_back(N.getOperand(1));
6469 Ops.push_back(N.getOperand(2));
6470 return true;
6471 }
6472 return false;
6473 }
6474 case X86ISD::VTRUNC: {
6475 SDValue Src = N.getOperand(0);
6476 EVT SrcVT = Src.getValueType();
6477 if (SrcVT.getSizeInBits() != NumSizeInBits)
6478 return false;
6479 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6480 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6481 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6482 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6483 for (unsigned i = 0; i != NumSrcElts; ++i)
6484 Mask.push_back(i * Scale);
6485 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6486 Ops.push_back(Src);
6487 return true;
6488 }
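// Worked example: an X86ISD::VTRUNC from v2i64 to v4i32 decodes to the mask
// {0, 2, Zero, Zero} with the source viewed as v4i32 (the truncated results
// land in the low elements and the rest of the destination is zero).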
6489 case ISD::SHL:
6490 case ISD::SRL: {
6491 APInt UndefElts;
6492 SmallVector<APInt, 32> EltBits;
6493 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6494 UndefElts, EltBits,
6495 /*AllowWholeUndefs*/ true,
6496 /*AllowPartialUndefs*/ false))
6497 return false;
6498
6499 // We can only decode 'whole byte' bit shifts as shuffles.
6500 for (unsigned I = 0; I != NumElts; ++I)
6501 if (DemandedElts[I] && !UndefElts[I] &&
6502 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6503 return false;
6504
6505 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6506 Ops.push_back(N.getOperand(0));
6507
6508 for (unsigned I = 0; I != NumElts; ++I) {
6509 if (!DemandedElts[I] || UndefElts[I])
6510 continue;
6511 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6512 unsigned Lo = I * NumBytesPerElt;
6513 unsigned Hi = Lo + NumBytesPerElt;
6514 // Clear mask to all zeros and insert the shifted byte indices.
6515 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6516 if (ISD::SHL == Opcode)
6517 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6518 else
6519 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6520 Lo + ByteShift);
6521 }
6522 return true;
6523 }
6524 case X86ISD::VSHLI:
6525 case X86ISD::VSRLI: {
6526 uint64_t ShiftVal = N.getConstantOperandVal(1);
6527 // Out of range bit shifts are guaranteed to be zero.
6528 if (NumBitsPerElt <= ShiftVal) {
6529 Mask.append(NumElts, SM_SentinelZero);
6530 return true;
6531 }
6532
6533 // We can only decode 'whole byte' bit shifts as shuffles.
6534 if ((ShiftVal % 8) != 0)
6535 break;
6536
6537 uint64_t ByteShift = ShiftVal / 8;
6538 Ops.push_back(N.getOperand(0));
6539
6540 // Clear mask to all zeros and insert the shifted byte indices.
6541 Mask.append(NumSizeInBytes, SM_SentinelZero);
6542
6543 if (X86ISD::VSHLI == Opcode) {
6544 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6545 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6546 Mask[i + j] = i + j - ByteShift;
6547 } else {
6548 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6549 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6550 Mask[i + j - ByteShift] = i + j;
6551 }
6552 return true;
6553 }
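// Worked example: X86ISD::VSRLI v4i32 by 16 decodes to the per-element byte
// mask {2,3,Zero,Zero}, i.e. {2,3,Z,Z, 6,7,Z,Z, 10,11,Z,Z, 14,15,Z,Z} overall,
// while the corresponding VSHLI by 16 gives {Z,Z,0,1} per element.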
6554 case X86ISD::VROTLI:
6555 case X86ISD::VROTRI: {
6556 // We can only decode 'whole byte' bit rotates as shuffles.
6557 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6558 if ((RotateVal % 8) != 0)
6559 return false;
6560 Ops.push_back(N.getOperand(0));
6561 int Offset = RotateVal / 8;
6562 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6563 for (int i = 0; i != (int)NumElts; ++i) {
6564 int BaseIdx = i * NumBytesPerElt;
6565 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6566 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6567 }
6568 }
6569 return true;
6570 }
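// Worked example: X86ISD::VROTLI v4i32 by 8 decodes to the per-element byte
// mask {3,0,1,2} (each 32-bit lane's bytes rotate left by one byte), and the
// matching VROTRI by 8 gives {1,2,3,0}.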
6571 case X86ISD::VBROADCAST: {
6572 SDValue Src = N.getOperand(0);
6573 if (!Src.getSimpleValueType().isVector()) {
6574 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6575 !isNullConstant(Src.getOperand(1)) ||
6576 Src.getOperand(0).getValueType().getScalarType() !=
6577 VT.getScalarType())
6578 return false;
6579 Src = Src.getOperand(0);
6580 }
6581 Ops.push_back(Src);
6582 Mask.append(NumElts, 0);
6583 return true;
6584 }
6585 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6586 SDValue Src = N.getOperand(0);
6587 EVT SrcVT = Src.getValueType();
6588 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6589
6590 // Extended source must be a simple vector.
6591 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6592 (NumBitsPerSrcElt % 8) != 0)
6593 return false;
6594
6595 // We can only handle all-signbits extensions.
6596 APInt DemandedSrcElts =
6597 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6598 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6599 return false;
6600
6601 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6602 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6603 for (unsigned I = 0; I != NumElts; ++I)
6604 Mask.append(Scale, I);
6605 Ops.push_back(Src);
6606 return true;
6607 }
6608 case ISD::ZERO_EXTEND:
6609 case ISD::ANY_EXTEND:
6610 case ISD::ZERO_EXTEND_VECTOR_INREG:
6611 case ISD::ANY_EXTEND_VECTOR_INREG: {
6612 SDValue Src = N.getOperand(0);
6613 EVT SrcVT = Src.getValueType();
6614
6615 // Extended source must be a simple vector.
6616 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6617 (SrcVT.getScalarSizeInBits() % 8) != 0)
6618 return false;
6619
6620 bool IsAnyExtend =
6621 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6622 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6623 IsAnyExtend, Mask);
6624 Ops.push_back(Src);
6625 return true;
6626 }
6627 }
6628
6629 return false;
6630}
6631
6632/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6634 SmallVectorImpl<int> &Mask) {
6635 int MaskWidth = Mask.size();
6636 SmallVector<SDValue, 16> UsedInputs;
6637 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6638 int lo = UsedInputs.size() * MaskWidth;
6639 int hi = lo + MaskWidth;
6640
6641 // Strip UNDEF input usage.
6642 if (Inputs[i].isUndef())
6643 for (int &M : Mask)
6644 if ((lo <= M) && (M < hi))
6645 M = SM_SentinelUndef;
6646
6647 // Check for unused inputs.
6648 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6649 for (int &M : Mask)
6650 if (lo <= M)
6651 M -= MaskWidth;
6652 continue;
6653 }
6654
6655 // Check for repeated inputs.
6656 bool IsRepeat = false;
6657 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6658 if (UsedInputs[j] != Inputs[i])
6659 continue;
6660 for (int &M : Mask)
6661 if (lo <= M)
6662 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6663 IsRepeat = true;
6664 break;
6665 }
6666 if (IsRepeat)
6667 continue;
6668
6669 UsedInputs.push_back(Inputs[i]);
6670 }
6671 Inputs = UsedInputs;
6672}
6673
6674/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6675/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6676/// Returns true if the target shuffle mask was decoded.
6677static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6678 SmallVectorImpl<SDValue> &Inputs,
6679 SmallVectorImpl<int> &Mask,
6680 APInt &KnownUndef, APInt &KnownZero,
6681 const SelectionDAG &DAG, unsigned Depth,
6682 bool ResolveKnownElts) {
6683 if (Depth >= SelectionDAG::MaxRecursionDepth)
6684 return false; // Limit search depth.
6685
6686 EVT VT = Op.getValueType();
6687 if (!VT.isSimple() || !VT.isVector())
6688 return false;
6689
6690 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6691 if (ResolveKnownElts)
6692 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6693 return true;
6694 }
6695 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6696 ResolveKnownElts)) {
6697 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6698 return true;
6699 }
6700 return false;
6701}
6702
6703static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6704 SmallVectorImpl<SDValue> &Inputs,
6705 SmallVectorImpl<int> &Mask,
6706 const SelectionDAG &DAG, unsigned Depth,
6707 bool ResolveKnownElts) {
6708 APInt KnownUndef, KnownZero;
6709 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6710 KnownZero, DAG, Depth, ResolveKnownElts);
6711}
6712
6713static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6714 SmallVectorImpl<int> &Mask,
6715 const SelectionDAG &DAG, unsigned Depth = 0,
6716 bool ResolveKnownElts = true) {
6717 EVT VT = Op.getValueType();
6718 if (!VT.isSimple() || !VT.isVector())
6719 return false;
6720
6721 unsigned NumElts = Op.getValueType().getVectorNumElements();
6722 APInt DemandedElts = APInt::getAllOnes(NumElts);
6723 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6724 ResolveKnownElts);
6725}
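// Illustrative usage (hypothetical caller sketch, not taken from this file):
//   SmallVector<SDValue, 2> Inputs;
//   SmallVector<int, 64> Mask;
//   if (getTargetShuffleInputs(V, Inputs, Mask, DAG)) {
//     // Mask[i] now indexes into the concatenation of Inputs, or holds the
//     // SM_SentinelUndef / SM_SentinelZero sentinels.
//   }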
6726
6727// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6728static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6729 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6730 SelectionDAG &DAG) {
6731 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6732 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6733 "Unknown broadcast load type");
6734
6735 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6736 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6737 return SDValue();
6738
6739 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6740 TypeSize::getFixed(Offset), DL);
6741 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6742 SDValue Ops[] = {Mem->getChain(), Ptr};
6743 SDValue BcstLd = DAG.getMemIntrinsicNode(
6744 Opcode, DL, Tys, Ops, MemVT,
6745 DAG.getMachineFunction().getMachineMemOperand(
6746 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6747 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6748 return BcstLd;
6749}
6750
6751/// Returns the scalar element that will make up the i'th
6752/// element of the result of the vector shuffle.
6753static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6754 SelectionDAG &DAG, unsigned Depth) {
6755 if (Depth >= SelectionDAG::MaxRecursionDepth)
6756 return SDValue(); // Limit search depth.
6757
6758 EVT VT = Op.getValueType();
6759 unsigned Opcode = Op.getOpcode();
6760 unsigned NumElems = VT.getVectorNumElements();
6761
6762 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6763 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6764 int Elt = SV->getMaskElt(Index);
6765
6766 if (Elt < 0)
6767 return DAG.getUNDEF(VT.getVectorElementType());
6768
6769 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6770 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6771 }
6772
6773 // Recurse into target specific vector shuffles to find scalars.
6774 if (isTargetShuffle(Opcode)) {
6775 MVT ShufVT = VT.getSimpleVT();
6776 MVT ShufSVT = ShufVT.getVectorElementType();
6777 int NumElems = (int)ShufVT.getVectorNumElements();
6778 SmallVector<int, 16> ShuffleMask;
6779 SmallVector<SDValue, 16> ShuffleOps;
6780 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6781 return SDValue();
6782
6783 int Elt = ShuffleMask[Index];
6784 if (Elt == SM_SentinelZero)
6785 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6786 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6787 if (Elt == SM_SentinelUndef)
6788 return DAG.getUNDEF(ShufSVT);
6789
6790 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6791 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6792 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6793 }
6794
6795 // Recurse into insert_subvector base/sub vector to find scalars.
6796 if (Opcode == ISD::INSERT_SUBVECTOR) {
6797 SDValue Vec = Op.getOperand(0);
6798 SDValue Sub = Op.getOperand(1);
6799 uint64_t SubIdx = Op.getConstantOperandVal(2);
6800 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6801
6802 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6803 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6804 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6805 }
6806
6807 // Recurse into concat_vectors sub vector to find scalars.
6808 if (Opcode == ISD::CONCAT_VECTORS) {
6809 EVT SubVT = Op.getOperand(0).getValueType();
6810 unsigned NumSubElts = SubVT.getVectorNumElements();
6811 uint64_t SubIdx = Index / NumSubElts;
6812 uint64_t SubElt = Index % NumSubElts;
6813 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6814 }
6815
6816 // Recurse into extract_subvector src vector to find scalars.
6817 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6818 SDValue Src = Op.getOperand(0);
6819 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6820 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6821 }
6822
6823 // We only peek through bitcasts of the same vector width.
6824 if (Opcode == ISD::BITCAST) {
6825 SDValue Src = Op.getOperand(0);
6826 EVT SrcVT = Src.getValueType();
6827 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6828 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6829 return SDValue();
6830 }
6831
6832 // Actual nodes that may contain scalar elements
6833
6834 // For insert_vector_elt - either return the index matching scalar or recurse
6835 // into the base vector.
6836 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6837 isa<ConstantSDNode>(Op.getOperand(2))) {
6838 if (Op.getConstantOperandAPInt(2) == Index)
6839 return Op.getOperand(1);
6840 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6841 }
6842
6843 if (Opcode == ISD::SCALAR_TO_VECTOR)
6844 return (Index == 0) ? Op.getOperand(0)
6845 : DAG.getUNDEF(VT.getVectorElementType());
6846
6847 if (Opcode == ISD::BUILD_VECTOR)
6848 return Op.getOperand(Index);
6849
6850 return SDValue();
6851}
6852
6853// Use PINSRB/PINSRW/PINSRD to create a build vector.
6854static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6855 const APInt &NonZeroMask,
6856 unsigned NumNonZero, unsigned NumZero,
6857 SelectionDAG &DAG,
6858 const X86Subtarget &Subtarget) {
6859 MVT VT = Op.getSimpleValueType();
6860 unsigned NumElts = VT.getVectorNumElements();
6861 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6862 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6863 "Illegal vector insertion");
6864
6865 SDValue V;
6866 bool First = true;
6867
6868 for (unsigned i = 0; i < NumElts; ++i) {
6869 bool IsNonZero = NonZeroMask[i];
6870 if (!IsNonZero)
6871 continue;
6872
6873 // If the build vector contains zeros or our first insertion is not the
6874 // first index then insert into zero vector to break any register
6875 // dependency else use SCALAR_TO_VECTOR.
6876 if (First) {
6877 First = false;
6878 if (NumZero || 0 != i)
6879 V = getZeroVector(VT, Subtarget, DAG, DL);
6880 else {
6881 assert(0 == i && "Expected insertion into zero-index");
6882 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6883 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6884 V = DAG.getBitcast(VT, V);
6885 continue;
6886 }
6887 }
6888 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6889 DAG.getVectorIdxConstant(i, DL));
6890 }
6891
6892 return V;
6893}
6894
6895/// Custom lower build_vector of v16i8.
6896static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6897 const APInt &NonZeroMask,
6898 unsigned NumNonZero, unsigned NumZero,
6899 SelectionDAG &DAG,
6900 const X86Subtarget &Subtarget) {
6901 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6902 return SDValue();
6903
6904 // SSE4.1 - use PINSRB to insert each byte directly.
6905 if (Subtarget.hasSSE41())
6906 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6907 DAG, Subtarget);
6908
6909 SDValue V;
6910
6911 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6912 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6913 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6914 !NonZeroMask.extractBits(2, 2).isZero()) {
6915 for (unsigned I = 0; I != 4; ++I) {
6916 if (!NonZeroMask[I])
6917 continue;
6918 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6919 if (I != 0)
6920 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6921 DAG.getConstant(I * 8, DL, MVT::i8));
6922 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6923 }
6924 assert(V && "Failed to fold v16i8 vector to zero");
6925 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6926 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6927 V = DAG.getBitcast(MVT::v8i16, V);
6928 }
6929 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6930 bool ThisIsNonZero = NonZeroMask[i];
6931 bool NextIsNonZero = NonZeroMask[i + 1];
6932 if (!ThisIsNonZero && !NextIsNonZero)
6933 continue;
6934
6935 SDValue Elt;
6936 if (ThisIsNonZero) {
6937 if (NumZero || NextIsNonZero)
6938 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6939 else
6940 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6941 }
6942
6943 if (NextIsNonZero) {
6944 SDValue NextElt = Op.getOperand(i + 1);
6945 if (i == 0 && NumZero)
6946 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6947 else
6948 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6949 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6950 DAG.getConstant(8, DL, MVT::i8));
6951 if (ThisIsNonZero)
6952 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6953 else
6954 Elt = NextElt;
6955 }
6956
6957 // If our first insertion is not the first index or zeros are needed, then
6958 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6959 // elements undefined).
6960 if (!V) {
6961 if (i != 0 || NumZero)
6962 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6963 else {
6964 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6965 V = DAG.getBitcast(MVT::v8i16, V);
6966 continue;
6967 }
6968 }
6969 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6970 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6971 DAG.getVectorIdxConstant(i / 2, DL));
6972 }
6973
6974 return DAG.getBitcast(MVT::v16i8, V);
6975}
6976
6977/// Custom lower build_vector of v8i16.
6978static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6979 const APInt &NonZeroMask,
6980 unsigned NumNonZero, unsigned NumZero,
6981 SelectionDAG &DAG,
6982 const X86Subtarget &Subtarget) {
6983 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6984 return SDValue();
6985
6986 // Use PINSRW to insert each byte directly.
6987 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6988 Subtarget);
6989}
6990
6991/// Custom lower build_vector of v4i32 or v4f32.
6992static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6993 SelectionDAG &DAG,
6994 const X86Subtarget &Subtarget) {
6995 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6996 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6997 // Because we're creating a less complicated build vector here, we may enable
6998 // further folding of the MOVDDUP via shuffle transforms.
6999 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7000 Op.getOperand(0) == Op.getOperand(2) &&
7001 Op.getOperand(1) == Op.getOperand(3) &&
7002 Op.getOperand(0) != Op.getOperand(1)) {
7003 MVT VT = Op.getSimpleValueType();
7004 MVT EltVT = VT.getVectorElementType();
7005 // Create a new build vector with the first 2 elements followed by undef
7006 // padding, bitcast to v2f64, duplicate, and bitcast back.
7007 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7008 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7009 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7010 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7011 return DAG.getBitcast(VT, Dup);
7012 }
7013
7014 // Find all zeroable elements.
7015 std::bitset<4> Zeroable, Undefs;
7016 for (int i = 0; i < 4; ++i) {
7017 SDValue Elt = Op.getOperand(i);
7018 Undefs[i] = Elt.isUndef();
7019 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7020 }
7021 assert(Zeroable.size() - Zeroable.count() > 1 &&
7022 "We expect at least two non-zero elements!");
7023
7024 // We only know how to deal with build_vector nodes where elements are either
7025 // zeroable or extract_vector_elt with constant index.
7026 SDValue FirstNonZero;
7027 unsigned FirstNonZeroIdx;
7028 for (unsigned i = 0; i < 4; ++i) {
7029 if (Zeroable[i])
7030 continue;
7031 SDValue Elt = Op.getOperand(i);
7032 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7033 !isa<ConstantSDNode>(Elt.getOperand(1)))
7034 return SDValue();
7035 // Make sure that this node is extracting from a 128-bit vector.
7036 MVT VT = Elt.getOperand(0).getSimpleValueType();
7037 if (!VT.is128BitVector())
7038 return SDValue();
7039 if (!FirstNonZero.getNode()) {
7040 FirstNonZero = Elt;
7041 FirstNonZeroIdx = i;
7042 }
7043 }
7044
7045 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7046 SDValue V1 = FirstNonZero.getOperand(0);
7047 MVT VT = V1.getSimpleValueType();
7048
7049 // See if this build_vector can be lowered as a blend with zero.
7050 SDValue Elt;
7051 unsigned EltMaskIdx, EltIdx;
7052 int Mask[4];
7053 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7054 if (Zeroable[EltIdx]) {
7055 // The zero vector will be on the right hand side.
7056 Mask[EltIdx] = EltIdx+4;
7057 continue;
7058 }
7059
7060 Elt = Op->getOperand(EltIdx);
7061 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7062 EltMaskIdx = Elt.getConstantOperandVal(1);
7063 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7064 break;
7065 Mask[EltIdx] = EltIdx;
7066 }
7067
7068 if (EltIdx == 4) {
7069 // Let the shuffle legalizer deal with blend operations.
7070 SDValue VZeroOrUndef = (Zeroable == Undefs)
7071 ? DAG.getUNDEF(VT)
7072 : getZeroVector(VT, Subtarget, DAG, DL);
7073 if (V1.getSimpleValueType() != VT)
7074 V1 = DAG.getBitcast(VT, V1);
7075 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7076 }
7077
7078 // See if we can lower this build_vector to a INSERTPS.
7079 if (!Subtarget.hasSSE41())
7080 return SDValue();
7081
7082 SDValue V2 = Elt.getOperand(0);
7083 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7084 V1 = SDValue();
7085
7086 bool CanFold = true;
7087 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7088 if (Zeroable[i])
7089 continue;
7090
7091 SDValue Current = Op->getOperand(i);
7092 SDValue SrcVector = Current->getOperand(0);
7093 if (!V1.getNode())
7094 V1 = SrcVector;
7095 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7096 }
7097
7098 if (!CanFold)
7099 return SDValue();
7100
7101 assert(V1.getNode() && "Expected at least two non-zero elements!");
7102 if (V1.getSimpleValueType() != MVT::v4f32)
7103 V1 = DAG.getBitcast(MVT::v4f32, V1);
7104 if (V2.getSimpleValueType() != MVT::v4f32)
7105 V2 = DAG.getBitcast(MVT::v4f32, V2);
7106
7107 // Ok, we can emit an INSERTPS instruction.
7108 unsigned ZMask = Zeroable.to_ulong();
7109
7110 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7111 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7112 SDValue Result =
7113 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7114 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7115 return DAG.getBitcast(VT, Result);
7116}
7117
7118/// Return a vector logical shift node.
7119static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7120 SelectionDAG &DAG, const TargetLowering &TLI,
7121 const SDLoc &dl) {
7122 assert(VT.is128BitVector() && "Unknown type for VShift");
7123 MVT ShVT = MVT::v16i8;
7124 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7125 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7126 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7127 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7128 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7129}
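// Worked example: getVShift(/*isLeft=*/true, MVT::v2i64, X, /*NumBits=*/64, ...)
// bitcasts X to v16i8, emits X86ISD::VSHLDQ with a byte immediate of 8
// (i.e. PSLLDQ $8), and bitcasts the result back to v2i64.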
7130
7131static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7132 SelectionDAG &DAG) {
7133
7134 // Check if the scalar load can be widened into a vector load. And if
7135 // the address is "base + cst" see if the cst can be "absorbed" into
7136 // the shuffle mask.
7137 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7138 SDValue Ptr = LD->getBasePtr();
7139 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7140 return SDValue();
7141 EVT PVT = LD->getValueType(0);
7142 if (PVT != MVT::i32 && PVT != MVT::f32)
7143 return SDValue();
7144
7145 int FI = -1;
7146 int64_t Offset = 0;
7147 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7148 FI = FINode->getIndex();
7149 Offset = 0;
7150 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7151 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7152 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7153 Offset = Ptr.getConstantOperandVal(1);
7154 Ptr = Ptr.getOperand(0);
7155 } else {
7156 return SDValue();
7157 }
7158
7159 // FIXME: 256-bit vector instructions don't require a strict alignment,
7160 // improve this code to support it better.
7161 Align RequiredAlign(VT.getSizeInBits() / 8);
7162 SDValue Chain = LD->getChain();
7163 // Make sure the stack object alignment is at least 16 or 32.
7164 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7165 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7166 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7167 if (MFI.isFixedObjectIndex(FI)) {
7168 // Can't change the alignment. FIXME: It's possible to compute
7169 // the exact stack offset and reference FI + adjust offset instead.
7170 // If someone *really* cares about this, that's the way to implement it.
7171 return SDValue();
7172 } else {
7173 MFI.setObjectAlignment(FI, RequiredAlign);
7174 }
7175 }
7176
7177 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7178 // Ptr + (Offset & ~15).
7179 if (Offset < 0)
7180 return SDValue();
7181 if ((Offset % RequiredAlign.value()) & 3)
7182 return SDValue();
7183 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7184 if (StartOffset) {
7185 SDLoc DL(Ptr);
7186 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7187 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7188 }
7189
7190 int EltNo = (Offset - StartOffset) >> 2;
7191 unsigned NumElems = VT.getVectorNumElements();
7192
7193 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7194 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7195 LD->getPointerInfo().getWithOffset(StartOffset));
7196
7197 SmallVector<int, 8> Mask(NumElems, EltNo);
7198
7199 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7200 }
7201
7202 return SDValue();
7203}
7204
7205// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7206static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7207 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7208 auto *BaseLd = cast<LoadSDNode>(Elt);
7209 if (!BaseLd->isSimple())
7210 return false;
7211 Ld = BaseLd;
7212 ByteOffset = 0;
7213 return true;
7214 }
7215
7216 switch (Elt.getOpcode()) {
7217 case ISD::BITCAST:
7218 case ISD::TRUNCATE:
7219 case ISD::SCALAR_TO_VECTOR:
7220 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7221 case ISD::SRL:
7222 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7223 uint64_t Amt = AmtC->getZExtValue();
7224 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7225 ByteOffset += Amt / 8;
7226 return true;
7227 }
7228 }
7229 break;
7230 case ISD::EXTRACT_VECTOR_ELT:
7231 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7232 SDValue Src = Elt.getOperand(0);
7233 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7234 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7235 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7236 findEltLoadSrc(Src, Ld, ByteOffset)) {
7237 uint64_t Idx = IdxC->getZExtValue();
7238 ByteOffset += Idx * (SrcSizeInBits / 8);
7239 return true;
7240 }
7241 }
7242 break;
7243 }
7244
7245 return false;
7246}
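// Worked example: for Elt = (i32 (trunc (srl (i64 (load %p)), 16))) this
// resolves Ld to the i64 load of %p with an accumulated ByteOffset of 2.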
7247
7248/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7249/// elements can be replaced by a single large load which has the same value as
7250/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7251///
7252/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7254 const SDLoc &DL, SelectionDAG &DAG,
7255 const X86Subtarget &Subtarget,
7256 bool IsAfterLegalize) {
7257 if ((VT.getScalarSizeInBits() % 8) != 0)
7258 return SDValue();
7259
7260 unsigned NumElems = Elts.size();
7261
7262 int LastLoadedElt = -1;
7263 APInt LoadMask = APInt::getZero(NumElems);
7264 APInt ZeroMask = APInt::getZero(NumElems);
7265 APInt UndefMask = APInt::getZero(NumElems);
7266
7267 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7268 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7269
7270 // For each element in the initializer, see if we've found a load, zero or an
7271 // undef.
7272 for (unsigned i = 0; i < NumElems; ++i) {
7273 SDValue Elt = peekThroughBitcasts(Elts[i]);
7274 if (!Elt.getNode())
7275 return SDValue();
7276 if (Elt.isUndef()) {
7277 UndefMask.setBit(i);
7278 continue;
7279 }
7280 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7281 ZeroMask.setBit(i);
7282 continue;
7283 }
7284
7285 // Each loaded element must be the correct fractional portion of the
7286 // requested vector load.
7287 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7288 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7289 return SDValue();
7290
7291 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7292 return SDValue();
7293 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7294 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7295 return SDValue();
7296
7297 LoadMask.setBit(i);
7298 LastLoadedElt = i;
7299 }
7300 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7301 NumElems &&
7302 "Incomplete element masks");
7303
7304 // Handle Special Cases - all undef or undef/zero.
7305 if (UndefMask.popcount() == NumElems)
7306 return DAG.getUNDEF(VT);
7307 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7308 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7309 : DAG.getConstantFP(0.0, DL, VT);
7310
7311 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7312 int FirstLoadedElt = LoadMask.countr_zero();
7313 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7314 EVT EltBaseVT = EltBase.getValueType();
7315 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7316 "Register/Memory size mismatch");
7317 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7318 assert(LDBase && "Did not find base load for merging consecutive loads");
7319 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7320 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7321 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7322 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7323 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7324
7325 // TODO: Support offsetting the base load.
7326 if (ByteOffsets[FirstLoadedElt] != 0)
7327 return SDValue();
7328
7329 // Check to see if the element's load is consecutive to the base load
7330 // or offset from a previous (already checked) load.
7331 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7332 LoadSDNode *Ld = Loads[EltIdx];
7333 int64_t ByteOffset = ByteOffsets[EltIdx];
7334 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7335 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7336 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7337 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7338 }
7339 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7340 EltIdx - FirstLoadedElt);
7341 };
7342
7343 // Consecutive loads can contain UNDEFs but not ZERO elements.
7344 // Consecutive loads with UNDEF and ZERO elements require an
7345 // additional shuffle stage to clear the ZERO elements.
7346 bool IsConsecutiveLoad = true;
7347 bool IsConsecutiveLoadWithZeros = true;
7348 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7349 if (LoadMask[i]) {
7350 if (!CheckConsecutiveLoad(LDBase, i)) {
7351 IsConsecutiveLoad = false;
7352 IsConsecutiveLoadWithZeros = false;
7353 break;
7354 }
7355 } else if (ZeroMask[i]) {
7356 IsConsecutiveLoad = false;
7357 }
7358 }
7359
7360 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7361 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7362 assert(LDBase->isSimple() &&
7363 "Cannot merge volatile or atomic loads.");
7364 SDValue NewLd =
7365 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7366 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7367 for (auto *LD : Loads)
7368 if (LD)
7369 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7370 return NewLd;
7371 };
7372
7373 // Check if the base load is entirely dereferenceable.
7374 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7375 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7376
7377 // LOAD - all consecutive load/undefs (must start/end with a load or be
7378 // entirely dereferenceable). If we have found an entire vector of loads and
7379 // undefs, then return a large load of the entire vector width starting at the
7380 // base pointer. If the vector contains zeros, then attempt to shuffle those
7381 // elements.
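// Illustrative example (p is an arbitrary base pointer): for a v4f32
// build_vector <(load p), (load p+4), (load p+8), undef> where p is known
// to be dereferenceable for the full 16 bytes, this path returns one
// v4f32 load from p.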
7382 if (FirstLoadedElt == 0 &&
7383 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7384 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7385 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7386 return SDValue();
7387
7388 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7389 // will lower to regular temporal loads and use the cache.
7390 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7391 VT.is256BitVector() && !Subtarget.hasInt256())
7392 return SDValue();
7393
7394 if (NumElems == 1)
7395 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7396
7397 if (!ZeroMask)
7398 return CreateLoad(VT, LDBase);
7399
7400 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7401 // vector and a zero vector to clear out the zero elements.
7402 if (!IsAfterLegalize && VT.isVector()) {
7403 unsigned NumMaskElts = VT.getVectorNumElements();
7404 if ((NumMaskElts % NumElems) == 0) {
7405 unsigned Scale = NumMaskElts / NumElems;
7406 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7407 for (unsigned i = 0; i < NumElems; ++i) {
7408 if (UndefMask[i])
7409 continue;
7410 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7411 for (unsigned j = 0; j != Scale; ++j)
7412 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7413 }
7414 SDValue V = CreateLoad(VT, LDBase);
7415 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7416 : DAG.getConstantFP(0.0, DL, VT);
7417 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7418 }
7419 }
7420 }
7421
7422 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7423 if (VT.is256BitVector() || VT.is512BitVector()) {
7424 unsigned HalfNumElems = NumElems / 2;
7425 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7426 EVT HalfVT =
7427 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7428 SDValue HalfLD =
7429 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7430 DAG, Subtarget, IsAfterLegalize);
7431 if (HalfLD)
7432 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7433 HalfLD, DAG.getVectorIdxConstant(0, DL));
7434 }
7435 }
7436
7437 // VZEXT_LOAD - consecutive 16 (FP16 only)/32/64-bit load/undefs followed by zeros/undefs.
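// Illustrative example (p is an arbitrary base pointer): when the wide-load
// path above does not apply, a v4i32 build_vector <(load p), (load p+4), 0, 0>
// has LoadSizeInBits == 64, so it becomes a v2i64 X86ISD::VZEXT_LOAD of the
// 8 bytes at p (upper element implicitly zeroed), bitcast back to v4i32.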
7438 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7439 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7440 LoadSizeInBits == 64) &&
7441 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7442 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7443 : MVT::getIntegerVT(LoadSizeInBits);
7444 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7445 // Allow v4f32 on SSE1 only targets.
7446 // FIXME: Add more isel patterns so we can just use VT directly.
7447 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7448 VecVT = MVT::v4f32;
7449 if (TLI.isTypeLegal(VecVT)) {
7450 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7451 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7452 SDValue ResNode = DAG.getMemIntrinsicNode(
7453 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7454 LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7455 for (auto *LD : Loads)
7456 if (LD)
7457 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7458 return DAG.getBitcast(VT, ResNode);
7459 }
7460 }
7461
7462 // BROADCAST - match the smallest possible repetition pattern, load that
7463 // scalar/subvector element and then broadcast to the entire vector.
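// Illustrative example (p is an arbitrary base pointer): on AVX targets, a
// v8i32 build_vector formed by repeating the same two loads
// <(load p), (load p+4)> matches with SubElems == 2 and
// RepeatSize == ScalarSize == 64, so the 8 bytes at p are loaded once and
// broadcast via X86ISD::VBROADCAST as v4i64, then bitcast back to v8i32.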
7464 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7465 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7466 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7467 unsigned RepeatSize = SubElems * BaseSizeInBits;
7468 unsigned ScalarSize = std::min(RepeatSize, 64u);
7469 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7470 continue;
7471
7472 // Don't attempt a 1:N subvector broadcast - it should be caught by
7473 // combineConcatVectorOps, else it will cause infinite loops.
7474 if (RepeatSize > ScalarSize && SubElems == 1)
7475 continue;
7476
7477 bool Match = true;
7478 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7479 for (unsigned i = 0; i != NumElems && Match; ++i) {
7480 if (!LoadMask[i])
7481 continue;
7482 SDValue Elt = peekThroughBitcasts(Elts[i]);
7483 if (RepeatedLoads[i % SubElems].isUndef())
7484 RepeatedLoads[i % SubElems] = Elt;
7485 else
7486 Match &= (RepeatedLoads[i % SubElems] == Elt);
7487 }
7488
7489 // We must have loads at both ends of the repetition.
7490 Match &= !RepeatedLoads.front().isUndef();
7491 Match &= !RepeatedLoads.back().isUndef();
7492 if (!Match)
7493 continue;
7494
7495 EVT RepeatVT =
7496 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7497 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7498 : EVT::getFloatingPointVT(ScalarSize);
7499 if (RepeatSize > ScalarSize)
7500 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7501 RepeatSize / ScalarSize);
7502 EVT BroadcastVT =
7503 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7504 VT.getSizeInBits() / ScalarSize);
7505 if (TLI.isTypeLegal(BroadcastVT)) {
7506 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7507 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7508 SDValue Broadcast = RepeatLoad;
7509 if (RepeatSize > ScalarSize) {
7510 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7511 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7512 } else {
7513 if (!Subtarget.hasAVX2() &&
7514 !X86::mayFoldLoadIntoBroadcastFromMem(
7515 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7516 Subtarget,
7517 /*AssumeSingleUse=*/true))
7518 return SDValue();
7519 Broadcast =
7520 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7521 }
7522 return DAG.getBitcast(VT, Broadcast);
7523 }
7524 }
7525 }
7526 }
7527
7528 return SDValue();
7529}
7530
7531// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7532// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7533// are consecutive, non-overlapping, and in the right order.
7534 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7535 SelectionDAG &DAG,
7536 const X86Subtarget &Subtarget,
7537 bool IsAfterLegalize) {
7538 SmallVector<SDValue, 64> Elts;
7539 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7540 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7541 Elts.push_back(Elt);
7542 continue;
7543 }
7544 return SDValue();
7545 }
7546 assert(Elts.size() == VT.getVectorNumElements());
7547 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7548 IsAfterLegalize);
7549}
7550
7551 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7552 const APInt &Undefs, LLVMContext &C) {
7553 unsigned ScalarSize = VT.getScalarSizeInBits();
7554 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7555
7556 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7557 if (VT.isFloatingPoint()) {
7558 if (ScalarSize == 16)
7559 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7560 if (ScalarSize == 32)
7561 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7562 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7563 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7564 }
7565 return Constant::getIntegerValue(Ty, Val);
7566 };
7567
7568 SmallVector<Constant *, 32> ConstantVec;
7569 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7570 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7571 : getConstantScalar(Bits[I]));
7572
7573 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7574}
7575
7576static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7577 unsigned SplatBitSize, LLVMContext &C) {
7578 unsigned ScalarSize = VT.getScalarSizeInBits();
7579
7580 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7581 if (VT.isFloatingPoint()) {
7582 if (ScalarSize == 16)
7583 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7584 if (ScalarSize == 32)
7585 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7586 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7587 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7588 }
7589 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7590 };
7591
7592 if (ScalarSize == SplatBitSize)
7593 return getConstantScalar(SplatValue);
7594
7595 unsigned NumElm = SplatBitSize / ScalarSize;
7596 SmallVector<Constant *, 32> ConstantVec;
7597 for (unsigned I = 0; I != NumElm; ++I) {
7598 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7599 ConstantVec.push_back(getConstantScalar(Val));
7600 }
7601 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7602}
7603
7604 static bool isFoldableUseOfShuffle(SDNode *N) {
7605 for (auto *U : N->users()) {
7606 unsigned Opc = U->getOpcode();
7607 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7608 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7609 return false;
7610 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7611 return false;
7612 if (isTargetShuffle(Opc))
7613 return true;
7614 if (Opc == ISD::BITCAST) // Ignore bitcasts
7615 return isFoldableUseOfShuffle(U);
7616 if (N->hasOneUse()) {
7617 // TODO, there may be some general way to know if a SDNode can
7618 // be folded. We now only know whether an MI is foldable.
7619 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7620 return false;
7621 return true;
7622 }
7623 }
7624 return false;
7625}
7626
7627// If the node has a single use by a VSELECT then AVX512 targets may be able to
7628// fold as a predicated instruction.
7629static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7630 unsigned SizeInBits = V.getValueSizeInBits();
7631 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7632 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7633 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7634 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7635 return true;
7636 }
7637 }
7638 return false;
7639}
7640
7641/// Attempt to use the vbroadcast instruction to generate a splat value
7642/// from a splat BUILD_VECTOR which uses:
7643/// a. A single scalar load, or a constant.
7644/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7645///
7646/// The VBROADCAST node is returned when a pattern is found,
7647/// or SDValue() otherwise.
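/// Illustrative example: a v8f32 build_vector that splats one loaded float
/// becomes an X86ISD::VBROADCAST_LOAD of that scalar, and a splat of a
/// repeated constant pattern such as <0,1,0,1,...> can instead be broadcast
/// from a wider constant-pool entry.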
7648 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7649 const SDLoc &dl,
7650 const X86Subtarget &Subtarget,
7651 SelectionDAG &DAG) {
7652 // VBROADCAST requires AVX.
7653 // TODO: Splats could be generated for non-AVX CPUs using SSE
7654 // instructions, but there's less potential gain for only 128-bit vectors.
7655 if (!Subtarget.hasAVX())
7656 return SDValue();
7657
7658 MVT VT = BVOp->getSimpleValueType(0);
7659 unsigned NumElts = VT.getVectorNumElements();
7660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7661 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7662 "Unsupported vector type for broadcast.");
7663
7664 // See if the build vector is a repeating sequence of scalars (inc. splat).
7665 SDValue Ld;
7666 BitVector UndefElements;
7667 SmallVector<SDValue, 16> Sequence;
7668 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7669 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7670 if (Sequence.size() == 1)
7671 Ld = Sequence[0];
7672 }
7673
7674 // Attempt to use VBROADCASTM
7675 // From this pattern:
7676 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7677 // b. t1 = (build_vector t0 t0)
7678 //
7679 // Create (VBROADCASTM v2i1 X)
7680 if (!Sequence.empty() && Subtarget.hasCDI()) {
7681 // If not a splat, are the upper sequence values zeroable?
7682 unsigned SeqLen = Sequence.size();
7683 bool UpperZeroOrUndef =
7684 SeqLen == 1 ||
7685 llvm::all_of(ArrayRef(Sequence).drop_front(),
7686 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7687 SDValue Op0 = Sequence[0];
7688 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7689 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7690 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7691 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7692 ? Op0.getOperand(0)
7693 : Op0.getOperand(0).getOperand(0);
7694 MVT MaskVT = BOperand.getSimpleValueType();
7695 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7696 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7697 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7698 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7699 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7700 unsigned Scale = 512 / VT.getSizeInBits();
7701 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7702 }
7703 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7704 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7705 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7706 return DAG.getBitcast(VT, Bcst);
7707 }
7708 }
7709 }
7710
7711 unsigned NumUndefElts = UndefElements.count();
7712 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7713 APInt SplatValue, Undef;
7714 unsigned SplatBitSize;
7715 bool HasUndef;
7716 // Check if this is a repeated constant pattern suitable for broadcasting.
7717 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7718 SplatBitSize > VT.getScalarSizeInBits() &&
7719 SplatBitSize < VT.getSizeInBits()) {
7720 // Avoid replacing with broadcast when it's a use of a shuffle
7721 // instruction to preserve the present custom lowering of shuffles.
7722 if (isFoldableUseOfShuffle(BVOp))
7723 return SDValue();
7724 // replace BUILD_VECTOR with broadcast of the repeated constants.
7725 LLVMContext *Ctx = DAG.getContext();
7726 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7727 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7728 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7729 // Load the constant scalar/subvector and broadcast it.
7730 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7731 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7732 SDValue CP = DAG.getConstantPool(C, PVT);
7733 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7734
7735 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7736 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7737 SDValue Ops[] = {DAG.getEntryNode(), CP};
7738 MachinePointerInfo MPI =
7739 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7740 SDValue Brdcst =
7741 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7742 MPI, Alignment, MachineMemOperand::MOLoad);
7743 return DAG.getBitcast(VT, Brdcst);
7744 }
7745 if (SplatBitSize > 64) {
7746 // Load the vector of constants and broadcast it.
7747 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7748 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7749 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7750 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7751 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7752 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7753 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7754 MachinePointerInfo MPI =
7755 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7756 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7757 Ops, VVT, MPI, Alignment,
7758 MachineMemOperand::MOLoad);
7759 }
7760 }
7761
7762 // If we are moving a scalar into a vector (Ld must be set and all elements
7763 // but 1 are undef) and that operation is not obviously supported by
7764 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7765 // That's better than general shuffling and may eliminate a load to GPR and
7766 // move from scalar to vector register.
7767 if (!Ld || NumElts - NumUndefElts != 1)
7768 return SDValue();
7769 unsigned ScalarSize = Ld.getValueSizeInBits();
7770 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7771 return SDValue();
7772 }
7773
7774 bool ConstSplatVal =
7775 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7776 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7777
7778 // TODO: Handle broadcasts of non-constant sequences.
7779
7780 // Make sure that all of the users of a non-constant load are from the
7781 // BUILD_VECTOR node.
7782 // FIXME: Is the use count needed for non-constant, non-load case?
7783 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7784 return SDValue();
7785
7786 unsigned ScalarSize = Ld.getValueSizeInBits();
7787 bool IsGE256 = (VT.getSizeInBits() >= 256);
7788
7789 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7790 // instruction to save 8 or more bytes of constant pool data.
7791 // TODO: If multiple splats are generated to load the same constant,
7792 // it may be detrimental to overall size. There needs to be a way to detect
7793 // that condition to know if this is truly a size win.
7794 bool OptForSize = DAG.shouldOptForSize();
7795
7796 // Handle broadcasting a single constant scalar from the constant pool
7797 // into a vector.
7798 // On Sandybridge (no AVX2), it is still better to load a constant vector
7799 // from the constant pool and not to broadcast it from a scalar.
7800 // But override that restriction when optimizing for size.
7801 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7802 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7803 EVT CVT = Ld.getValueType();
7804 assert(!CVT.isVector() && "Must not broadcast a vector type");
7805
7806 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7807 // For size optimization, also splat v2f64 and v2i64, and for size opt
7808 // with AVX2, also splat i8 and i16.
7809 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7810 if (ScalarSize == 32 ||
7811 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7812 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7813 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7814 const Constant *C = nullptr;
7815 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7816 C = CI->getConstantIntValue();
7817 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7818 C = CF->getConstantFPValue();
7819
7820 assert(C && "Invalid constant type");
7821
7822 SDValue CP =
7823 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7824 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7825
7826 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7827 SDValue Ops[] = {DAG.getEntryNode(), CP};
7828 MachinePointerInfo MPI =
7829 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7830 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7831 MPI, Alignment, MachineMemOperand::MOLoad);
7832 }
7833 }
7834
7835 // Handle AVX2 in-register broadcasts.
7836 if (!IsLoad && Subtarget.hasInt256() &&
7837 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7838 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7839
7840 // The scalar source must be a normal load.
7841 if (!IsLoad)
7842 return SDValue();
7843
7844 // Make sure the non-chain result is only used by this build vector.
7845 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7846 return SDValue();
7847
7848 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7849 (Subtarget.hasVLX() && ScalarSize == 64)) {
7850 auto *LN = cast<LoadSDNode>(Ld);
7851 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7852 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7853 SDValue BCast =
7854 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7855 LN->getMemoryVT(), LN->getMemOperand());
7856 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7857 return BCast;
7858 }
7859
7860 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7861 // match double, since there is no vbroadcastsd xmm instruction.
7862 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7863 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7864 auto *LN = cast<LoadSDNode>(Ld);
7865 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7866 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7867 SDValue BCast =
7868 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7869 LN->getMemoryVT(), LN->getMemOperand());
7870 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7871 return BCast;
7872 }
7873
7874 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7875 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7876
7877 // Unsupported broadcast.
7878 return SDValue();
7879}
7880
7881/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7882/// underlying vector and index.
7883///
7884/// Modifies \p ExtractedFromVec to the real vector and returns the real
7885/// index.
7886static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7887 SDValue ExtIdx) {
7888 int Idx = ExtIdx->getAsZExtVal();
7889 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7890 return Idx;
7891
7892 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7893 // lowered this:
7894 // (extract_vector_elt (v8f32 %1), Constant<6>)
7895 // to:
7896 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7897 // (extract_subvector (v8f32 %0), Constant<4>),
7898 // undef)
7899 // Constant<0>)
7900 // In this case the vector is the extract_subvector expression and the index
7901 // is 2, as specified by the shuffle.
7902 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7903 SDValue ShuffleVec = SVOp->getOperand(0);
7904 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7905 assert(ShuffleVecVT.getVectorElementType() ==
7906 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7907
7908 int ShuffleIdx = SVOp->getMaskElt(Idx);
7909 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7910 ExtractedFromVec = ShuffleVec;
7911 return ShuffleIdx;
7912 }
7913 return Idx;
7914}
7915
7916 static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7917 SelectionDAG &DAG) {
7918 MVT VT = Op.getSimpleValueType();
7919
7920 // Skip if insert_vec_elt is not supported.
7921 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7922 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7923 return SDValue();
7924
7925 unsigned NumElems = Op.getNumOperands();
7926 SDValue VecIn1;
7927 SDValue VecIn2;
7928 SmallVector<unsigned, 4> InsertIndices;
7929 SmallVector<int, 8> Mask(NumElems, -1);
7930
7931 for (unsigned i = 0; i != NumElems; ++i) {
7932 unsigned Opc = Op.getOperand(i).getOpcode();
7933
7934 if (Opc == ISD::UNDEF)
7935 continue;
7936
7937 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7938 // Quit if more than 1 element needs inserting.
7939 if (InsertIndices.size() > 1)
7940 return SDValue();
7941
7942 InsertIndices.push_back(i);
7943 continue;
7944 }
7945
7946 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7947 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7948
7949 // Quit if non-constant index.
7950 if (!isa<ConstantSDNode>(ExtIdx))
7951 return SDValue();
7952 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7953
7954 // Quit if extracted from vector of different type.
7955 if (ExtractedFromVec.getValueType() != VT)
7956 return SDValue();
7957
7958 if (!VecIn1.getNode())
7959 VecIn1 = ExtractedFromVec;
7960 else if (VecIn1 != ExtractedFromVec) {
7961 if (!VecIn2.getNode())
7962 VecIn2 = ExtractedFromVec;
7963 else if (VecIn2 != ExtractedFromVec)
7964 // Quit if more than 2 vectors to shuffle
7965 return SDValue();
7966 }
7967
7968 if (ExtractedFromVec == VecIn1)
7969 Mask[i] = Idx;
7970 else if (ExtractedFromVec == VecIn2)
7971 Mask[i] = Idx + NumElems;
7972 }
7973
7974 if (!VecIn1.getNode())
7975 return SDValue();
7976
7977 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7978 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7979
7980 for (unsigned Idx : InsertIndices)
7981 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7982 DAG.getVectorIdxConstant(Idx, DL));
7983
7984 return NV;
7985}
7986
7987// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7988 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7989 const X86Subtarget &Subtarget) {
7990 MVT VT = Op.getSimpleValueType();
7991 MVT IVT =
7992 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7993 SmallVector<SDValue, 16> NewOps;
7994 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7995 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7996 Op.getOperand(I)));
7997 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7998 return DAG.getBitcast(VT, Res);
7999}
8000
8001// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8002 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8003 SelectionDAG &DAG,
8004 const X86Subtarget &Subtarget) {
8005
8006 MVT VT = Op.getSimpleValueType();
8007 assert((VT.getVectorElementType() == MVT::i1) &&
8008 "Unexpected type in LowerBUILD_VECTORvXi1!");
8009 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8010 ISD::isBuildVectorAllOnes(Op.getNode()))
8011 return Op;
8012
8013 uint64_t Immediate = 0;
8014 SmallVector<unsigned, 16> NonConstIdx;
8015 bool IsSplat = true;
8016 bool HasConstElts = false;
8017 int SplatIdx = -1;
8018 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8019 SDValue In = Op.getOperand(idx);
8020 if (In.isUndef())
8021 continue;
8022 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8023 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8024 HasConstElts = true;
8025 } else {
8026 NonConstIdx.push_back(idx);
8027 }
8028 if (SplatIdx < 0)
8029 SplatIdx = idx;
8030 else if (In != Op.getOperand(SplatIdx))
8031 IsSplat = false;
8032 }
8033
8034 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
8035 if (IsSplat) {
8036 // The build_vector allows the scalar element to be larger than the vector
8037 // element type. We need to mask it to use as a condition unless we know
8038 // the upper bits are zero.
8039 // FIXME: Use computeKnownBits instead of checking specific opcode?
8040 SDValue Cond = Op.getOperand(SplatIdx);
8041 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8042 if (Cond.getOpcode() != ISD::SETCC)
8043 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8044 DAG.getConstant(1, dl, MVT::i8));
8045
8046 // Perform the select in the scalar domain so we can use cmov.
8047 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8048 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8049 DAG.getAllOnesConstant(dl, MVT::i32),
8050 DAG.getConstant(0, dl, MVT::i32));
8051 Select = DAG.getBitcast(MVT::v32i1, Select);
8052 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8053 } else {
8054 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8055 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8056 DAG.getAllOnesConstant(dl, ImmVT),
8057 DAG.getConstant(0, dl, ImmVT));
8058 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8059 Select = DAG.getBitcast(VecVT, Select);
8060 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8061 DAG.getVectorIdxConstant(0, dl));
8062 }
8063 }
8064
8065 // insert elements one by one
8066 SDValue DstVec;
8067 if (HasConstElts) {
8068 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8069 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8070 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8071 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8072 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8073 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8074 } else {
8075 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8076 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8077 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8078 DstVec = DAG.getBitcast(VecVT, Imm);
8079 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8080 DAG.getVectorIdxConstant(0, dl));
8081 }
8082 } else
8083 DstVec = DAG.getUNDEF(VT);
8084
8085 for (unsigned InsertIdx : NonConstIdx) {
8086 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8087 Op.getOperand(InsertIdx),
8088 DAG.getVectorIdxConstant(InsertIdx, dl));
8089 }
8090 return DstVec;
8091}
8092
8093LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8094 switch (Opcode) {
8095 case X86ISD::PACKSS:
8096 case X86ISD::PACKUS:
8097 case X86ISD::FHADD:
8098 case X86ISD::FHSUB:
8099 case X86ISD::HADD:
8100 case X86ISD::HSUB:
8101 return true;
8102 }
8103 return false;
8104}
8105
8106/// This is a helper function of LowerToHorizontalOp().
8107/// This function checks whether the input build_vector \p N implements a
8108/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8109/// may not match the layout of an x86 256-bit horizontal instruction.
8110/// In other words, if this returns true, then some extraction/insertion will
8111/// be required to produce a valid horizontal instruction.
8112///
8113/// Parameter \p Opcode defines the kind of horizontal operation to match.
8114/// For example, if \p Opcode is equal to ISD::ADD, then this function
8115/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8116/// is equal to ISD::SUB, then this function checks if this is a horizontal
8117/// arithmetic sub.
8118///
8119/// This function only analyzes elements of \p N whose indices are
8120/// in range [BaseIdx, LastIdx).
8121///
8122/// TODO: This function was originally used to match both real and fake partial
8123/// horizontal operations, but the index-matching logic is incorrect for that.
8124/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8125/// code because it is only used for partial h-op matching now?
8126static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8127 const SDLoc &DL, SelectionDAG &DAG,
8128 unsigned BaseIdx, unsigned LastIdx,
8129 SDValue &V0, SDValue &V1) {
8130 EVT VT = N->getValueType(0);
8131 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8132 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8133 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8134 "Invalid Vector in input!");
8135
8136 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8137 bool CanFold = true;
8138 unsigned ExpectedVExtractIdx = BaseIdx;
8139 unsigned NumElts = LastIdx - BaseIdx;
8140 V0 = DAG.getUNDEF(VT);
8141 V1 = DAG.getUNDEF(VT);
8142
8143 // Check if N implements a horizontal binop.
8144 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8145 SDValue Op = N->getOperand(i + BaseIdx);
8146
8147 // Skip UNDEFs.
8148 if (Op->isUndef()) {
8149 // Update the expected vector extract index.
8150 if (i * 2 == NumElts)
8151 ExpectedVExtractIdx = BaseIdx;
8152 ExpectedVExtractIdx += 2;
8153 continue;
8154 }
8155
8156 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8157
8158 if (!CanFold)
8159 break;
8160
8161 SDValue Op0 = Op.getOperand(0);
8162 SDValue Op1 = Op.getOperand(1);
8163
8164 // Try to match the following pattern:
8165 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8166 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8167 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8168 Op0.getOperand(0) == Op1.getOperand(0) &&
8169 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8170 isa<ConstantSDNode>(Op1.getOperand(1)));
8171 if (!CanFold)
8172 break;
8173
8174 unsigned I0 = Op0.getConstantOperandVal(1);
8175 unsigned I1 = Op1.getConstantOperandVal(1);
8176
8177 if (i * 2 < NumElts) {
8178 if (V0.isUndef()) {
8179 V0 = Op0.getOperand(0);
8180 if (V0.getValueType() != VT)
8181 return false;
8182 }
8183 } else {
8184 if (V1.isUndef()) {
8185 V1 = Op0.getOperand(0);
8186 if (V1.getValueType() != VT)
8187 return false;
8188 }
8189 if (i * 2 == NumElts)
8190 ExpectedVExtractIdx = BaseIdx;
8191 }
8192
8193 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8194 if (I0 == ExpectedVExtractIdx)
8195 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8196 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8197 // Try to match the following dag sequence:
8198 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8199 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8200 } else
8201 CanFold = false;
8202
8203 ExpectedVExtractIdx += 2;
8204 }
8205
8206 return CanFold;
8207}
8208
8209/// Emit a sequence of two 128-bit horizontal add/sub followed by
8210/// a concat_vector.
8211///
8212/// This is a helper function of LowerToHorizontalOp().
8213/// This function expects two 256-bit vectors called V0 and V1.
8214/// At first, each vector is split into two separate 128-bit vectors.
8215/// Then, the resulting 128-bit vectors are used to implement two
8216/// horizontal binary operations.
8217///
8218/// The kind of horizontal binary operation is defined by \p X86Opcode.
8219///
8220/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8221/// the two new horizontal binop.
8222/// When Mode is set, the first horizontal binop dag node would take as input
8223/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8224/// horizontal binop dag node would take as input the lower 128-bit of V1
8225/// and the upper 128-bit of V1.
8226/// Example:
8227/// HADD V0_LO, V0_HI
8228/// HADD V1_LO, V1_HI
8229///
8230/// Otherwise, the first horizontal binop dag node takes as input the lower
8231/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8232/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8233/// Example:
8234/// HADD V0_LO, V1_LO
8235/// HADD V0_HI, V1_HI
8236///
8237/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8238/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8239/// the upper 128-bits of the result.
8240static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8241 const SDLoc &DL, SelectionDAG &DAG,
8242 unsigned X86Opcode, bool Mode,
8243 bool isUndefLO, bool isUndefHI) {
8244 MVT VT = V0.getSimpleValueType();
8245 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8246 "Invalid nodes in input!");
8247
8248 unsigned NumElts = VT.getVectorNumElements();
8249 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8250 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8251 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8252 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8253 MVT NewVT = V0_LO.getSimpleValueType();
8254
8255 SDValue LO = DAG.getUNDEF(NewVT);
8256 SDValue HI = DAG.getUNDEF(NewVT);
8257
8258 if (Mode) {
8259 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8260 if (!isUndefLO && !V0->isUndef())
8261 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8262 if (!isUndefHI && !V1->isUndef())
8263 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8264 } else {
8265 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8266 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8267 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8268
8269 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8270 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8271 }
8272
8273 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8274}
8275
8276/// Returns true iff \p BV builds a vector with the result equivalent to
8277/// the result of ADDSUB/SUBADD operation.
8278/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8279/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8280/// \p Opnd0 and \p Opnd1.
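/// Illustrative example: the v4f32 build_vector
///   <(fsub a0, b0), (fadd a1, b1), (fsub a2, b2), (fadd a3, b3)>
/// where element i is built from lane i of two vectors A and B, matches
/// with Opnd0 = A, Opnd1 = B and IsSubAdd = false (an ADDSUB pattern).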
8281 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8282 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8283 SDValue &Opnd0, SDValue &Opnd1,
8284 unsigned &NumExtracts, bool &IsSubAdd,
8285 bool &HasAllowContract) {
8286 using namespace SDPatternMatch;
8287
8288 MVT VT = BV->getSimpleValueType(0);
8289 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8290 return false;
8291
8292 unsigned NumElts = VT.getVectorNumElements();
8293 SDValue InVec0 = DAG.getUNDEF(VT);
8294 SDValue InVec1 = DAG.getUNDEF(VT);
8295
8296 NumExtracts = 0;
8297 HasAllowContract = NumElts != 0;
8298
8299 // Odd-numbered elements in the input build vector are obtained from
8300 // adding/subtracting two integer/float elements.
8301 // Even-numbered elements in the input build vector are obtained from
8302 // subtracting/adding two integer/float elements.
8303 unsigned Opc[2] = {0, 0};
8304 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8305 SDValue Op = BV->getOperand(i);
8306
8307 // Skip 'undef' values.
8308 unsigned Opcode = Op.getOpcode();
8309 if (Opcode == ISD::UNDEF)
8310 continue;
8311
8312 // Early exit if we found an unexpected opcode.
8313 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8314 return false;
8315
8316 SDValue Op0 = Op.getOperand(0);
8317 SDValue Op1 = Op.getOperand(1);
8318
8319 // Try to match the following pattern:
8320 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8321 // Early exit if we cannot match that sequence.
8322 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8323 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8324 return false;
8325
8326 // We found a valid add/sub node; make sure it's the same opcode as the
8327 // previous elements for this parity.
8328 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8329 return false;
8330 Opc[i % 2] = Opcode;
8331
8332 // Update InVec0 and InVec1.
8333 if (InVec0.isUndef())
8334 InVec0 = Op0.getOperand(0);
8335 if (InVec1.isUndef())
8336 InVec1 = Op1.getOperand(0);
8337
8338 // Make sure that the input operands of each add/sub node always
8339 // come from the same pair of vectors.
8340 if (InVec0 != Op0.getOperand(0)) {
8341 if (Opcode == ISD::FSUB)
8342 return false;
8343
8344 // FADD is commutable. Try to commute the operands
8345 // and then test again.
8346 std::swap(Op0, Op1);
8347 if (InVec0 != Op0.getOperand(0))
8348 return false;
8349 }
8350
8351 if (InVec1 != Op1.getOperand(0))
8352 return false;
8353
8354 // Increment the number of extractions done.
8355 ++NumExtracts;
8356 HasAllowContract &= Op->getFlags().hasAllowContract();
8357 }
8358
8359 // Ensure we have found an opcode for both parities and that they are
8360 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8361 // inputs are undef.
8362 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8363 InVec0.isUndef() || InVec1.isUndef())
8364 return false;
8365
8366 IsSubAdd = Opc[0] == ISD::FADD;
8367
8368 Opnd0 = InVec0;
8369 Opnd1 = InVec1;
8370 return true;
8371}
8372
8373/// Returns true if it is possible to fold MUL and an idiom that has already been
8374/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8375/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8376/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8377///
8378/// Prior to calling this function it should be known that there is some
8379/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8380/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8381/// before replacement of such SDNode with ADDSUB operation. Thus the number
8382/// of \p Opnd0 uses is expected to be equal to 2.
8383/// For example, this function may be called for the following IR:
8384/// %AB = fmul fast <2 x double> %A, %B
8385/// %Sub = fsub fast <2 x double> %AB, %C
8386/// %Add = fadd fast <2 x double> %AB, %C
8387/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8388/// <2 x i32> <i32 0, i32 3>
8389/// There is a def for %Addsub here, which potentially can be replaced by
8390/// X86ISD::ADDSUB operation:
8391/// %Addsub = X86ISD::ADDSUB %AB, %C
8392/// and such ADDSUB can further be replaced with FMADDSUB:
8393/// %Addsub = FMADDSUB %A, %B, %C.
8394///
8395/// The main reason why this method is called before the replacement of the
8396/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8397/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8398/// FMADDSUB is.
8399static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8400 SelectionDAG &DAG, SDValue &Opnd0,
8401 SDValue &Opnd1, SDValue &Opnd2,
8402 unsigned ExpectedUses,
8403 bool AllowSubAddOrAddSubContract) {
8404 if (Opnd0.getOpcode() != ISD::FMUL ||
8405 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8406 return false;
8407
8408 // FIXME: These checks must match the similar ones in
8409 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8410 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8411 // or MUL + ADDSUB to FMADDSUB.
8412 const TargetOptions &Options = DAG.getTarget().Options;
8413 bool AllowFusion =
8414 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8415 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8416 if (!AllowFusion)
8417 return false;
8418
8419 Opnd2 = Opnd1;
8420 Opnd1 = Opnd0.getOperand(1);
8421 Opnd0 = Opnd0.getOperand(0);
8422
8423 return true;
8424}
8425
8426/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8427/// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8428/// X86ISD::FMSUBADD node accordingly.
8429 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8430 const SDLoc &DL,
8431 const X86Subtarget &Subtarget,
8432 SelectionDAG &DAG) {
8433 SDValue Opnd0, Opnd1;
8434 unsigned NumExtracts;
8435 bool IsSubAdd;
8436 bool HasAllowContract;
8437 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8438 HasAllowContract))
8439 return SDValue();
8440
8441 MVT VT = BV->getSimpleValueType(0);
8442
8443 // Try to generate X86ISD::FMADDSUB node here.
8444 SDValue Opnd2;
8445 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8446 HasAllowContract)) {
8447 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8448 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8449 }
8450
8451 // We only support ADDSUB.
8452 if (IsSubAdd)
8453 return SDValue();
8454
8455 // There are no known X86 targets with 512-bit ADDSUB instructions!
8456 // Convert to blend(fsub,fadd).
8457 if (VT.is512BitVector()) {
8458 SmallVector<int> Mask;
8459 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8460 Mask.push_back(I);
8461 Mask.push_back(I + E + 1);
8462 }
8463 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8464 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8465 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8466 }
8467
8468 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8469}
8470
8471 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8472 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8473 // Initialize outputs to known values.
8474 MVT VT = BV->getSimpleValueType(0);
8475 HOpcode = ISD::DELETED_NODE;
8476 V0 = DAG.getUNDEF(VT);
8477 V1 = DAG.getUNDEF(VT);
8478
8479 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8480 // half of the result is calculated independently from the 128-bit halves of
8481 // the inputs, so that makes the index-checking logic below more complicated.
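// Illustrative example: a 256-bit v8i32 HADD of A and B produces
//   <A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7>
// i.e. each 128-bit half of the result only combines the corresponding
// 128-bit halves of the two inputs.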
8482 unsigned NumElts = VT.getVectorNumElements();
8483 unsigned GenericOpcode = ISD::DELETED_NODE;
8484 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8485 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8486 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8487 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8488 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8489 // Ignore undef elements.
8490 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8491 if (Op.isUndef())
8492 continue;
8493
8494 // If there's an opcode mismatch, we're done.
8495 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8496 return false;
8497
8498 // Initialize horizontal opcode.
8499 if (HOpcode == ISD::DELETED_NODE) {
8500 GenericOpcode = Op.getOpcode();
8501 switch (GenericOpcode) {
8502 // clang-format off
8503 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8504 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8505 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8506 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8507 default: return false;
8508 // clang-format on
8509 }
8510 }
8511
8512 SDValue Op0 = Op.getOperand(0);
8513 SDValue Op1 = Op.getOperand(1);
8514 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8515 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8516 Op0.getOperand(0) != Op1.getOperand(0) ||
8517 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8518 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8519 return false;
8520
8521 // The source vector is chosen based on which 64-bit half of the
8522 // destination vector is being calculated.
8523 if (j < NumEltsIn64Bits) {
8524 if (V0.isUndef())
8525 V0 = Op0.getOperand(0);
8526 } else {
8527 if (V1.isUndef())
8528 V1 = Op0.getOperand(0);
8529 }
8530
8531 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8532 if (SourceVec != Op0.getOperand(0))
8533 return false;
8534
8535 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8536 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8537 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8538 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8539 (j % NumEltsIn64Bits) * 2;
8540 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8541 continue;
8542
8543 // If this is not a commutative op, this does not match.
8544 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8545 return false;
8546
8547 // Addition is commutative, so try swapping the extract indexes.
8548 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8549 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8550 continue;
8551
8552 // Extract indexes do not match horizontal requirement.
8553 return false;
8554 }
8555 }
8556 // We matched. Opcode and operands are returned by reference as arguments.
8557 return true;
8558}
8559
8560 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8561 const SDLoc &DL, SelectionDAG &DAG,
8562 unsigned HOpcode, SDValue V0, SDValue V1) {
8563 // If either input vector is not the same size as the build vector,
8564 // extract/insert the low bits to the correct size.
8565 // This is free (examples: zmm --> xmm, xmm --> ymm).
8566 MVT VT = BV->getSimpleValueType(0);
8567 unsigned Width = VT.getSizeInBits();
8568 if (V0.getValueSizeInBits() > Width)
8569 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8570 else if (V0.getValueSizeInBits() < Width)
8571 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8572
8573 if (V1.getValueSizeInBits() > Width)
8574 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8575 else if (V1.getValueSizeInBits() < Width)
8576 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8577
8578 unsigned NumElts = VT.getVectorNumElements();
8579 APInt DemandedElts = APInt::getAllOnes(NumElts);
8580 for (unsigned i = 0; i != NumElts; ++i)
8581 if (BV->getOperand(i).isUndef())
8582 DemandedElts.clearBit(i);
8583
8584 // If we don't need the upper xmm, then perform as an xmm hop.
8585 unsigned HalfNumElts = NumElts / 2;
8586 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8587 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8588 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8589 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8590 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8591 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8592 }
8593
8594 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8595}
8596
8597/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8598 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8599 const X86Subtarget &Subtarget,
8600 SelectionDAG &DAG) {
8601 // We need at least 2 non-undef elements to make this worthwhile by default.
8602 unsigned NumNonUndefs =
8603 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8604 if (NumNonUndefs < 2)
8605 return SDValue();
8606
8607 // There are 4 sets of horizontal math operations distinguished by type:
8608 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8609 // subtarget feature. Try to match those "native" patterns first.
8610 MVT VT = BV->getSimpleValueType(0);
8611 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8612 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8613 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8614 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8615 unsigned HOpcode;
8616 SDValue V0, V1;
8617 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8618 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8619 }
8620
8621 // Try harder to match 256-bit ops by using extract/concat.
8622 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8623 return SDValue();
8624
8625 // Count the number of UNDEF operands in the build_vector in input.
8626 unsigned NumElts = VT.getVectorNumElements();
8627 unsigned Half = NumElts / 2;
8628 unsigned NumUndefsLO = 0;
8629 unsigned NumUndefsHI = 0;
8630 for (unsigned i = 0, e = Half; i != e; ++i)
8631 if (BV->getOperand(i)->isUndef())
8632 NumUndefsLO++;
8633
8634 for (unsigned i = Half, e = NumElts; i != e; ++i)
8635 if (BV->getOperand(i)->isUndef())
8636 NumUndefsHI++;
8637
8638 SDValue InVec0, InVec1;
8639 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8640 SDValue InVec2, InVec3;
8641 unsigned X86Opcode;
8642 bool CanFold = true;
8643
8644 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8645 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8646 InVec3) &&
8647 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8648 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8649 X86Opcode = X86ISD::HADD;
8650 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8651 InVec1) &&
8652 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8653 InVec3) &&
8654 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8655 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8656 X86Opcode = X86ISD::HSUB;
8657 else
8658 CanFold = false;
8659
8660 if (CanFold) {
8661 // Do not try to expand this build_vector into a pair of horizontal
8662 // add/sub if we can emit a pair of scalar add/sub.
8663 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8664 return SDValue();
8665
8666 // Convert this build_vector into a pair of horizontal binops followed by
8667 // a concat vector. We must adjust the outputs from the partial horizontal
8668 // matching calls above to account for undefined vector halves.
8669 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8670 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8671 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8672 bool isUndefLO = NumUndefsLO == Half;
8673 bool isUndefHI = NumUndefsHI == Half;
8674 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8675 isUndefHI);
8676 }
8677 }
8678
8679 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8680 VT == MVT::v16i16) {
8681 unsigned X86Opcode;
8682 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8683 InVec1))
8684 X86Opcode = X86ISD::HADD;
8685 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8686 InVec1))
8687 X86Opcode = X86ISD::HSUB;
8688 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8689 InVec1))
8690 X86Opcode = X86ISD::FHADD;
8691 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8692 InVec1))
8693 X86Opcode = X86ISD::FHSUB;
8694 else
8695 return SDValue();
8696
8697 // Don't try to expand this build_vector into a pair of horizontal add/sub
8698 // if we can simply emit a pair of scalar add/sub.
8699 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8700 return SDValue();
8701
8702 // Convert this build_vector into two horizontal add/sub followed by
8703 // a concat vector.
8704 bool isUndefLO = NumUndefsLO == Half;
8705 bool isUndefHI = NumUndefsHI == Half;
8706 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8707 isUndefLO, isUndefHI);
8708 }
8709
8710 return SDValue();
8711}
8712
8713static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8714 SelectionDAG &DAG);
8715
8716/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8717/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8718/// just apply the bit operation to the vectors.
8719/// NOTE: It's not in our interest to start making a general purpose vectorizer
8720/// from this, but enough scalar bit operations are created from the later
8721/// legalization + scalarization stages to need basic support.
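/// Illustrative example: the v4i32 build_vector
///   <(shl x0, 3), (shl x1, 3), (shl x2, 3), (shl x3, 3)>
/// becomes (shl (build_vector x0..x3), (build_vector 3,3,3,3)), which is then
/// lowered immediately as a vector shift by a uniform immediate.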
8722 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8723 const X86Subtarget &Subtarget,
8724 SelectionDAG &DAG) {
8725 MVT VT = Op->getSimpleValueType(0);
8726 unsigned NumElems = VT.getVectorNumElements();
8727 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8728
8729 // Check that all elements have the same opcode.
8730 // TODO: Should we allow UNDEFS and if so how many?
8731 unsigned Opcode = Op->getOperand(0).getOpcode();
8732 for (unsigned i = 1; i < NumElems; ++i)
8733 if (Opcode != Op->getOperand(i).getOpcode())
8734 return SDValue();
8735
8736 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8737 bool IsShift = false;
8738 switch (Opcode) {
8739 default:
8740 return SDValue();
8741 case ISD::SHL:
8742 case ISD::SRL:
8743 case ISD::SRA:
8744 IsShift = true;
8745 break;
8746 case ISD::AND:
8747 case ISD::XOR:
8748 case ISD::OR:
8749 // Don't do this if the buildvector is a splat - we'd replace one
8750 // constant with an entire vector.
8751 if (Op->getSplatValue())
8752 return SDValue();
8753 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8754 return SDValue();
8755 break;
8756 }
8757
8758 SmallVector<SDValue, 4> LHSElts, RHSElts;
8759 for (SDValue Elt : Op->ops()) {
8760 SDValue LHS = Elt.getOperand(0);
8761 SDValue RHS = Elt.getOperand(1);
8762
8763 // We expect the canonicalized RHS operand to be the constant.
8764 if (!isa<ConstantSDNode>(RHS))
8765 return SDValue();
8766
8767 // Extend shift amounts.
8768 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8769 if (!IsShift)
8770 return SDValue();
8771 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8772 }
8773
8774 LHSElts.push_back(LHS);
8775 RHSElts.push_back(RHS);
8776 }
8777
8778 // Limit to shifts by uniform immediates.
8779 // TODO: Only accept vXi8/vXi64 special cases?
8780 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8781 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8782 return SDValue();
8783
8784 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8785 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8786 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8787
8788 if (!IsShift)
8789 return Res;
8790
8791 // Immediately lower the shift to ensure the constant build vector doesn't
8792 // get converted to a constant pool before the shift is lowered.
8793 return LowerShift(Res, Subtarget, DAG);
8794}
8795
8796static bool isShuffleFoldableLoad(SDValue);
8797
8798/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8799/// representing a blend.
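/// Illustrative example: on AVX2 targets (or when both values are
/// shuffle-foldable loads), the v4f64 build_vector <a, b, a, b> becomes
///   shuffle (splat a), (splat b), <0, 5, 2, 7>
/// which later lowers to a blend of two broadcasts.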
8800 static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8801 X86Subtarget const &Subtarget,
8802 SelectionDAG &DAG) {
8803 MVT VT = BVOp->getSimpleValueType(0u);
8804
8805 if (VT != MVT::v4f64)
8806 return SDValue();
8807
8808 // Collect unique operands.
8809 auto UniqueOps = SmallSet<SDValue, 16u>();
8810 for (SDValue Op : BVOp->ops()) {
8811 if (isIntOrFPConstant(Op) || Op.isUndef())
8812 return SDValue();
8813 UniqueOps.insert(Op);
8814 }
8815
8816 // Candidate BUILD_VECTOR must have 2 unique operands.
8817 if (UniqueOps.size() != 2u)
8818 return SDValue();
8819
8820 SDValue Op0 = BVOp->getOperand(0u);
8821 UniqueOps.erase(Op0);
8822 SDValue Op1 = *UniqueOps.begin();
8823
8824 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8825 isShuffleFoldableLoad(Op1)) {
8826 // Create shuffle mask.
8827 auto const NumElems = VT.getVectorNumElements();
8828 SmallVector<int, 16u> Mask(NumElems);
8829 for (auto I = 0u; I < NumElems; ++I) {
8830 SDValue Op = BVOp->getOperand(I);
8831 Mask[I] = Op == Op0 ? I : I + NumElems;
8832 }
8833 // Create shuffle of splats.
8834 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8835 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8836 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8837 }
8838
8839 return SDValue();
8840}
8841
8842/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8843/// functionality to do this, so it's all zeros, all ones, or some derivation
8844/// that is cheap to calculate.
8845static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8846 SelectionDAG &DAG,
8847 const X86Subtarget &Subtarget) {
8848 MVT VT = Op.getSimpleValueType();
8849
8850 // Vectors containing all zeros can be matched by pxor and xorps.
8851 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8852 return Op;
8853
8854 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8855 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8856 // vpcmpeqd on 256-bit vectors.
8857 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8858 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8859 return Op;
8860
8861 return getOnesVector(VT, DAG, DL);
8862 }
8863
8864 return SDValue();
8865}
8866
8867/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8868/// from a vector of source values and a vector of extraction indices.
8869/// The vectors might be manipulated to match the type of the permute op.
8870static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8871 const SDLoc &DL, SelectionDAG &DAG,
8872 const X86Subtarget &Subtarget) {
8873 MVT ShuffleVT = VT;
8874 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8875 unsigned NumElts = VT.getVectorNumElements();
8876 unsigned SizeInBits = VT.getSizeInBits();
8877
8878 // Adjust IndicesVec to match VT size.
8879 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8880 "Illegal variable permute mask size");
8881 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8882 // Narrow/widen the indices vector to the correct size.
8883 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8884 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8885 NumElts * VT.getScalarSizeInBits());
8886 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8887 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8888 SDLoc(IndicesVec), SizeInBits);
8889 // Zero-extend the index elements within the vector.
8890 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8891 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8892 IndicesVT, IndicesVec);
8893 }
8894 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8895
8896 // Handle a SrcVec that doesn't match the VT size.
8897 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8898 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8899 // Handle larger SrcVec by treating it as a larger permute.
8900 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8901 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8902 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8903 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8904 Subtarget, DAG, SDLoc(IndicesVec));
8905 SDValue NewSrcVec =
8906 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8907 if (NewSrcVec)
8908 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8909 return SDValue();
8910 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8911 // Widen smaller SrcVec to match VT.
8912 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8913 } else
8914 return SDValue();
8915 }
8916
8917 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8918 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8919 EVT SrcVT = Idx.getValueType();
8920 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8921 uint64_t IndexScale = 0;
8922 uint64_t IndexOffset = 0;
8923
8924 // If we're scaling a smaller permute op, then we need to repeat the
8925 // indices, scaling and offsetting them as well.
8926 // e.g. v4i32 -> v16i8 (Scale = 4)
8927 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8928 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8929 for (uint64_t i = 0; i != Scale; ++i) {
8930 IndexScale |= Scale << (i * NumDstBits);
8931 IndexOffset |= i << (i * NumDstBits);
8932 }
8933
8934 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8935 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8936 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8937 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8938 return Idx;
8939 };
8940
8941 unsigned Opcode = 0;
8942 switch (VT.SimpleTy) {
8943 default:
8944 break;
8945 case MVT::v16i8:
8946 if (Subtarget.hasSSSE3())
8947 Opcode = X86ISD::PSHUFB;
8948 break;
8949 case MVT::v8i16:
8950 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8951 Opcode = X86ISD::VPERMV;
8952 else if (Subtarget.hasSSSE3()) {
8953 Opcode = X86ISD::PSHUFB;
8954 ShuffleVT = MVT::v16i8;
8955 }
8956 break;
8957 case MVT::v4f32:
8958 case MVT::v4i32:
8959 if (Subtarget.hasAVX()) {
8960 Opcode = X86ISD::VPERMILPV;
8961 ShuffleVT = MVT::v4f32;
8962 } else if (Subtarget.hasSSSE3()) {
8963 Opcode = X86ISD::PSHUFB;
8964 ShuffleVT = MVT::v16i8;
8965 }
8966 break;
8967 case MVT::v2f64:
8968 case MVT::v2i64:
8969 if (Subtarget.hasAVX()) {
8970 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
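      // Adding the indices to themselves doubles each index, moving the
      // original 0/1 selection into bit#1.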
8971 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8972 Opcode = X86ISD::VPERMILPV;
8973 ShuffleVT = MVT::v2f64;
8974 } else if (Subtarget.hasSSE41()) {
8975 // SSE41 can compare v2i64 - select between indices 0 and 1.
8976 return DAG.getSelectCC(
8977 DL, IndicesVec,
8978 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8979 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8980 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8981 ISD::CondCode::SETEQ);
8982 }
8983 break;
8984 case MVT::v32i8:
8985 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8986 Opcode = X86ISD::VPERMV;
8987 else if (Subtarget.hasXOP()) {
8988 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8989 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8990 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8991 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8992 return DAG.getNode(
8993 ISD::CONCAT_VECTORS, DL, VT,
8994 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8995 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8996 } else if (Subtarget.hasAVX()) {
8997 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8998 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8999 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9000 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9001 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9002 ArrayRef<SDValue> Ops) {
9003 // Permute Lo and Hi and then select based on index range.
9004 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9005 // care about bit[7] as it's just an index vector.
9006 SDValue Idx = Ops[2];
9007 EVT VT = Idx.getValueType();
9008 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9009 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9010 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9011 ISD::CondCode::SETGT);
9012 };
9013 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9014 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9015 PSHUFBBuilder);
9016 }
9017 break;
9018 case MVT::v16i16:
9019 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9020 Opcode = X86ISD::VPERMV;
9021 else if (Subtarget.hasAVX()) {
9022 // Scale to v32i8 and perform as v32i8.
9023 IndicesVec = ScaleIndices(IndicesVec, 2);
9024 return DAG.getBitcast(
9025 VT, createVariablePermute(
9026 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9027 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9028 }
9029 break;
9030 case MVT::v8f32:
9031 case MVT::v8i32:
9032 if (Subtarget.hasAVX2())
9033 Opcode = X86ISD::VPERMV;
9034 else if (Subtarget.hasAVX()) {
9035 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9036 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9037 {0, 1, 2, 3, 0, 1, 2, 3});
9038 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9039 {4, 5, 6, 7, 4, 5, 6, 7});
9040 if (Subtarget.hasXOP())
9041 return DAG.getBitcast(
9042 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9043 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9044 // Permute Lo and Hi and then select based on index range.
9045 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9046 SDValue Res = DAG.getSelectCC(
9047 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9048 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9049 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9050 ISD::CondCode::SETGT);
9051 return DAG.getBitcast(VT, Res);
9052 }
9053 break;
9054 case MVT::v4i64:
9055 case MVT::v4f64:
9056 if (Subtarget.hasAVX512()) {
9057 if (!Subtarget.hasVLX()) {
9058 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9059 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9060 SDLoc(SrcVec));
9061 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9062 DAG, SDLoc(IndicesVec));
9063 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9064 DAG, Subtarget);
9065 return extract256BitVector(Res, 0, DAG, DL);
9066 }
9067 Opcode = X86ISD::VPERMV;
9068 } else if (Subtarget.hasAVX()) {
9069 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9070 SDValue LoLo =
9071 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9072 SDValue HiHi =
9073 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9074 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9075 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9076 if (Subtarget.hasXOP())
9077 return DAG.getBitcast(
9078 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9079 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9080 // Permute Lo and Hi and then select based on index range.
9081 // This works as VPERMILPD only uses index bit[1] to permute elements.
9082 SDValue Res = DAG.getSelectCC(
9083 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9084 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9085 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9086 ISD::CondCode::SETGT);
9087 return DAG.getBitcast(VT, Res);
9088 }
9089 break;
9090 case MVT::v64i8:
9091 if (Subtarget.hasVBMI())
9092 Opcode = X86ISD::VPERMV;
9093 break;
9094 case MVT::v32i16:
9095 if (Subtarget.hasBWI())
9096 Opcode = X86ISD::VPERMV;
9097 break;
9098 case MVT::v16f32:
9099 case MVT::v16i32:
9100 case MVT::v8f64:
9101 case MVT::v8i64:
9102 if (Subtarget.hasAVX512())
9103 Opcode = X86ISD::VPERMV;
9104 break;
9105 }
9106 if (!Opcode)
9107 return SDValue();
9108
9109 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9110 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9111 "Illegal variable permute shuffle type");
9112
9113 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9114 if (Scale > 1)
9115 IndicesVec = ScaleIndices(IndicesVec, Scale);
9116
9117 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9118 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9119
9120 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9121 SDValue Res = Opcode == X86ISD::VPERMV
9122 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9123 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9124 return DAG.getBitcast(VT, Res);
9125}
9126
9127// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9128// reasoned to be a permutation of a vector by indices in a non-constant vector.
9129// (build_vector (extract_elt V, (extract_elt I, 0)),
9130// (extract_elt V, (extract_elt I, 1)),
9131// ...
9132// ->
9133// (vpermv I, V)
9134//
9135// TODO: Handle undefs
9136// TODO: Utilize pshufb and zero mask blending to support more efficient
9137// construction of vectors with constant-0 elements.
9138static SDValue
9139LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9140 SelectionDAG &DAG,
9141 const X86Subtarget &Subtarget) {
9142 SDValue SrcVec, IndicesVec;
9143
9144 auto PeekThroughFreeze = [](SDValue N) {
9145 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9146 return N->getOperand(0);
9147 return N;
9148 };
9149 // Check for a match of the permute source vector and permute index elements.
9150 // This is done by checking that the i-th build_vector operand is of the form:
9151 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9152 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9153 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9154 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9155 return SDValue();
9156
9157 // If this is the first extract encountered in V, set the source vector,
9158 // otherwise verify the extract is from the previously defined source
9159 // vector.
9160 if (!SrcVec)
9161 SrcVec = Op.getOperand(0);
9162 else if (SrcVec != Op.getOperand(0))
9163 return SDValue();
9164 SDValue ExtractedIndex = Op->getOperand(1);
9165 // Peek through extends.
9166 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9167 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9168 ExtractedIndex = ExtractedIndex.getOperand(0);
9169 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9170 return SDValue();
9171
9172 // If this is the first extract from the index vector candidate, set the
9173 // indices vector, otherwise verify the extract is from the previously
9174 // defined indices vector.
9175 if (!IndicesVec)
9176 IndicesVec = ExtractedIndex.getOperand(0);
9177 else if (IndicesVec != ExtractedIndex.getOperand(0))
9178 return SDValue();
9179
9180 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9181 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9182 return SDValue();
9183 }
9184
9185 MVT VT = V.getSimpleValueType();
9186 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9187}
9188
9189SDValue
9190X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9191 SDLoc dl(Op);
9192
9193 MVT VT = Op.getSimpleValueType();
9194 MVT EltVT = VT.getVectorElementType();
9195 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9196 unsigned NumElems = Op.getNumOperands();
9197
9198 // Generate vectors for predicate vectors.
9199 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9200 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9201
9202 if (VT.getVectorElementType() == MVT::bf16 &&
9203 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9204 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9205
9206 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9207 return VectorCst;
9208
9209 unsigned EVTBits = EltVT.getSizeInBits();
9210 APInt UndefMask = APInt::getZero(NumElems);
9211 APInt FrozenUndefMask = APInt::getZero(NumElems);
9212 APInt ZeroMask = APInt::getZero(NumElems);
9213 APInt NonZeroMask = APInt::getZero(NumElems);
9214 bool IsAllConstants = true;
9215 bool OneUseFrozenUndefs = true;
9216 SmallSet<SDValue, 8> Values;
9217 unsigned NumConstants = NumElems;
9218 for (unsigned i = 0; i < NumElems; ++i) {
9219 SDValue Elt = Op.getOperand(i);
9220 if (Elt.isUndef()) {
9221 UndefMask.setBit(i);
9222 continue;
9223 }
9224 if (ISD::isFreezeUndef(Elt.getNode())) {
9225 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9226 FrozenUndefMask.setBit(i);
9227 continue;
9228 }
9229 Values.insert(Elt);
9230 if (!isIntOrFPConstant(Elt)) {
9231 IsAllConstants = false;
9232 NumConstants--;
9233 }
9234 if (X86::isZeroNode(Elt)) {
9235 ZeroMask.setBit(i);
9236 } else {
9237 NonZeroMask.setBit(i);
9238 }
9239 }
9240
9241 // All undef vector. Return an UNDEF.
9242 if (UndefMask.isAllOnes())
9243 return DAG.getUNDEF(VT);
9244
9245 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9246 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9247 return DAG.getFreeze(DAG.getUNDEF(VT));
9248
9249 // All undef/freeze(undef)/zero vector. Return a zero vector.
9250 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9251 return getZeroVector(VT, Subtarget, DAG, dl);
9252
9253 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9254 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9255 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9256 // and blend the FREEZE-UNDEF operands back in.
9257 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9258 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9259 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9260 SmallVector<int, 16> BlendMask(NumElems, -1);
9261 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9262 for (unsigned i = 0; i < NumElems; ++i) {
9263 if (UndefMask[i]) {
9264 BlendMask[i] = -1;
9265 continue;
9266 }
9267 BlendMask[i] = i;
9268 if (!FrozenUndefMask[i])
9269 Elts[i] = Op.getOperand(i);
9270 else
9271 BlendMask[i] += NumElems;
9272 }
9273 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9274 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9275 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9276 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9277 }
9278
9279 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9280
9281 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9282 // be better off lowering to a smaller build vector and padding with
9283 // undef/zero.
9284 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9285 !isFoldableUseOfShuffle(BV)) {
9286 unsigned UpperElems = NumElems / 2;
9287 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9288 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9289 if (NumUpperUndefsOrZeros >= UpperElems) {
9290 if (VT.is512BitVector() &&
9291 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9292 UpperElems = NumElems - (NumElems / 4);
9293 // If freeze(undef) is in any upper elements, force to zero.
9294 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9295 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9296 SDValue NewBV =
9297 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9298 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9299 }
9300 }
9301
9302 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9303 return AddSub;
9304 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9305 return HorizontalOp;
9306 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9307 return Broadcast;
9308 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9309 return BitOp;
9310 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9311 return Blend;
9312
9313 unsigned NumZero = ZeroMask.popcount();
9314 unsigned NumNonZero = NonZeroMask.popcount();
9315
9316 // If we are inserting one variable into a vector of non-zero constants, try
9317 // to avoid loading each constant element as a scalar. Load the constants as a
9318 // vector and then insert the variable scalar element. If insertion is not
9319 // supported, fall back to a shuffle to get the scalar blended with the
9320 // constants. Insertion into a zero vector is handled as a special-case
9321 // somewhere below here.
9322 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9323 FrozenUndefMask.isZero() &&
9324 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9325 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9326 // Create an all-constant vector. The variable element in the old
9327 // build vector is replaced by undef in the constant vector. Save the
9328 // variable scalar element and its index for use in the insertelement.
9329 LLVMContext &Context = *DAG.getContext();
9330 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9331 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9332 SDValue VarElt;
9333 SDValue InsIndex;
9334 for (unsigned i = 0; i != NumElems; ++i) {
9335 SDValue Elt = Op.getOperand(i);
9336 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9337 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9338 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9339 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9340 else if (!Elt.isUndef()) {
9341 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9342 "Expected one variable element in this vector");
9343 VarElt = Elt;
9344 InsIndex = DAG.getVectorIdxConstant(i, dl);
9345 }
9346 }
9347 Constant *CV = ConstantVector::get(ConstVecOps);
9348 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9349
9350 // The constants we just created may not be legal (e.g., floating point). We
9351 // must lower the vector right here because we cannot guarantee that we'll
9352 // legalize it before loading it. This is also why we could not just create
9353 // a new build vector here. If the build vector contains illegal constants,
9354 // it could get split back up into a series of insert elements.
9355 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9356 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9357 MachinePointerInfo MPI =
9358 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9359 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9360 unsigned InsertC = InsIndex->getAsZExtVal();
9361 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9362 if (InsertC < NumEltsInLow128Bits)
9363 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9364
9365 // There's no good way to insert into the high elements of a >128-bit
9366 // vector, so use shuffles to avoid an extract/insert sequence.
9367 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9368 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9369 SmallVector<int, 8> ShuffleMask;
9370 unsigned NumElts = VT.getVectorNumElements();
9371 for (unsigned i = 0; i != NumElts; ++i)
9372 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9373 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9374 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9375 }
9376
9377 // Special case for single non-zero, non-undef, element.
9378 if (NumNonZero == 1) {
9379 unsigned Idx = NonZeroMask.countr_zero();
9380 SDValue Item = Op.getOperand(Idx);
9381
9382 // If we have a constant or non-constant insertion into the low element of
9383 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9384 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9385 // depending on what the source datatype is.
9386 if (Idx == 0) {
9387 if (NumZero == 0)
9388 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9389
9390 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9391 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9392 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9393 assert((VT.is128BitVector() || VT.is256BitVector() ||
9394 VT.is512BitVector()) &&
9395 "Expected an SSE value type!");
9396 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9397 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9398 // zero vector.
9399 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9400 }
9401
9402 // We can't directly insert an i8 or i16 into a vector, so zero extend
9403 // it to i32 first.
9404 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9405 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9406 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9407 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9408 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9409 return DAG.getBitcast(VT, Item);
9410 }
9411 }
9412
9413 // Is it a vector logical left shift?
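    // e.g. (build_vector 0, X) can be formed as (scalar_to_vector X) shifted
    // left by half the vector width.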
9414 if (NumElems == 2 && Idx == 1 &&
9415 X86::isZeroNode(Op.getOperand(0)) &&
9416 !X86::isZeroNode(Op.getOperand(1))) {
9417 unsigned NumBits = VT.getSizeInBits();
9418 return getVShift(true, VT,
9419 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9420 VT, Op.getOperand(1)),
9421 NumBits/2, DAG, *this, dl);
9422 }
9423
9424 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9425 return SDValue();
9426
9427 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9428 // is a non-constant being inserted into an element other than the low one,
9429 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9430 // movd/movss) to move this into the low element, then shuffle it into
9431 // place.
9432 if (EVTBits == 32) {
9433 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9434 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9435 }
9436 }
9437
9438 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9439 if (Values.size() == 1) {
9440 if (EVTBits == 32) {
9441 // Instead of a shuffle like this:
9442 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9443 // Check if it's possible to issue this instead.
9444 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9445 unsigned Idx = NonZeroMask.countr_zero();
9446 SDValue Item = Op.getOperand(Idx);
9447 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9448 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9449 }
9450 return SDValue();
9451 }
9452
9453 // A vector full of immediates; various special cases are already
9454 // handled, so this is best done with a single constant-pool load.
9455 if (IsAllConstants)
9456 return SDValue();
9457
9458 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9459 return V;
9460
9461 // See if we can use a vector load to get all of the elements.
9462 {
9463 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9464 if (SDValue LD =
9465 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9466 return LD;
9467 }
9468
9469 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9470 // build_vector and broadcast it.
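  // e.g. (v8i32 build_vector A, B, A, B, A, B, A, B) becomes a VBROADCAST of
  // the 64-bit pair (A, B).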
9471 // TODO: We could probably generalize this more.
9472 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9473 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9474 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9475 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9476 // Make sure all the even/odd operands match.
9477 for (unsigned i = 2; i != NumElems; ++i)
9478 if (Ops[i % 2] != Op.getOperand(i))
9479 return false;
9480 return true;
9481 };
9482 if (CanSplat(Op, NumElems, Ops)) {
9483 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9484 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9485 // Create a new build vector and cast to v2i64/v2f64.
9486 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9487 DAG.getBuildVector(NarrowVT, dl, Ops));
9488 // Broadcast from v2i64/v2f64 and cast to final VT.
9489 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9490 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9491 NewBV));
9492 }
9493 }
9494
9495 // For AVX-length vectors, build the individual 128-bit pieces and use
9496 // shuffles to put them in place.
9497 if (VT.getSizeInBits() > 128) {
9498 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9499
9500 // Build both the lower and upper subvector.
9501 SDValue Lower =
9502 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9503 SDValue Upper = DAG.getBuildVector(
9504 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9505
9506 // Recreate the wider vector with the lower and upper part.
9507 return concatSubVectors(Lower, Upper, DAG, dl);
9508 }
9509
9510 // Let legalizer expand 2-wide build_vectors.
9511 if (EVTBits == 64) {
9512 if (NumNonZero == 1) {
9513 // One half is zero or undef.
9514 unsigned Idx = NonZeroMask.countr_zero();
9515 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9516 Op.getOperand(Idx));
9517 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9518 }
9519 return SDValue();
9520 }
9521
9522 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9523 if (EVTBits == 8 && NumElems == 16)
9524 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9525 NumZero, DAG, Subtarget))
9526 return V;
9527
9528 if (EltVT == MVT::i16 && NumElems == 8)
9529 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9530 NumZero, DAG, Subtarget))
9531 return V;
9532
9533 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9534 if (EVTBits == 32 && NumElems == 4)
9535 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9536 return V;
9537
9538 // If element VT is == 32 bits, turn it into a number of shuffles.
9539 if (NumElems == 4 && NumZero > 0) {
9540 SmallVector<SDValue, 8> Ops(NumElems);
9541 for (unsigned i = 0; i < 4; ++i) {
9542 bool isZero = !NonZeroMask[i];
9543 if (isZero)
9544 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9545 else
9546 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9547 }
9548
9549 for (unsigned i = 0; i < 2; ++i) {
9550 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9551 default: llvm_unreachable("Unexpected NonZero count");
9552 case 0:
9553 Ops[i] = Ops[i*2]; // Must be a zero vector.
9554 break;
9555 case 1:
9556 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9557 break;
9558 case 2:
9559 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9560 break;
9561 case 3:
9562 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9563 break;
9564 }
9565 }
9566
9567 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9568 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9569 int MaskVec[] = {
9570 Reverse1 ? 1 : 0,
9571 Reverse1 ? 0 : 1,
9572 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9573 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9574 };
9575 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9576 }
9577
9578 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9579
9580 // Check for a build vector from mostly shuffle plus few inserting.
9581 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9582 return Sh;
9583
9584 // For SSE 4.1, use insertps to put the high elements into the low element.
9585 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9586 SDValue Result;
9587 if (!Op.getOperand(0).isUndef())
9588 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9589 else
9590 Result = DAG.getUNDEF(VT);
9591
9592 for (unsigned i = 1; i < NumElems; ++i) {
9593 if (Op.getOperand(i).isUndef()) continue;
9594 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9595 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9596 }
9597 return Result;
9598 }
9599
9600 // Otherwise, expand into a number of unpckl*, start by extending each of
9601 // our (non-undef) elements to the full vector width with the element in the
9602 // bottom slot of the vector (which generates no code for SSE).
9603 SmallVector<SDValue, 8> Ops(NumElems);
9604 for (unsigned i = 0; i < NumElems; ++i) {
9605 if (!Op.getOperand(i).isUndef())
9606 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9607 else
9608 Ops[i] = DAG.getUNDEF(VT);
9609 }
9610
9611 // Next, we iteratively mix elements, e.g. for v4f32:
9612 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9613 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9614 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9615 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9616 // Generate scaled UNPCKL shuffle mask.
9617 SmallVector<int, 16> Mask;
9618 for(unsigned i = 0; i != Scale; ++i)
9619 Mask.push_back(i);
9620 for (unsigned i = 0; i != Scale; ++i)
9621 Mask.push_back(NumElems+i);
9622 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9623
9624 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9625 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9626 }
9627 return Ops[0];
9628}
9629
9630// 256-bit AVX can use the vinsertf128 instruction
9631// to create 256-bit vectors from two other 128-bit ones.
9632// TODO: Detect subvector broadcast here instead of DAG combine?
9633static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9634 SelectionDAG &DAG,
9635 const X86Subtarget &Subtarget) {
9636 MVT ResVT = Op.getSimpleValueType();
9637 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9638 "Value type must be 256-/512-bit wide");
9639
9640 unsigned NumOperands = Op.getNumOperands();
9641 unsigned NumFreezeUndef = 0;
9642 unsigned NumZero = 0;
9643 unsigned NumNonZero = 0;
9644 unsigned NonZeros = 0;
9645 SmallSet<SDValue, 4> Undefs;
9646 for (unsigned i = 0; i != NumOperands; ++i) {
9647 SDValue SubVec = Op.getOperand(i);
9648 if (SubVec.isUndef())
9649 continue;
9650 if (ISD::isFreezeUndef(SubVec.getNode())) {
9651 // If the freeze(undef) has multiple uses then we must fold to zero.
9652 if (SubVec.hasOneUse()) {
9653 ++NumFreezeUndef;
9654 } else {
9655 ++NumZero;
9656 Undefs.insert(SubVec);
9657 }
9658 }
9659 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9660 ++NumZero;
9661 else {
9662 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9663 NonZeros |= 1 << i;
9664 ++NumNonZero;
9665 }
9666 }
9667
9668 // If we have more than 2 non-zeros, build each half separately.
9669 if (NumNonZero > 2) {
9670 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9671 ArrayRef<SDUse> Ops = Op->ops();
9672 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9673 Ops.slice(0, NumOperands/2));
9674 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9675 Ops.slice(NumOperands/2));
9676 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9677 }
9678
9679 // Otherwise, build it up through insert_subvectors.
9680 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9681 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9682 : DAG.getUNDEF(ResVT));
9683
9684 // Replace Undef operands with ZeroVector.
9685 for (SDValue U : Undefs)
9686 DAG.ReplaceAllUsesWith(
9687 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9688
9689 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9690 unsigned NumSubElems = SubVT.getVectorNumElements();
9691 for (unsigned i = 0; i != NumOperands; ++i) {
9692 if ((NonZeros & (1 << i)) == 0)
9693 continue;
9694
9695 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9696 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9697 }
9698
9699 return Vec;
9700}
9701
9702// Returns true if the given node is a type promotion (by concatenating i1
9703// zeros) of the result of a node that already zeros all upper bits of
9704// k-register.
9705// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9706static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9707 const X86Subtarget &Subtarget,
9708 SelectionDAG & DAG) {
9709 MVT ResVT = Op.getSimpleValueType();
9710 unsigned NumOperands = Op.getNumOperands();
9711 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9712 "Unexpected number of operands in CONCAT_VECTORS");
9713
9714 uint64_t Zeros = 0;
9715 uint64_t NonZeros = 0;
9716 for (unsigned i = 0; i != NumOperands; ++i) {
9717 SDValue SubVec = Op.getOperand(i);
9718 if (SubVec.isUndef())
9719 continue;
9720 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9721 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9722 Zeros |= (uint64_t)1 << i;
9723 else
9724 NonZeros |= (uint64_t)1 << i;
9725 }
9726
9727 unsigned NumElems = ResVT.getVectorNumElements();
9728
9729 // If we are inserting a non-zero vector and there are zeros in the LSBs and
9730 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9731 // insert_subvector will give us two kshifts.
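  // e.g. concat(zero, X, undef, undef) can then be lowered with a single
  // KSHIFTL of X by one subvector width.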
9732 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9733 Log2_64(NonZeros) != NumOperands - 1) {
9734 unsigned Idx = Log2_64(NonZeros);
9735 SDValue SubVec = Op.getOperand(Idx);
9736 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9737 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9738 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9739 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9740 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9741 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9742 DAG.getVectorIdxConstant(0, dl));
9743 }
9744
9745 // If there are zero or one non-zeros we can handle this very simply.
9746 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9747 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9748 if (!NonZeros)
9749 return Vec;
9750 unsigned Idx = Log2_64(NonZeros);
9751 SDValue SubVec = Op.getOperand(Idx);
9752 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9753 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9754 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9755 }
9756
9757 if (NumOperands > 2) {
9758 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9759 ArrayRef<SDUse> Ops = Op->ops();
9760 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9761 Ops.slice(0, NumOperands / 2));
9762 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9763 Ops.slice(NumOperands / 2));
9764 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9765 }
9766
9767 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9768
9769 if (ResVT.getVectorNumElements() >= 16)
9770 return Op; // The operation is legal with KUNPCK
9771
9772 SDValue Vec =
9773 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9774 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9775 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9776 DAG.getVectorIdxConstant(NumElems / 2, dl));
9777}
9778
9779static SDValue LowerCONCAT_VECTORS(SDValue Op,
9780 const X86Subtarget &Subtarget,
9781 SelectionDAG &DAG) {
9782 SDLoc DL(Op);
9783 MVT VT = Op.getSimpleValueType();
9784 if (VT.getVectorElementType() == MVT::i1)
9785 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9786
9787 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9788 // from two other 128-bit ones.
9789 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9790 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9791 (VT.is512BitVector() &&
9792 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9793 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9794}
9795
9796//===----------------------------------------------------------------------===//
9797// Vector shuffle lowering
9798//
9799// This is an experimental code path for lowering vector shuffles on x86. It is
9800// designed to handle arbitrary vector shuffles and blends, gracefully
9801// degrading performance as necessary. It works hard to recognize idiomatic
9802// shuffles and lower them to optimal instruction patterns without leaving
9803// a framework that allows reasonably efficient handling of all vector shuffle
9804// patterns.
9805//===----------------------------------------------------------------------===//
9806
9807/// Checks whether the vector elements referenced by two shuffle masks are
9808/// equivalent.
9809static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9810 int Idx, int ExpectedIdx) {
9811 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9812 ExpectedIdx < MaskSize && "Out of range element index");
9813 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9814 return false;
9815
9816 EVT VT = Op.getValueType();
9817 EVT ExpectedVT = ExpectedOp.getValueType();
9818
9819 // Sources must be vectors and match the mask's element count.
9820 if (!VT.isVector() || !ExpectedVT.isVector() ||
9821 (int)VT.getVectorNumElements() != MaskSize ||
9822 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9823 return false;
9824
9825 // Exact match.
9826 if (Idx == ExpectedIdx && Op == ExpectedOp)
9827 return true;
9828
9829 switch (Op.getOpcode()) {
9830 case ISD::BUILD_VECTOR:
9831 // If the values are build vectors, we can look through them to find
9832 // equivalent inputs that make the shuffles equivalent.
9833 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9834 case ISD::BITCAST: {
9835 SDValue Src = Op.getOperand(0);
9836 EVT SrcVT = Src.getValueType();
9837 if (Op == ExpectedOp && SrcVT.isVector()) {
9838 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9839 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9840 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9841 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9842 Idx / Scale, ExpectedIdx / Scale);
9843 }
9844 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9845 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9846 for (unsigned I = 0; I != Scale; ++I)
9847 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9848 (Idx * Scale) + I,
9849 (ExpectedIdx * Scale) + I))
9850 return false;
9851 return true;
9852 }
9853 }
9854 break;
9855 }
9856 case ISD::VECTOR_SHUFFLE: {
9857 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9858 return Op == ExpectedOp &&
9859 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9860 }
9861 case X86ISD::VBROADCAST:
9862 case X86ISD::VBROADCAST_LOAD:
9863 return Op == ExpectedOp;
9864 case X86ISD::SUBV_BROADCAST_LOAD:
9865 if (Op == ExpectedOp) {
9866 auto *MemOp = cast<MemSDNode>(Op);
9867 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9868 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9869 }
9870 break;
9871 case X86ISD::VPERMI: {
9872 if (Op == ExpectedOp) {
9873 SmallVector<int, 8> Mask;
9874 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9875 SDValue Src = Op.getOperand(0);
9876 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9877 Mask[ExpectedIdx]);
9878 }
9879 break;
9880 }
9881 case X86ISD::HADD:
9882 case X86ISD::HSUB:
9883 case X86ISD::FHADD:
9884 case X86ISD::FHSUB:
9885 case X86ISD::PACKSS:
9886 case X86ISD::PACKUS:
9887 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
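    // e.g. for v8i16 HADD(X,X), elements 0 and 4 are both X[0]+X[1].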
9888 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9889 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9890 int NumElts = VT.getVectorNumElements();
9891 int NumLanes = VT.getSizeInBits() / 128;
9892 int NumEltsPerLane = NumElts / NumLanes;
9893 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9894 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9895 bool SameElt =
9896 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9897 return SameLane && SameElt;
9898 }
9899 break;
9900 }
9901
9902 return false;
9903}
9904
9905/// Tiny helper function to identify a no-op mask.
9906///
9907/// This is a somewhat boring predicate function. It checks whether the mask
9908/// array input, which is assumed to be a single-input shuffle mask of the kind
9909/// used by the X86 shuffle instructions (not a fully general
9910/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9911/// in-place shuffle are 'no-op's.
9912static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9913 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9914 assert(Mask[i] >= -1 && "Out of bound mask element!");
9915 if (Mask[i] >= 0 && Mask[i] != i)
9916 return false;
9917 }
9918 return true;
9919}
9920
9921/// Test whether there are elements crossing LaneSizeInBits lanes in this
9922/// shuffle mask.
9923///
9924/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9925/// and we routinely test for these.
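/// e.g. in a v8f32 shuffle, a mask element referencing input index 6 from
/// result position 1 crosses from the upper 128-bit lane into the lower one.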
9926static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9927 unsigned ScalarSizeInBits,
9928 ArrayRef<int> Mask) {
9929 assert(LaneSizeInBits && ScalarSizeInBits &&
9930 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9931 "Illegal shuffle lane size");
9932 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9933 int Size = Mask.size();
9934 for (int i = 0; i < Size; ++i)
9935 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9936 return true;
9937 return false;
9938}
9939
9940/// Test whether there are elements crossing 128-bit lanes in this
9941/// shuffle mask.
9942static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9943 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9944}
9945
9946/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9947/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9948/// better support 'repeated mask + lane permute' style shuffles.
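/// e.g. for v8f32, a mask whose first 128-bit lane is <0, 4, -1, -1> sources
/// that lane from both input lanes 0 and 1, so this returns true.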
9949static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9950 unsigned ScalarSizeInBits,
9951 ArrayRef<int> Mask) {
9952 assert(LaneSizeInBits && ScalarSizeInBits &&
9953 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9954 "Illegal shuffle lane size");
9955 int NumElts = Mask.size();
9956 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9957 int NumLanes = NumElts / NumEltsPerLane;
9958 if (NumLanes > 1) {
9959 for (int i = 0; i != NumLanes; ++i) {
9960 int SrcLane = -1;
9961 for (int j = 0; j != NumEltsPerLane; ++j) {
9962 int M = Mask[(i * NumEltsPerLane) + j];
9963 if (M < 0)
9964 continue;
9965 int Lane = (M % NumElts) / NumEltsPerLane;
9966 if (SrcLane >= 0 && SrcLane != Lane)
9967 return true;
9968 SrcLane = Lane;
9969 }
9970 }
9971 }
9972 return false;
9973}
9974
9975/// Test whether a shuffle mask is equivalent within each sub-lane.
9976///
9977/// This checks a shuffle mask to see if it is performing the same
9978/// lane-relative shuffle in each sub-lane. This trivially implies
9979/// that it is also not lane-crossing. It may however involve a blend from the
9980/// same lane of a second vector.
9981///
9982/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9983/// non-trivial to compute in the face of undef lanes. The representation is
9984/// suitable for use with existing 128-bit shuffles as entries from the second
9985/// vector have been remapped to [LaneSize, 2*LaneSize).
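/// e.g. for v8i32, the mask <0, 9, 2, 11, 4, 13, 6, 15> is repeated and yields
/// the 128-bit RepeatedMask <0, 5, 2, 7>.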
9986static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9987 ArrayRef<int> Mask,
9988 SmallVectorImpl<int> &RepeatedMask) {
9989 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9990 RepeatedMask.assign(LaneSize, -1);
9991 int Size = Mask.size();
9992 for (int i = 0; i < Size; ++i) {
9993 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9994 if (Mask[i] < 0)
9995 continue;
9996 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9997 // This entry crosses lanes, so there is no way to model this shuffle.
9998 return false;
9999
10000 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10001 // Adjust second vector indices to start at LaneSize instead of Size.
10002 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10003 : Mask[i] % LaneSize + LaneSize;
10004 if (RepeatedMask[i % LaneSize] < 0)
10005 // This is the first non-undef entry in this slot of a 128-bit lane.
10006 RepeatedMask[i % LaneSize] = LocalM;
10007 else if (RepeatedMask[i % LaneSize] != LocalM)
10008 // Found a mismatch with the repeated mask.
10009 return false;
10010 }
10011 return true;
10012}
10013
10014/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10015static bool
10016is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10017 SmallVectorImpl<int> &RepeatedMask) {
10018 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10019}
10020
10021static bool
10022is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10023 SmallVector<int, 32> RepeatedMask;
10024 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10025}
10026
10027/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10028static bool
10029is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10030 SmallVectorImpl<int> &RepeatedMask) {
10031 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10032}
10033
10034/// Test whether a target shuffle mask is equivalent within each sub-lane.
10035/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10036static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10037 unsigned EltSizeInBits,
10038 ArrayRef<int> Mask,
10039 SmallVectorImpl<int> &RepeatedMask) {
10040 int LaneSize = LaneSizeInBits / EltSizeInBits;
10041 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10042 int Size = Mask.size();
10043 for (int i = 0; i < Size; ++i) {
10044 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10045 if (Mask[i] == SM_SentinelUndef)
10046 continue;
10047 if (Mask[i] == SM_SentinelZero) {
10048 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10049 return false;
10050 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10051 continue;
10052 }
10053 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10054 // This entry crosses lanes, so there is no way to model this shuffle.
10055 return false;
10056
10057 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10058 // later vector indices to start at multiples of LaneSize instead of Size.
10059 int LaneM = Mask[i] / Size;
10060 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10061 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10062 // This is the first non-undef entry in this slot of a 128-bit lane.
10063 RepeatedMask[i % LaneSize] = LocalM;
10064 else if (RepeatedMask[i % LaneSize] != LocalM)
10065 // Found a mismatch with the repeated mask.
10066 return false;
10067 }
10068 return true;
10069}
10070
10071/// Test whether a target shuffle mask is equivalent within each sub-lane.
10072/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10073static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10074 ArrayRef<int> Mask,
10075 SmallVectorImpl<int> &RepeatedMask) {
10076 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10077 Mask, RepeatedMask);
10078}
10079
10080/// Checks whether a shuffle mask is equivalent to an explicit list of
10081/// arguments.
10082///
10083/// This is a fast way to test a shuffle mask against a fixed pattern:
10084///
10085///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10086///
10087/// It returns true if the mask is exactly as wide as ExpectedMask, and each
10088/// element of the mask is either -1 (signifying undef) or the value given in
10089/// the corresponding element of ExpectedMask.
10090static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10091 SDValue V1 = SDValue(),
10092 SDValue V2 = SDValue()) {
10093 int Size = Mask.size();
10094 if (Size != (int)ExpectedMask.size())
10095 return false;
10096
10097 for (int i = 0; i < Size; ++i) {
10098 assert(Mask[i] >= -1 && "Out of bound mask element!");
10099 int MaskIdx = Mask[i];
10100 int ExpectedIdx = ExpectedMask[i];
10101 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10102 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10103 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10104 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10105 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10106 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10107 return false;
10108 }
10109 }
10110 return true;
10111}
10112
10113/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10114///
10115/// The masks must be exactly the same width.
10116///
10117/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10118/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10119///
10120/// SM_SentinelZero is accepted as a valid negative index but must match in
10121/// both, or via a known bits test.
10122static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10123 ArrayRef<int> ExpectedMask,
10124 const SelectionDAG &DAG,
10125 SDValue V1 = SDValue(),
10126 SDValue V2 = SDValue()) {
10127 int Size = Mask.size();
10128 if (Size != (int)ExpectedMask.size())
10129 return false;
10130 assert(llvm::all_of(ExpectedMask,
10131 [Size](int M) {
10132 return M == SM_SentinelZero ||
10133 isInRange(M, 0, 2 * Size);
10134 }) &&
10135 "Illegal target shuffle mask");
10136
10137 // Check for out-of-range target shuffle mask indices.
10138 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10139 return false;
10140
10141 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10142 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10143 !V1.getValueType().isVector()))
10144 V1 = SDValue();
10145 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10146 !V2.getValueType().isVector()))
10147 V2 = SDValue();
10148
10149 APInt ZeroV1 = APInt::getZero(Size);
10150 APInt ZeroV2 = APInt::getZero(Size);
10151
10152 for (int i = 0; i < Size; ++i) {
10153 int MaskIdx = Mask[i];
10154 int ExpectedIdx = ExpectedMask[i];
10155 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10156 continue;
10157 // If we failed to match an expected SM_SentinelZero then early out.
10158 if (ExpectedIdx < 0)
10159 return false;
10160 if (MaskIdx == SM_SentinelZero) {
10161 // If we need this expected index to be a zero element, then update the
10162 // relevant zero mask and perform the known bits at the end to minimize
10163 // repeated computes.
10164 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10165 if (ExpectedV &&
10166 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10167 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10168 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10169 ZeroMask.setBit(BitIdx);
10170 continue;
10171 }
10172 }
10173 if (MaskIdx >= 0) {
10174 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10175 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10176 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10177 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10178 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10179 continue;
10180 }
10181 return false;
10182 }
10183 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10184 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10185}
10186
10187// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10188// instructions.
10189static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10190 const SelectionDAG &DAG) {
10191 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10192 return false;
10193
10194 SmallVector<int, 8> Unpcklwd;
10195 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10196 /* Unary = */ false);
10197 SmallVector<int, 8> Unpckhwd;
10198 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10199 /* Unary = */ false);
10200 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10201 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10202 return IsUnpackwdMask;
10203}
10204
10205static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10206 const SelectionDAG &DAG) {
10207 // Create 128-bit vector type based on mask size.
10208 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10209 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10210
10211 // We can't assume a canonical shuffle mask, so try the commuted version too.
10212 SmallVector<int, 4> CommutedMask(Mask);
10213 ShuffleVectorSDNode::commuteShuffleMask(CommutedMask, Mask.size());
10214
10215 // Match any of unary/binary or low/high.
10216 for (unsigned i = 0; i != 4; ++i) {
10217 SmallVector<int, 16> UnpackMask;
10218 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10219 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10220 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10221 return true;
10222 }
10223 return false;
10224}
10225
10226/// Return true if a shuffle mask chooses elements identically in its top and
10227/// bottom halves. For example, any splat mask has the same top and bottom
10228/// halves. If an element is undefined in only one half of the mask, the halves
10229/// are not considered identical.
10230static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10231 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10232 unsigned HalfSize = Mask.size() / 2;
10233 for (unsigned i = 0; i != HalfSize; ++i) {
10234 if (Mask[i] != Mask[i + HalfSize])
10235 return false;
10236 }
10237 return true;
10238}
10239
10240/// Get a 4-lane 8-bit shuffle immediate for a mask.
10241///
10242/// This helper function produces an 8-bit shuffle immediate corresponding to
10243/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10244/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10245/// example.
10246///
10247/// NB: We rely heavily on "undef" masks preserving the input lane.
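/// e.g. the mask <3, 1, 2, 0> encodes as 0b00'10'01'11 (0x27).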
10248static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10249 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10250 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10251 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10252 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10253 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10254
10255 // If the mask only uses one non-undef element, then fully 'splat' it to
10256 // improve later broadcast matching.
10257 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10258 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10259
10260 int FirstElt = Mask[FirstIndex];
10261 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10262 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10263
10264 unsigned Imm = 0;
10265 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10266 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10267 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10268 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10269 return Imm;
10270}
10271
10272static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10273 SelectionDAG &DAG) {
10274 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10275}
10276
10277// Canonicalize SHUFPD mask to improve chances of further folding.
10278// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
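// e.g. <1, 1, -1, -1> is canonicalized to the splat immediate 0b1111 rather
// than 0b1011.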
10279static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10280 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10281 "Unexpected SHUFPD mask size");
10282 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10283 "Unexpected SHUFPD mask elements");
10284
10285 // If the mask only uses one non-undef element, then fully 'splat' it to
10286 // improve later broadcast matching.
10287 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10288 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10289 "All undef shuffle mask");
10290
10291 int FirstElt = Mask[FirstIndex];
10292 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10293 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10294 unsigned Imm = 0;
10295 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10296 Imm |= FirstElt << I;
10297 return Imm;
10298 }
10299
10300 // Attempt to keep any undef elements in place to improve chances of the
10301 // shuffle becoming a (commutative) blend.
10302 unsigned Imm = 0;
10303 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10304 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10305
10306 return Imm;
10307}
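// Illustrative sketch (not part of X86ISelLowering.cpp): the SHUFPD immediate
// packs one bit per f64 element, selecting element 0 or 1 of the matching
// half of a 128-bit lane. This LLVM-free helper (hypothetical name
// encodeShufpdImm) mirrors only the "keep undefs in place" fallback above.
#include <cassert>
#include <vector>

static unsigned encodeShufpdImm(const std::vector<int> &Mask) {
  assert(Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8);
  unsigned Imm = 0;
  for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
    assert(-1 <= Mask[I] && Mask[I] <= 1);
    // Undef elements default to the "identity" bit (I & 1) so the result
    // stays compatible with a later (commutative) blend.
    Imm |= unsigned(Mask[I] < 0 ? int(I & 1) : Mask[I]) << I;
  }
  return Imm;
}

int main() {
  assert(encodeShufpdImm({1, 0}) == 0x1);        // v2f64 <hi, lo> swap
  assert(encodeShufpdImm({0, -1, 1, 0}) == 0x6); // undef keeps its identity bit
  return 0;
}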
10308
10309static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10310 SelectionDAG &DAG) {
10311 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10312}
10313
10314// The shuffle result takes the form:
10315// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
10316// Each Zeroable element corresponds to a particular Mask element, as
10317// described in the computeZeroableShuffleElements function.
10318//
10319// The function looks for a sub-mask whose non-zero elements are in
10320// increasing order. If such a sub-mask exists, the function returns true.
10321static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10322 ArrayRef<int> Mask, const EVT &VectorType,
10323 bool &IsZeroSideLeft) {
10324 int NextElement = -1;
10325 // Check if the Mask's nonzero elements are in increasing order.
10326 for (int i = 0, e = Mask.size(); i < e; i++) {
10327 // Check that the mask's zeroable elements are built only from zeros.
10328 assert(Mask[i] >= -1 && "Out of bound mask element!");
10329 if (Mask[i] < 0)
10330 return false;
10331 if (Zeroable[i])
10332 continue;
10333 // Find the lowest non zero element
10334 if (NextElement < 0) {
10335 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10336 IsZeroSideLeft = NextElement != 0;
10337 }
10338 // Exit if the mask's non zero elements are not in increasing order.
10339 if (NextElement != Mask[i])
10340 return false;
10341 NextElement++;
10342 }
10343 return true;
10344}
10345
10346static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10347 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10348 const X86Subtarget &Subtarget,
10349 unsigned Depth = 0);
10350
10351/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10352static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10353 ArrayRef<int> Mask, SDValue V1,
10354 SDValue V2, const APInt &Zeroable,
10355 const X86Subtarget &Subtarget,
10356 SelectionDAG &DAG) {
10357 int Size = Mask.size();
10358 int LaneSize = 128 / VT.getScalarSizeInBits();
10359 const int NumBytes = VT.getSizeInBits() / 8;
10360 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10361
10362 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10363 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10364 (Subtarget.hasBWI() && VT.is512BitVector()));
10365
10366 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10367 // Sign bit set in i8 mask means zero element.
10368 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10369
10370 SDValue V;
10371 for (int i = 0; i < NumBytes; ++i) {
10372 int M = Mask[i / NumEltBytes];
10373 if (M < 0) {
10374 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10375 continue;
10376 }
10377 if (Zeroable[i / NumEltBytes]) {
10378 PSHUFBMask[i] = ZeroMask;
10379 continue;
10380 }
10381
10382 // We can only use a single input of V1 or V2.
10383 SDValue SrcV = (M >= Size ? V2 : V1);
10384 if (V && V != SrcV)
10385 return SDValue();
10386 V = SrcV;
10387 M %= Size;
10388
10389 // PSHUFB can't cross lanes, ensure this doesn't happen.
10390 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10391 return SDValue();
10392
10393 M = M % LaneSize;
10394 M = M * NumEltBytes + (i % NumEltBytes);
10395 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10396 }
10397 assert(V && "Failed to find a source input");
10398
10399 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10400 return DAG.getBitcast(
10401 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10402 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10403}
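// Illustrative sketch (not part of X86ISelLowering.cpp): how an element-level
// shuffle mask expands into the per-byte PSHUFB control vector built above.
// The helper name buildPshufbControl is hypothetical; undef and zeroable
// elements are folded together here and both emit the 0x80 "write zero" byte.
#include <cstdint>
#include <vector>

static std::vector<uint8_t> buildPshufbControl(const std::vector<int> &Mask,
                                               unsigned EltBytes) {
  const unsigned NumBytes = Mask.size() * EltBytes;
  std::vector<uint8_t> Ctrl(NumBytes);
  for (unsigned i = 0; i != NumBytes; ++i) {
    int M = Mask[i / EltBytes];
    if (M < 0) {
      Ctrl[i] = 0x80; // sign bit set == zero this destination byte
      continue;
    }
    unsigned SrcByte = unsigned(M) * EltBytes + (i % EltBytes);
    if (SrcByte / 16 != i / 16)
      return {}; // PSHUFB can't cross a 16-byte lane; the code above bails too
    Ctrl[i] = uint8_t(SrcByte % 16); // in-lane byte index
  }
  return Ctrl;
}

// e.g. a v8i16 mask <3,3,2,2,1,1,0,0> (EltBytes = 2) yields the byte control
// {6,7,6,7, 4,5,4,5, 2,3,2,3, 0,1,0,1}.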
10404
10405static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10406 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10407 const SDLoc &dl);
10408
10409// X86 has a dedicated shuffle that can be lowered to VEXPAND.
10410static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10411 SDValue V2, ArrayRef<int> Mask,
10412 const APInt &Zeroable,
10413 const X86Subtarget &Subtarget,
10414 SelectionDAG &DAG) {
10415 bool IsLeftZeroSide = true;
10416 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10417 IsLeftZeroSide))
10418 return SDValue();
10419 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10420 MVT IntegerType =
10421 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10422 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10423 unsigned NumElts = VT.getVectorNumElements();
10424 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10425 "Unexpected number of vector elements");
10426 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10427 Subtarget, DAG, DL);
10428 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10429 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10430 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10431}
10432
10433static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10434 unsigned &UnpackOpcode, bool IsUnary,
10435 ArrayRef<int> TargetMask, const SDLoc &DL,
10436 SelectionDAG &DAG,
10437 const X86Subtarget &Subtarget) {
10438 int NumElts = VT.getVectorNumElements();
10439
10440 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10441 for (int i = 0; i != NumElts; i += 2) {
10442 int M1 = TargetMask[i + 0];
10443 int M2 = TargetMask[i + 1];
10444 Undef1 &= (SM_SentinelUndef == M1);
10445 Undef2 &= (SM_SentinelUndef == M2);
10446 Zero1 &= isUndefOrZero(M1);
10447 Zero2 &= isUndefOrZero(M2);
10448 }
10449 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10450 "Zeroable shuffle detected");
10451
10452 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10453 SmallVector<int, 64> Unpckl, Unpckh;
10454 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10455 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10456 (IsUnary ? V1 : V2))) {
10457 UnpackOpcode = X86ISD::UNPCKL;
10458 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10459 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10460 return true;
10461 }
10462
10463 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10464 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10465 (IsUnary ? V1 : V2))) {
10466 UnpackOpcode = X86ISD::UNPCKH;
10467 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10468 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10469 return true;
10470 }
10471
10472 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
10473 if (IsUnary && (Zero1 || Zero2)) {
10474 // Don't bother if we can blend instead.
10475 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10476 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10477 return false;
10478
10479 bool MatchLo = true, MatchHi = true;
10480 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10481 int M = TargetMask[i];
10482
10483 // Ignore if the input is known to be zero or the index is undef.
10484 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10485 (M == SM_SentinelUndef))
10486 continue;
10487
10488 MatchLo &= (M == Unpckl[i]);
10489 MatchHi &= (M == Unpckh[i]);
10490 }
10491
10492 if (MatchLo || MatchHi) {
10493 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10494 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10495 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10496 return true;
10497 }
10498 }
10499
10500 // If a binary shuffle, commute and try again.
10501 if (!IsUnary) {
10502 ShuffleVectorSDNode::commuteMask(Unpckl);
10503 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10504 UnpackOpcode = X86ISD::UNPCKL;
10505 std::swap(V1, V2);
10506 return true;
10507 }
10508
10509 ShuffleVectorSDNode::commuteMask(Unpckh);
10510 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10511 UnpackOpcode = X86ISD::UNPCKH;
10512 std::swap(V1, V2);
10513 return true;
10514 }
10515 }
10516
10517 return false;
10518}
10519
10520// X86 has dedicated unpack instructions that can handle specific blend
10521// operations: UNPCKH and UNPCKL.
10522static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10523 SDValue V2, ArrayRef<int> Mask,
10524 SelectionDAG &DAG) {
10525 SmallVector<int, 8> Unpckl;
10526 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10527 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10528 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10529
10530 SmallVector<int, 8> Unpckh;
10531 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10532 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10533 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10534
10535 // Commute and try again.
10536 ShuffleVectorSDNode::commuteMask(Unpckl);
10537 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10539
10540 ShuffleVectorSDNode::commuteMask(Unpckh);
10541 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10542 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10543
10544 return SDValue();
10545}
10546
10547/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10548/// followed by unpack 256-bit.
10549static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10550 SDValue V2, ArrayRef<int> Mask,
10551 SelectionDAG &DAG) {
10552 SmallVector<int, 32> Unpckl, Unpckh;
10553 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10554 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10555
10556 unsigned UnpackOpcode;
10557 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10558 UnpackOpcode = X86ISD::UNPCKL;
10559 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10560 UnpackOpcode = X86ISD::UNPCKH;
10561 else
10562 return SDValue();
10563
10564 // This is a "natural" unpack operation (rather than the 128-bit sectored
10565 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10566 // input in order to use the x86 instruction.
10567 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10568 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10569 V1 = DAG.getBitcast(VT, V1);
10570 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10571}
10572
10573// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10574// source into the lower elements and zeroing the upper elements.
10575static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10576 ArrayRef<int> Mask, const APInt &Zeroable,
10577 const X86Subtarget &Subtarget) {
10578 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10579 return false;
10580
10581 unsigned NumElts = Mask.size();
10582 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10583 unsigned MaxScale = 64 / EltSizeInBits;
10584
10585 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10586 unsigned SrcEltBits = EltSizeInBits * Scale;
10587 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10588 continue;
10589 unsigned NumSrcElts = NumElts / Scale;
10590 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10591 continue;
10592 unsigned UpperElts = NumElts - NumSrcElts;
10593 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10594 continue;
10595 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10596 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10597 DstVT = MVT::getIntegerVT(EltSizeInBits);
10598 if ((NumSrcElts * EltSizeInBits) >= 128) {
10599 // ISD::TRUNCATE
10600 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10601 } else {
10602 // X86ISD::VTRUNC
10603 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10604 }
10605 return true;
10606 }
10607
10608 return false;
10609}
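// Illustrative sketch (not part of X86ISelLowering.cpp): the mask shape that
// matchShuffleAsVTRUNC recognises. For a scale S, the low NumElts/S entries
// must pick elements 0, S, 2*S, ... and every remaining entry must be
// zeroable. The predicate name looksLikeTruncMask is hypothetical.
#include <vector>

static bool looksLikeTruncMask(const std::vector<int> &Mask,
                               const std::vector<bool> &Zeroable,
                               unsigned Scale) {
  const unsigned NumElts = Mask.size();
  const unsigned NumSrcElts = NumElts / Scale;
  for (unsigned i = 0; i != NumSrcElts; ++i)
    if (Mask[i] >= 0 && Mask[i] != int(i * Scale))
      return false; // not a strided pick (undef entries are fine)
  for (unsigned i = NumSrcElts; i != NumElts; ++i)
    if (!Zeroable[i])
      return false; // the upper elements must become zero
  return true;
}

// e.g. a v8i16 mask <0,2,4,6,Z,Z,Z,Z> with Scale = 2 matches: a v4i32->v4i16
// truncation placed in the low half with the upper half zeroed.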
10610
10611// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10612// element padding to the final DstVT.
10613static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10614 const X86Subtarget &Subtarget,
10615 SelectionDAG &DAG, bool ZeroUppers) {
10616 MVT SrcVT = Src.getSimpleValueType();
10617 MVT DstSVT = DstVT.getScalarType();
10618 unsigned NumDstElts = DstVT.getVectorNumElements();
10619 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10620 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10621
10622 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10623 return SDValue();
10624
10625 // Perform a direct ISD::TRUNCATE if possible.
10626 if (NumSrcElts == NumDstElts)
10627 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10628
10629 if (NumSrcElts > NumDstElts) {
10630 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10631 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10632 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10633 }
10634
10635 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10636 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10637 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10638 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10639 DstVT.getSizeInBits());
10640 }
10641
10642 // Non-VLX targets must truncate from a 512-bit type, so we need to
10643 // widen, truncate and then possibly extract the original subvector.
10644 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10645 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10646 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10647 }
10648
10649 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10650 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10651 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10652 if (DstVT != TruncVT)
10653 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10654 DstVT.getSizeInBits());
10655 return Trunc;
10656}
10657
10658// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10659//
10660// An example is the following:
10661//
10662// t0: ch = EntryToken
10663// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10664// t25: v4i32 = truncate t2
10665// t41: v8i16 = bitcast t25
10666// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10667// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10668// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10669// t18: v2i64 = bitcast t51
10670//
10671// One can just use a single vpmovdw instruction; without avx512vl we need to
10672// use the zmm variant and extract the lower subvector, padding with zeroes.
10673// TODO: Merge with lowerShuffleAsVTRUNC.
10674static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10675 SDValue V2, ArrayRef<int> Mask,
10676 const APInt &Zeroable,
10677 const X86Subtarget &Subtarget,
10678 SelectionDAG &DAG) {
10679 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10680 if (!Subtarget.hasAVX512())
10681 return SDValue();
10682
10683 unsigned NumElts = VT.getVectorNumElements();
10684 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10685 unsigned MaxScale = 64 / EltSizeInBits;
10686 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10687 unsigned SrcEltBits = EltSizeInBits * Scale;
10688 unsigned NumSrcElts = NumElts / Scale;
10689 unsigned UpperElts = NumElts - NumSrcElts;
10690 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10691 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10692 continue;
10693
10694 // Attempt to find a matching source truncation, but as a fallback VLX
10695 // cases can use the VPMOV directly.
10696 SDValue Src = peekThroughBitcasts(V1);
10697 if (Src.getOpcode() == ISD::TRUNCATE &&
10698 Src.getScalarValueSizeInBits() == SrcEltBits) {
10699 Src = Src.getOperand(0);
10700 } else if (Subtarget.hasVLX()) {
10701 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10702 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10703 Src = DAG.getBitcast(SrcVT, Src);
10704 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10705 if (Scale == 2 &&
10706 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10707 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10708 return SDValue();
10709 } else
10710 return SDValue();
10711
10712 // VPMOVWB is only available with avx512bw.
10713 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10714 return SDValue();
10715
10716 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10717 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10718 }
10719
10720 return SDValue();
10721}
10722
10723// Attempt to match binary shuffle patterns as a truncate.
10724static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10725 SDValue V2, ArrayRef<int> Mask,
10726 const APInt &Zeroable,
10727 const X86Subtarget &Subtarget,
10728 SelectionDAG &DAG) {
10729 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10730 "Unexpected VTRUNC type");
10731 if (!Subtarget.hasAVX512() ||
10732 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10733 return SDValue();
10734
10735 unsigned NumElts = VT.getVectorNumElements();
10736 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10737 unsigned MaxScale = 64 / EltSizeInBits;
10738 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10739 // TODO: Support non-BWI VPMOVWB truncations?
10740 unsigned SrcEltBits = EltSizeInBits * Scale;
10741 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10742 continue;
10743
10744 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10745 // Bail if the V2 elements are undef.
10746 unsigned NumHalfSrcElts = NumElts / Scale;
10747 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10748 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10749 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10750 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10751 continue;
10752
10753 // The elements beyond the truncation must be undef/zero.
10754 unsigned UpperElts = NumElts - NumSrcElts;
10755 if (UpperElts > 0 &&
10756 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10757 continue;
10758 bool UndefUppers =
10759 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10760
10761 // As we're using both sources, we need to concat them together
10762 // and truncate from the double-sized src.
10763 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10764
10765 // For offset truncations, ensure that the concat is cheap.
10766 SDValue Src =
10767 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10768 if (!Src) {
10769 if (Offset)
10770 continue;
10771 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10772 }
10773
10774 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10775 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10776 Src = DAG.getBitcast(SrcVT, Src);
10777
10778 // Shift the offset'd elements into place for the truncation.
10779 // TODO: Use getTargetVShiftByConstNode.
10780 if (Offset)
10781 Src = DAG.getNode(
10782 X86ISD::VSRLI, DL, SrcVT, Src,
10783 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10784
10785 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10786 }
10787 }
10788
10789 return SDValue();
10790}
10791
10792/// Check whether a compaction lowering can be done by dropping even/odd
10793/// elements and compute how many times even/odd elements must be dropped.
10794///
10795/// This handles shuffles which take every Nth element where N is a power of
10796/// two. Example shuffle masks:
10797///
10798/// (even)
10799/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10800/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10801/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10802/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10803/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10804/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10805///
10806/// (odd)
10807/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10808/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10809///
10810/// Any of these lanes can of course be undef.
10811///
10812/// This routine only supports N <= 3.
10813/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10814/// for larger N.
10815///
10816/// \returns N above, or the number of times even/odd elements must be dropped
10817/// if there is such a number. Otherwise returns zero.
10818static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10819 bool IsSingleInput) {
10820 // The modulus for the shuffle vector entries is based on whether this is
10821 // a single input or not.
10822 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10823 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10824 "We should only be called with masks with a power-of-2 size!");
10825
10826 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10827 int Offset = MatchEven ? 0 : 1;
10828
10829 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10830 // and 2^3 simultaneously. This is because we may have ambiguity with
10831 // partially undef inputs.
10832 bool ViableForN[3] = {true, true, true};
10833
10834 for (int i = 0, e = Mask.size(); i < e; ++i) {
10835 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10836 // want.
10837 if (Mask[i] < 0)
10838 continue;
10839
10840 bool IsAnyViable = false;
10841 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10842 if (ViableForN[j]) {
10843 uint64_t N = j + 1;
10844
10845 // The shuffle mask must be equal to (i * 2^N) % M.
10846 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10847 IsAnyViable = true;
10848 else
10849 ViableForN[j] = false;
10850 }
10851 // Early exit if we exhaust the possible powers of two.
10852 if (!IsAnyViable)
10853 break;
10854 }
10855
10856 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10857 if (ViableForN[j])
10858 return j + 1;
10859
10860 // Return 0 as there is no viable power of two.
10861 return 0;
10862}
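// Illustrative sketch (not part of X86ISelLowering.cpp): the per-stride test
// used above, restated for a single power-of-two N. Element i must select
// (i << N) + Offset, wrapped by the shuffle modulus (the mask size, doubled
// for two-input shuffles). The helper name viableForDropN is hypothetical.
#include <vector>

static bool viableForDropN(const std::vector<int> &Mask, unsigned N,
                           bool MatchEven, bool IsSingleInput) {
  const unsigned Modulus = Mask.size() * (IsSingleInput ? 1 : 2);
  const unsigned Offset = MatchEven ? 0 : 1;
  for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes are optimistically accepted
    if (unsigned(Mask[i]) != (((i << N) & (Modulus - 1)) + Offset))
      return false;
  }
  return true;
}

// e.g. the 16-element single-input mask <0,2,4,...,14,0,2,...,14> from the
// first example above is viable for N = 1 (drop every other element once).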
10863
10864// X86 has dedicated pack instructions that can handle specific truncation
10865// operations: PACKSS and PACKUS.
10866// Checks for compaction shuffle masks if MaxStages > 1.
10867// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10868static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10869 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10870 const SelectionDAG &DAG,
10871 const X86Subtarget &Subtarget,
10872 unsigned MaxStages = 1) {
10873 unsigned NumElts = VT.getVectorNumElements();
10874 unsigned BitSize = VT.getScalarSizeInBits();
10875 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10876 "Illegal maximum compaction");
10877
10878 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10879 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10880 unsigned NumPackedBits = NumSrcBits - BitSize;
10881 N1 = peekThroughBitcasts(N1);
10882 N2 = peekThroughBitcasts(N2);
10883 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10884 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10885 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10886 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10887 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10888 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10889 return false;
10890 if (Subtarget.hasSSE41() || BitSize == 8) {
10891 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10892 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10893 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10894 V1 = N1;
10895 V2 = N2;
10896 SrcVT = PackVT;
10897 PackOpcode = X86ISD::PACKUS;
10898 return true;
10899 }
10900 }
10901 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10902 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10903 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10904 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10905 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10906 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10907 V1 = N1;
10908 V2 = N2;
10909 SrcVT = PackVT;
10910 PackOpcode = X86ISD::PACKSS;
10911 return true;
10912 }
10913 return false;
10914 };
10915
10916 // Attempt to match against wider and wider compaction patterns.
10917 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10918 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10919 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10920
10921 // Try binary shuffle.
10922 SmallVector<int, 32> BinaryMask;
10923 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10924 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10925 if (MatchPACK(V1, V2, PackVT))
10926 return true;
10927
10928 // Try unary shuffle.
10929 SmallVector<int, 32> UnaryMask;
10930 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10931 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10932 if (MatchPACK(V1, V1, PackVT))
10933 return true;
10934 }
10935
10936 return false;
10937}
10938
10939static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10940 SDValue V2, ArrayRef<int> Mask,
10941 const X86Subtarget &Subtarget,
10942 SelectionDAG &DAG) {
10943 MVT PackVT;
10944 unsigned PackOpcode;
10945 unsigned SizeBits = VT.getSizeInBits();
10946 unsigned EltBits = VT.getScalarSizeInBits();
10947 unsigned MaxStages = Log2_32(64 / EltBits);
10948 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10949 Subtarget, MaxStages))
10950 return SDValue();
10951
10952 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10953 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10954
10955 // Don't lower multi-stage packs on AVX512; truncation is better.
10956 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10957 return SDValue();
10958
10959 // Pack to the largest type possible:
10960 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10961 unsigned MaxPackBits = 16;
10962 if (CurrentEltBits > 16 &&
10963 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10964 MaxPackBits = 32;
10965
10966 // Repeatedly pack down to the target size.
10967 SDValue Res;
10968 for (unsigned i = 0; i != NumStages; ++i) {
10969 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10970 unsigned NumSrcElts = SizeBits / SrcEltBits;
10971 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10972 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10973 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10974 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10975 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10976 DAG.getBitcast(SrcVT, V2));
10977 V1 = V2 = Res;
10978 CurrentEltBits /= 2;
10979 }
10980 assert(Res && Res.getValueType() == VT &&
10981 "Failed to lower compaction shuffle");
10982 return Res;
10983}
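// Illustrative sketch (not part of X86ISelLowering.cpp): the loop above packs
// down one power of two per stage, so compacting vXi32 to vXi8 needs
// Log2(32/8) = 2 stages (PACK*SDW then PACK*SWB). A value-level model of one
// PACKUSDW stage on a single 128-bit lane, assuming every input already fits
// in 16 bits (which is what the MaskedValueIsZero check above guarantees):
#include <cstdint>
#include <vector>

static std::vector<uint16_t> packusdwLane(const std::vector<uint32_t> &A,
                                          const std::vector<uint32_t> &B) {
  std::vector<uint16_t> R;
  for (uint32_t V : A) R.push_back(uint16_t(V)); // low halves of A ...
  for (uint32_t V : B) R.push_back(uint16_t(V)); // ... then low halves of B
  return R;
}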
10984
10985/// Try to emit a bitmask instruction for a shuffle.
10986///
10987/// This handles cases where we can model a blend exactly as a bitmask due to
10988/// one of the inputs being zeroable.
10989static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10990 SDValue V2, ArrayRef<int> Mask,
10991 const APInt &Zeroable,
10992 const X86Subtarget &Subtarget,
10993 SelectionDAG &DAG) {
10994 MVT MaskVT = VT;
10995 MVT EltVT = VT.getVectorElementType();
10996 SDValue Zero, AllOnes;
10997 // Use f64 if i64 isn't legal.
10998 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10999 EltVT = MVT::f64;
11000 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11001 }
11002
11003 MVT LogicVT = VT;
11004 if (EltVT.isFloatingPoint()) {
11005 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11006 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11007 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11008 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11009 } else {
11010 Zero = DAG.getConstant(0, DL, EltVT);
11011 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11012 }
11013
11014 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11015 SDValue V;
11016 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11017 if (Zeroable[i])
11018 continue;
11019 if (Mask[i] % Size != i)
11020 return SDValue(); // Not a blend.
11021 if (!V)
11022 V = Mask[i] < Size ? V1 : V2;
11023 else if (V != (Mask[i] < Size ? V1 : V2))
11024 return SDValue(); // Can only let one input through the mask.
11025
11026 VMaskOps[i] = AllOnes;
11027 }
11028 if (!V)
11029 return SDValue(); // No non-zeroable elements!
11030
11031 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11032 VMask = DAG.getBitcast(LogicVT, VMask);
11033 V = DAG.getBitcast(LogicVT, V);
11034 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11035 return DAG.getBitcast(VT, And);
11036}
11037
11038/// Try to emit a blend instruction for a shuffle using bit math.
11039///
11040/// This is used as a fallback approach when first class blend instructions are
11041/// unavailable. Currently it is only suitable for integer vectors, but could
11042/// be generalized for floating point vectors if desirable.
11043static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11044 SDValue V2, ArrayRef<int> Mask,
11045 SelectionDAG &DAG) {
11046 assert(VT.isInteger() && "Only supports integer vector types!");
11047 MVT EltVT = VT.getVectorElementType();
11048 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11049 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11050 SmallVector<SDValue, 16> MaskOps;
11051 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11052 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11053 return SDValue(); // Shuffled input!
11054 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11055 }
11056
11057 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11058 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11059}
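// Illustrative sketch (not part of X86ISelLowering.cpp): the bit blend built
// above is the classic bit-select (V1 & M) | (V2 & ~M), where M is all-ones
// in every element taken from V1 and all-zeros otherwise. Per-element scalar
// model (hypothetical helper name bitSelect):
#include <cstdint>

static uint32_t bitSelect(uint32_t V1Elt, uint32_t V2Elt, bool TakeV1) {
  const uint32_t M = TakeV1 ? 0xFFFFFFFFu : 0u; // AllOnes / Zero element
  return (V1Elt & M) | (V2Elt & ~M);
}
// With VPTERNLOG the whole select folds into one instruction, which is why
// the byte-blend path below tries this before falling back to VPBLENDVB.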
11060
11061static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11062 SDValue PreservedSrc,
11063 const X86Subtarget &Subtarget,
11064 SelectionDAG &DAG);
11065
11066static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11067 MutableArrayRef<int> Mask,
11068 const APInt &Zeroable, bool &ForceV1Zero,
11069 bool &ForceV2Zero, uint64_t &BlendMask) {
11070 bool V1IsZeroOrUndef =
11071 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11072 bool V2IsZeroOrUndef =
11073 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11074
11075 BlendMask = 0;
11076 ForceV1Zero = false, ForceV2Zero = false;
11077 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11078
11079 int NumElts = Mask.size();
11080 int NumLanes = VT.getSizeInBits() / 128;
11081 int NumEltsPerLane = NumElts / NumLanes;
11082 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11083
11084 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11085 // then ensure the blend mask part for that lane just references that input.
11086 bool ForceWholeLaneMasks =
11087 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11088
11089 // Attempt to generate the binary blend mask. If an input is zero then
11090 // we can use any lane.
11091 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11092 // Keep track of the inputs used per lane.
11093 bool LaneV1InUse = false;
11094 bool LaneV2InUse = false;
11095 uint64_t LaneBlendMask = 0;
11096 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11097 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11098 int M = Mask[Elt];
11099 if (M == SM_SentinelUndef)
11100 continue;
11101 if (M == Elt || (0 <= M && M < NumElts &&
11102 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11103 Mask[Elt] = Elt;
11104 LaneV1InUse = true;
11105 continue;
11106 }
11107 if (M == (Elt + NumElts) ||
11108 (NumElts <= M &&
11109 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11110 LaneBlendMask |= 1ull << LaneElt;
11111 Mask[Elt] = Elt + NumElts;
11112 LaneV2InUse = true;
11113 continue;
11114 }
11115 if (Zeroable[Elt]) {
11116 if (V1IsZeroOrUndef) {
11117 ForceV1Zero = true;
11118 Mask[Elt] = Elt;
11119 LaneV1InUse = true;
11120 continue;
11121 }
11122 if (V2IsZeroOrUndef) {
11123 ForceV2Zero = true;
11124 LaneBlendMask |= 1ull << LaneElt;
11125 Mask[Elt] = Elt + NumElts;
11126 LaneV2InUse = true;
11127 continue;
11128 }
11129 }
11130 return false;
11131 }
11132
11133 // If we only used V2 then splat the lane blend mask to avoid any demanded
11134 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11135 // blend mask bit).
11136 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11137 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11138
11139 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11140 }
11141 return true;
11142}
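// Illustrative sketch (not part of X86ISelLowering.cpp): stripped of the
// zeroable and element-equivalence refinements above, a blend mask is just
// one bit per element, set when that element comes from V2. The helper name
// simpleBlendMask is hypothetical.
#include <cstdint>
#include <optional>
#include <vector>

static std::optional<uint64_t> simpleBlendMask(const std::vector<int> &Mask) {
  const int NumElts = int(Mask.size());
  uint64_t Blend = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0 || M == i)
      continue;           // undef, or element i of V1 stays in place
    if (M == i + NumElts) {
      Blend |= 1ull << i; // element i of V2 drops into place
      continue;
    }
    return std::nullopt;  // an element changes position: not a blend
  }
  return Blend;
}

// e.g. a v4i32 mask <0,5,2,7> gives 0b1010, the immediate BLENDPS/VPBLENDD
// would use.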
11143
11144/// Try to emit a blend instruction for a shuffle.
11145///
11146/// This doesn't do any checks for the availability of instructions for blending
11147/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11148/// be matched in the backend with the type given. What it does check for is
11149/// that the shuffle mask is a blend, or convertible into a blend with zero.
11150static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11151 SDValue V2, ArrayRef<int> Original,
11152 const APInt &Zeroable,
11153 const X86Subtarget &Subtarget,
11154 SelectionDAG &DAG) {
11155 uint64_t BlendMask = 0;
11156 bool ForceV1Zero = false, ForceV2Zero = false;
11157 SmallVector<int, 64> Mask(Original);
11158 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11159 BlendMask))
11160 return SDValue();
11161
11162 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11163 if (ForceV1Zero)
11164 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11165 if (ForceV2Zero)
11166 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11167
11168 unsigned NumElts = VT.getVectorNumElements();
11169
11170 switch (VT.SimpleTy) {
11171 case MVT::v4i64:
11172 case MVT::v8i32:
11173 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11174 [[fallthrough]];
11175 case MVT::v4f64:
11176 case MVT::v8f32:
11177 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11178 [[fallthrough]];
11179 case MVT::v2f64:
11180 case MVT::v2i64:
11181 case MVT::v4f32:
11182 case MVT::v4i32:
11183 case MVT::v8i16:
11184 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11185 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11186 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11187 case MVT::v16i16: {
11188 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11189 SmallVector<int, 8> RepeatedMask;
11190 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11191 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11192 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11193 BlendMask = 0;
11194 for (int i = 0; i < 8; ++i)
11195 if (RepeatedMask[i] >= 8)
11196 BlendMask |= 1ull << i;
11197 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11198 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11199 }
11200 // Use PBLENDW for lower/upper lanes and then blend lanes.
11201 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11202 // merge to VSELECT where useful.
11203 uint64_t LoMask = BlendMask & 0xFF;
11204 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11205 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11206 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11207 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11208 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11209 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11210 return DAG.getVectorShuffle(
11211 MVT::v16i16, DL, Lo, Hi,
11212 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11213 }
11214 [[fallthrough]];
11215 }
11216 case MVT::v32i8:
11217 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11218 [[fallthrough]];
11219 case MVT::v16i8: {
11220 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11221
11222 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11223 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11224 Subtarget, DAG))
11225 return Masked;
11226
11227 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11228 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11229 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11230 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11231 }
11232
11233 // If we have VPTERNLOG, we can use that as a bit blend.
11234 if (Subtarget.hasVLX())
11235 if (SDValue BitBlend =
11236 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11237 return BitBlend;
11238
11239 // Scale the blend by the number of bytes per element.
11240 int Scale = VT.getScalarSizeInBits() / 8;
11241
11242 // This form of blend is always done on bytes. Compute the byte vector
11243 // type.
11244 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11245
11246 // x86 allows load folding with blendvb from the 2nd source operand. But
11247 // we are still using LLVM select here (see comment below), so that's V1.
11248 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11249 // allow that load-folding possibility.
11250 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11251 ShuffleVectorSDNode::commuteMask(Mask);
11252 std::swap(V1, V2);
11253 }
11254
11255 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11256 // mix of LLVM's code generator and the x86 backend. We tell the code
11257 // generator that boolean values in the elements of an x86 vector register
11258 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11259 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11260 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11261 // of the element (the remaining are ignored) and 0 in that high bit would
11262 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11263 // the LLVM model for boolean values in vector elements gets the relevant
11264 // bit set, it is set backwards and over constrained relative to x86's
11265 // actual model.
11266 SmallVector<SDValue, 32> VSELECTMask;
11267 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11268 for (int j = 0; j < Scale; ++j)
11269 VSELECTMask.push_back(
11270 Mask[i] < 0
11271 ? DAG.getUNDEF(MVT::i8)
11272 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11273
11274 V1 = DAG.getBitcast(BlendVT, V1);
11275 V2 = DAG.getBitcast(BlendVT, V2);
11276 return DAG.getBitcast(
11277 VT,
11278 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11279 V1, V2));
11280 }
11281 case MVT::v16f32:
11282 case MVT::v8f64:
11283 case MVT::v8i64:
11284 case MVT::v16i32:
11285 case MVT::v32i16:
11286 case MVT::v64i8: {
11287 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11288 bool OptForSize = DAG.shouldOptForSize();
11289 if (!OptForSize) {
11290 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11291 Subtarget, DAG))
11292 return Masked;
11293 }
11294
11295 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11296 // masked move.
11297 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11298 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11299 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11300 }
11301 default:
11302 llvm_unreachable("Not a supported integer vector type!");
11303 }
11304}
11305
11306/// Try to lower as a blend of elements from two inputs followed by
11307/// a single-input permutation.
11308///
11309/// This matches the pattern where we can blend elements from two inputs and
11310/// then reduce the shuffle to a single-input permutation.
11311static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11312 SDValue V1, SDValue V2,
11313 ArrayRef<int> Mask,
11314 SelectionDAG &DAG,
11315 bool ImmBlends = false) {
11316 // We build up the blend mask while checking whether a blend is a viable way
11317 // to reduce the shuffle.
11318 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11319 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11320
11321 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11322 if (Mask[i] < 0)
11323 continue;
11324
11325 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11326
11327 if (BlendMask[Mask[i] % Size] < 0)
11328 BlendMask[Mask[i] % Size] = Mask[i];
11329 else if (BlendMask[Mask[i] % Size] != Mask[i])
11330 return SDValue(); // Can't blend in the needed input!
11331
11332 PermuteMask[i] = Mask[i] % Size;
11333 }
11334
11335 // If only immediate blends, then bail if the blend mask can't be widened to
11336 // i16.
11337 unsigned EltSize = VT.getScalarSizeInBits();
11338 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11339 return SDValue();
11340
11341 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11342 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11343}
11344
11345/// Try to lower as an unpack of elements from two inputs followed by
11346/// a single-input permutation.
11347///
11348/// This matches the pattern where we can unpack elements from two inputs and
11349/// then reduce the shuffle to a single-input (wider) permutation.
11350static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11351 SDValue V1, SDValue V2,
11352 ArrayRef<int> Mask,
11353 SelectionDAG &DAG) {
11354 int NumElts = Mask.size();
11355 int NumLanes = VT.getSizeInBits() / 128;
11356 int NumLaneElts = NumElts / NumLanes;
11357 int NumHalfLaneElts = NumLaneElts / 2;
11358
11359 bool MatchLo = true, MatchHi = true;
11360 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11361
11362 // Determine UNPCKL/UNPCKH type and operand order.
11363 for (int Elt = 0; Elt != NumElts; ++Elt) {
11364 int M = Mask[Elt];
11365 if (M < 0)
11366 continue;
11367
11368 // Normalize the mask value depending on whether it's V1 or V2.
11369 int NormM = M;
11370 SDValue &Op = Ops[Elt & 1];
11371 if (M < NumElts && (Op.isUndef() || Op == V1))
11372 Op = V1;
11373 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11374 Op = V2;
11375 NormM -= NumElts;
11376 } else
11377 return SDValue();
11378
11379 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11380 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11381 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11382 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11383 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11384 if (MatchLoAnyLane || MatchHiAnyLane) {
11385 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11386 "Failed to match UNPCKLO/UNPCKHI");
11387 break;
11388 }
11389 }
11390 MatchLo &= MatchLoAnyLane;
11391 MatchHi &= MatchHiAnyLane;
11392 if (!MatchLo && !MatchHi)
11393 return SDValue();
11394 }
11395 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11396
11397 // Element indices have changed after unpacking. Calculate permute mask
11398 // so that they will be put back to the position as dictated by the
11399 // original shuffle mask indices.
11400 SmallVector<int, 32> PermuteMask(NumElts, -1);
11401 for (int Elt = 0; Elt != NumElts; ++Elt) {
11402 int M = Mask[Elt];
11403 if (M < 0)
11404 continue;
11405 int NormM = M;
11406 if (NumElts <= M)
11407 NormM -= NumElts;
11408 bool IsFirstOp = M < NumElts;
11409 int BaseMaskElt =
11410 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11411 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11412 PermuteMask[Elt] = BaseMaskElt;
11413 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11414 PermuteMask[Elt] = BaseMaskElt + 1;
11415 assert(PermuteMask[Elt] != -1 &&
11416 "Input mask element is defined but failed to assign permute mask");
11417 }
11418
11419 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11420 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11421 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11422}
11423
11424/// Try to lower a shuffle as a permute of the inputs followed by an
11425/// UNPCK instruction.
11426///
11427/// This specifically targets cases where we end up with alternating between
11428/// the two inputs, and so can permute them into something that feeds a single
11429/// UNPCK instruction. Note that this routine only targets integer vectors
11430/// because for floating point vectors we have a generalized SHUFPS lowering
11431/// strategy that handles everything that doesn't *exactly* match an unpack,
11432/// making this clever lowering unnecessary.
11433static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11434 SDValue V1, SDValue V2,
11435 ArrayRef<int> Mask,
11436 const X86Subtarget &Subtarget,
11437 SelectionDAG &DAG) {
11438 int Size = Mask.size();
11439 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11440
11441 // This routine only supports 128-bit integer dual input vectors.
11442 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11443 return SDValue();
11444
11445 int NumLoInputs =
11446 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11447 int NumHiInputs =
11448 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11449
11450 bool UnpackLo = NumLoInputs >= NumHiInputs;
11451
11452 auto TryUnpack = [&](int ScalarSize, int Scale) {
11453 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11454 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11455
11456 for (int i = 0; i < Size; ++i) {
11457 if (Mask[i] < 0)
11458 continue;
11459
11460 // Each element of the unpack contains Scale elements from this mask.
11461 int UnpackIdx = i / Scale;
11462
11463 // We only handle the case where V1 feeds the first slots of the unpack.
11464 // We rely on canonicalization to ensure this is the case.
11465 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11466 return SDValue();
11467
11468 // Setup the mask for this input. The indexing is tricky as we have to
11469 // handle the unpack stride.
11470 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11471 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11472 Mask[i] % Size;
11473 }
11474
11475 // If we will have to shuffle both inputs to use the unpack, check whether
11476 // we can just unpack first and shuffle the result. If so, skip this unpack.
11477 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11478 !isNoopShuffleMask(V2Mask))
11479 return SDValue();
11480
11481 // Shuffle the inputs into place.
11482 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11483 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11484
11485 // Cast the inputs to the type we will use to unpack them.
11486 MVT UnpackVT =
11487 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11488 V1 = DAG.getBitcast(UnpackVT, V1);
11489 V2 = DAG.getBitcast(UnpackVT, V2);
11490
11491 // Unpack the inputs and cast the result back to the desired type.
11492 return DAG.getBitcast(
11493 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11494 UnpackVT, V1, V2));
11495 };
11496
11497 // We try each unpack from the largest to the smallest to try and find one
11498 // that fits this mask.
11499 int OrigScalarSize = VT.getScalarSizeInBits();
11500 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11501 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11502 return Unpack;
11503
11504 // If we're shuffling with a zero vector then we're better off not doing
11505 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11506 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11507 ISD::isBuildVectorAllZeros(V2.getNode()))
11508 return SDValue();
11509
11510 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11511 // initial unpack.
11512 if (NumLoInputs == 0 || NumHiInputs == 0) {
11513 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11514 "We have to have *some* inputs!");
11515 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11516
11517 // FIXME: We could consider the total complexity of the permute of each
11518 // possible unpacking. Or at the least we should consider how many
11519 // half-crossings are created.
11520 // FIXME: We could consider commuting the unpacks.
11521
11522 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11523 for (int i = 0; i < Size; ++i) {
11524 if (Mask[i] < 0)
11525 continue;
11526
11527 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11528
11529 PermMask[i] =
11530 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11531 }
11532 return DAG.getVectorShuffle(
11533 VT, DL,
11534 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11535 V1, V2),
11536 DAG.getUNDEF(VT), PermMask);
11537 }
11538
11539 return SDValue();
11540}
11541
11542/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11543/// permuting the elements of the result in place.
11544static SDValue lowerShuffleAsByteRotateAndPermute(
11545 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11546 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11547 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11548 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11549 (VT.is512BitVector() && !Subtarget.hasBWI()))
11550 return SDValue();
11551
11552 // We don't currently support lane crossing permutes.
11553 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11554 return SDValue();
11555
11556 int Scale = VT.getScalarSizeInBits() / 8;
11557 int NumLanes = VT.getSizeInBits() / 128;
11558 int NumElts = VT.getVectorNumElements();
11559 int NumEltsPerLane = NumElts / NumLanes;
11560
11561 // Determine range of mask elts.
11562 bool Blend1 = true;
11563 bool Blend2 = true;
11564 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11565 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11566 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11567 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11568 int M = Mask[Lane + Elt];
11569 if (M < 0)
11570 continue;
11571 if (M < NumElts) {
11572 Blend1 &= (M == (Lane + Elt));
11573 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11574 M = M % NumEltsPerLane;
11575 Range1.first = std::min(Range1.first, M);
11576 Range1.second = std::max(Range1.second, M);
11577 } else {
11578 M -= NumElts;
11579 Blend2 &= (M == (Lane + Elt));
11580 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11581 M = M % NumEltsPerLane;
11582 Range2.first = std::min(Range2.first, M);
11583 Range2.second = std::max(Range2.second, M);
11584 }
11585 }
11586 }
11587
11588 // Bail if we don't need both elements.
11589 // TODO - it might be worth doing this for unary shuffles if the permute
11590 // can be widened.
11591 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11592 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11593 return SDValue();
11594
11595 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11596 return SDValue();
11597
11598 // Rotate the 2 ops so we can access both ranges, then permute the result.
11599 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11600 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11601 SDValue Rotate = DAG.getBitcast(
11602 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11603 DAG.getBitcast(ByteVT, Lo),
11604 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11605 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11606 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11607 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11608 int M = Mask[Lane + Elt];
11609 if (M < 0)
11610 continue;
11611 if (M < NumElts)
11612 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11613 else
11614 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11615 }
11616 }
11617 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11618 };
11619
11620 // Check if the ranges are small enough to rotate from either direction.
11621 if (Range2.second < Range1.first)
11622 return RotateAndPermute(V1, V2, Range1.first, 0);
11623 if (Range1.second < Range2.first)
11624 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11625 return SDValue();
11626}
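// Illustrative sketch (not part of X86ISelLowering.cpp): value-level PALIGNR
// on one 128-bit lane, i.e. the semantics of _mm_alignr_epi8(Hi, Lo, Amt).
// The rotate above pulls both requested element ranges into one register and
// the follow-up permute puts them back where the original mask wanted them.
#include <array>
#include <cstdint>

static std::array<uint8_t, 16> palignr(const std::array<uint8_t, 16> &Hi,
                                       const std::array<uint8_t, 16> &Lo,
                                       unsigned Amt) {
  std::array<uint8_t, 16> R{};
  for (unsigned i = 0; i != 16; ++i) {
    unsigned Src = i + Amt; // byte index into the 32-byte concatenation Hi:Lo
    R[i] = Src < 16 ? Lo[Src] : (Src < 32 ? Hi[Src - 16] : 0);
  }
  return R;
}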
11627
11628static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11629 return isUndefOrEqual(Mask, 0);
11630}
11631
11632static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11633 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11634}
11635
11636/// Check if the Mask consists of the same element repeated multiple times.
11637static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11638 size_t NumUndefs = 0;
11639 std::optional<int> UniqueElt;
11640 for (int Elt : Mask) {
11641 if (Elt == SM_SentinelUndef) {
11642 NumUndefs++;
11643 continue;
11644 }
11645 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11646 return false;
11647 UniqueElt = Elt;
11648 }
11649 // Make sure the element is repeated enough times by checking the number of
11650 // undefs is small.
11651 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11652}
11653
11654/// Generic routine to decompose a shuffle and blend into independent
11655/// blends and permutes.
11656///
11657/// This matches the extremely common pattern for handling combined
11658/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11659/// operations. It will try to pick the best arrangement of shuffles and
11660/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11661static SDValue lowerShuffleAsDecomposedShuffleMerge(
11662 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11663 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11664 int NumElts = Mask.size();
11665 int NumLanes = VT.getSizeInBits() / 128;
11666 int NumEltsPerLane = NumElts / NumLanes;
11667
11668 // Shuffle the input elements into the desired positions in V1 and V2 and
11669 // unpack/blend them together.
11670 bool IsAlternating = true;
11671 bool V1Zero = true, V2Zero = true;
11672 SmallVector<int, 32> V1Mask(NumElts, -1);
11673 SmallVector<int, 32> V2Mask(NumElts, -1);
11674 SmallVector<int, 32> FinalMask(NumElts, -1);
11675 for (int i = 0; i < NumElts; ++i) {
11676 int M = Mask[i];
11677 if (M >= 0 && M < NumElts) {
11678 V1Mask[i] = M;
11679 FinalMask[i] = i;
11680 V1Zero &= Zeroable[i];
11681 IsAlternating &= (i & 1) == 0;
11682 } else if (M >= NumElts) {
11683 V2Mask[i] = M - NumElts;
11684 FinalMask[i] = i + NumElts;
11685 V2Zero &= Zeroable[i];
11686 IsAlternating &= (i & 1) == 1;
11687 }
11688 }
11689
11690 // If we effectively demand only the 0'th element of \p Input, though not
11691 // necessarily only in the 0'th position, then broadcast said input
11692 // and change \p InputMask to be a no-op (identity) mask.
11693 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11694 &DAG](SDValue &Input,
11695 MutableArrayRef<int> InputMask) {
11696 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11697 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11698 !X86::mayFoldLoad(Input, Subtarget)))
11699 return;
11700 if (isNoopShuffleMask(InputMask))
11701 return;
11702 assert(isBroadcastShuffleMask(InputMask) &&
11703 "Expected to demand only the 0'th element.");
11704 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11705 for (auto I : enumerate(InputMask)) {
11706 int &InputMaskElt = I.value();
11707 if (InputMaskElt >= 0)
11708 InputMaskElt = I.index();
11709 }
11710 };
11711
11712 // Currently, we may need to produce one shuffle per input, and blend results.
11713 // It is possible that the shuffle for one of the inputs is already a no-op.
11714 // See if we can simplify non-no-op shuffles into broadcasts,
11715 // which we consider to be strictly better than an arbitrary shuffle.
11716 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11717 isNoopOrBroadcastShuffleMask(V2Mask)) {
11718 canonicalizeBroadcastableInput(V1, V1Mask);
11719 canonicalizeBroadcastableInput(V2, V2Mask);
11720 }
11721
11722 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11723 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11724 // the shuffle may be able to fold with a load or other benefit. However, when
11725 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11726 // pre-shuffle first is a better strategy.
11727 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11728 // Only prefer immediate blends to unpack/rotate.
11729 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11730 DAG, true))
11731 return BlendPerm;
11732 // If either input vector provides only a single element which is repeated
11733 // multiple times, unpacking from both input vectors would generate worse
11734 // code. e.g. for
11735 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11736 // it is better to process t4 first to create a vector of t4[0], then unpack
11737 // that vector with t2.
11738 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11739 !isSingleElementRepeatedMask(V2Mask))
11739 !isSingleElementRepeatedMask(V2Mask))
11740 if (SDValue UnpackPerm =
11741 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11742 return UnpackPerm;
11743 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11744 DL, VT, V1, V2, Mask, Subtarget, DAG))
11745 return RotatePerm;
11746 // Unpack/rotate failed - try again with variable blends.
11747 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11748 DAG))
11749 return BlendPerm;
11750 if (VT.getScalarSizeInBits() >= 32)
11751 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11752 DL, VT, V1, V2, Mask, Subtarget, DAG))
11753 return PermUnpack;
11754 }
11755
11756 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11757 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11758 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11759 // than half the elements coming from each source.
11760 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11761 V1Mask.assign(NumElts, -1);
11762 V2Mask.assign(NumElts, -1);
11763 FinalMask.assign(NumElts, -1);
11764 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11765 for (int j = 0; j != NumEltsPerLane; ++j) {
11766 int M = Mask[i + j];
11767 if (M >= 0 && M < NumElts) {
11768 V1Mask[i + (j / 2)] = M;
11769 FinalMask[i + j] = i + (j / 2);
11770 } else if (M >= NumElts) {
11771 V2Mask[i + (j / 2)] = M - NumElts;
11772 FinalMask[i + j] = i + (j / 2) + NumElts;
11773 }
11774 }
11775 }
11776
11777 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11778 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11779 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11780}
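// For illustration (not exhaustive): a v4i32 shuffle with Mask = <2,1,7,4>
// is decomposed above into V1Mask = <2,1,-1,-1>, V2Mask = <-1,-1,3,0> and
// FinalMask = <0,1,6,7>, i.e. each input is pre-shuffled independently and
// the two results are then blended element by element.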
11781
11782static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11783 const X86Subtarget &Subtarget,
11784 ArrayRef<int> Mask) {
11785 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11786 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11787
11788 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11789 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11790 int MaxSubElts = 64 / EltSizeInBits;
11791 unsigned RotateAmt, NumSubElts;
11792 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11793 MaxSubElts, NumSubElts, RotateAmt))
11794 return -1;
11795 unsigned NumElts = Mask.size();
11796 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11797 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11798 return RotateAmt;
11799}
11800
11801/// Lower shuffle using X86ISD::VROTLI rotations.
11802static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11803 ArrayRef<int> Mask,
11804 const X86Subtarget &Subtarget,
11805 SelectionDAG &DAG) {
11806 // Only XOP + AVX512 targets have bit rotation instructions.
11807 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11808 bool IsLegal =
11809 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11810 if (!IsLegal && Subtarget.hasSSE3())
11811 return SDValue();
11812
11813 MVT RotateVT;
11814 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11815 Subtarget, Mask);
11816 if (RotateAmt < 0)
11817 return SDValue();
11818
11819 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11820 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11821 // widen to vXi16 or more then the existing lowering will be better.
11822 if (!IsLegal) {
11823 if ((RotateAmt % 16) == 0)
11824 return SDValue();
11825 // TODO: Use getTargetVShiftByConstNode.
11826 unsigned ShlAmt = RotateAmt;
11827 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11828 V1 = DAG.getBitcast(RotateVT, V1);
11829 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11830 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11831 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11832 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11833 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11834 return DAG.getBitcast(VT, Rot);
11835 }
11836
11837 SDValue Rot =
11838 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11839 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11840 return DAG.getBitcast(VT, Rot);
11841}
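// Illustrative example: a v16i8 mask that swaps the bytes of every i16 pair,
// <1,0,3,2,...,15,14>, matches with NumSubElts == 2 and RotateAmt == 8,
// giving RotateVT == v8i16. With XOP this becomes a single v8i16 VROTLI by 8;
// on an SSE2-only target the fallback above emits
// OR(VSHLI(v8i16, 8), VSRLI(v8i16, 8)) instead.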
11842
11843/// Try to match a vector shuffle as an element rotation.
11844///
11845/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11846static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11847 ArrayRef<int> Mask) {
11848 int NumElts = Mask.size();
11849
11850 // We need to detect various ways of spelling a rotation:
11851 // [11, 12, 13, 14, 15, 0, 1, 2]
11852 // [-1, 12, 13, 14, -1, -1, 1, -1]
11853 // [-1, -1, -1, -1, -1, -1, 1, 2]
11854 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11855 // [-1, 4, 5, 6, -1, -1, 9, -1]
11856 // [-1, 4, 5, 6, -1, -1, -1, -1]
11857 int Rotation = 0;
11858 SDValue Lo, Hi;
11859 for (int i = 0; i < NumElts; ++i) {
11860 int M = Mask[i];
11861 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11862 "Unexpected mask index.");
11863 if (M < 0)
11864 continue;
11865
11866 // Determine where a rotated vector would have started.
11867 int StartIdx = i - (M % NumElts);
11868 if (StartIdx == 0)
11869 // The identity rotation isn't interesting, stop.
11870 return -1;
11871
11872 // If we found the tail of a vector the rotation must be the missing
11873 // front. If we found the head of a vector, it must be how much of the
11874 // head.
11875 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11876
11877 if (Rotation == 0)
11878 Rotation = CandidateRotation;
11879 else if (Rotation != CandidateRotation)
11880 // The rotations don't match, so we can't match this mask.
11881 return -1;
11882
11883 // Compute which value this mask is pointing at.
11884 SDValue MaskV = M < NumElts ? V1 : V2;
11885
11886 // Compute which of the two target values this index should be assigned
11887 // to. This reflects whether the high elements are remaining or the low
11888 // elements are remaining.
11889 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11890
11891 // Either set up this value if we've not encountered it before, or check
11892 // that it remains consistent.
11893 if (!TargetV)
11894 TargetV = MaskV;
11895 else if (TargetV != MaskV)
11896 // This may be a rotation, but it pulls from the inputs in some
11897 // unsupported interleaving.
11898 return -1;
11899 }
11900
11901 // Check that we successfully analyzed the mask, and normalize the results.
11902 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11903 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11904 if (!Lo)
11905 Lo = Hi;
11906 else if (!Hi)
11907 Hi = Lo;
11908
11909 V1 = Lo;
11910 V2 = Hi;
11911
11912 return Rotation;
11913}
11914
11915/// Try to lower a vector shuffle as a byte rotation.
11916///
11917/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11918/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11919/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11920/// try to generically lower a vector shuffle through such a pattern. It
11921/// does not check for the profitability of lowering either as PALIGNR or
11922/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11923/// This matches shuffle vectors that look like:
11924///
11925/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11926///
11927/// Essentially it concatenates V1 and V2, shifts right by some number of
11928/// elements, and takes the low elements as the result. Note that while this is
11929/// specified as a *right shift* because x86 is little-endian, it is a *left
11930/// rotate* of the vector lanes.
11931static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11932 ArrayRef<int> Mask) {
11933 // Don't accept any shuffles with zero elements.
11934 if (isAnyZero(Mask))
11935 return -1;
11936
11937 // PALIGNR works on 128-bit lanes.
11938 SmallVector<int, 16> RepeatedMask;
11939 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11940 return -1;
11941
11942 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11943 if (Rotation <= 0)
11944 return -1;
11945
11946 // PALIGNR rotates bytes, so we need to scale the
11947 // rotation based on how many bytes are in the vector lane.
11948 int NumElts = RepeatedMask.size();
11949 int Scale = 16 / NumElts;
11950 return Rotation * Scale;
11951}
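// Illustrative example: for the v8i16 mask <11,12,13,14,15,0,1,2> from the
// comment above, matchShuffleAsElementRotate returns a rotation of 3 with
// Lo == V1 and Hi == V2, and matchShuffleAsByteRotate scales that by
// 16 / 8 == 2 to produce a 6-byte rotation.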
11952
11953static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11954 SDValue V2, ArrayRef<int> Mask,
11955 const X86Subtarget &Subtarget,
11956 SelectionDAG &DAG) {
11957 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11958
11959 SDValue Lo = V1, Hi = V2;
11960 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11961 if (ByteRotation <= 0)
11962 return SDValue();
11963
11964 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11965 // PSLLDQ/PSRLDQ.
11966 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11967 Lo = DAG.getBitcast(ByteVT, Lo);
11968 Hi = DAG.getBitcast(ByteVT, Hi);
11969
11970 // SSSE3 targets can use the palignr instruction.
11971 if (Subtarget.hasSSSE3()) {
11972 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11973 "512-bit PALIGNR requires BWI instructions");
11974 return DAG.getBitcast(
11975 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11976 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11977 }
11978
11979 assert(VT.is128BitVector() &&
11980 "Rotate-based lowering only supports 128-bit lowering!");
11981 assert(Mask.size() <= 16 &&
11982 "Can shuffle at most 16 bytes in a 128-bit vector!");
11983 assert(ByteVT == MVT::v16i8 &&
11984 "SSE2 rotate lowering only needed for v16i8!");
11985
11986 // Default SSE2 implementation
11987 int LoByteShift = 16 - ByteRotation;
11988 int HiByteShift = ByteRotation;
11989
11990 SDValue LoShift =
11991 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11992 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11993 SDValue HiShift =
11994 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11995 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11996 return DAG.getBitcast(VT,
11997 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11998}
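// For illustration: with ByteRotation == 6 the SSE2 fallback shifts Lo left
// by 16 - 6 == 10 bytes (VSHLDQ) and Hi right by 6 bytes (VSRLDQ); OR-ing
// the two yields the same bytes as PALIGNR with an immediate of 6.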
11999
12000/// Try to lower a vector shuffle as a dword/qword rotation.
12001///
12002/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12003/// rotation of the concatenation of two vectors; this routine will
12004/// try to generically lower a vector shuffle through such a pattern.
12005///
12006/// Essentially it concatenates V1 and V2, shifts right by some number of
12007/// elements, and takes the low elements as the result. Note that while this is
12008/// specified as a *right shift* because x86 is little-endian, it is a *left
12009/// rotate* of the vector lanes.
12010static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12011 SDValue V2, ArrayRef<int> Mask,
12012 const APInt &Zeroable,
12013 const X86Subtarget &Subtarget,
12014 SelectionDAG &DAG) {
12015 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12016 "Only 32-bit and 64-bit elements are supported!");
12017
12018 // 128/256-bit vectors are only supported with VLX.
12019 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12020 && "VLX required for 128/256-bit vectors");
12021
12022 SDValue Lo = V1, Hi = V2;
12023 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12024 if (0 < Rotation)
12025 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12026 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12027
12028 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12029 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12030 // TODO: We can probably make this more aggressive and use shift-pairs like
12031 // lowerShuffleAsByteShiftMask.
12032 unsigned NumElts = Mask.size();
12033 unsigned ZeroLo = Zeroable.countr_one();
12034 unsigned ZeroHi = Zeroable.countl_one();
12035 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12036 if (!ZeroLo && !ZeroHi)
12037 return SDValue();
12038
12039 if (ZeroLo) {
12040 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12041 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12042 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12043 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12044 getZeroVector(VT, Subtarget, DAG, DL),
12045 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12046 }
12047
12048 if (ZeroHi) {
12049 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12050 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12051 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12052 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12053 getZeroVector(VT, Subtarget, DAG, DL), Src,
12054 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12055 }
12056
12057 return SDValue();
12058}
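// For illustration: given a v8i32 shuffle (with VLX) whose two highest
// elements must be zero (ZeroHi == 2) and whose remaining mask is sequential
// from V1 starting at element 2, the ZeroHi branch above emits
// VALIGN(zero, V1) with an immediate of ZeroHi == 2.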
12059
12060/// Try to lower a vector shuffle as a byte shift sequence.
12061static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12062 SDValue V2, ArrayRef<int> Mask,
12063 const APInt &Zeroable,
12064 const X86Subtarget &Subtarget,
12065 SelectionDAG &DAG) {
12066 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12067 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12068
12069 // We need a shuffle that has zeros at one/both ends and a sequential
12070 // shuffle from one source within.
12071 unsigned ZeroLo = Zeroable.countr_one();
12072 unsigned ZeroHi = Zeroable.countl_one();
12073 if (!ZeroLo && !ZeroHi)
12074 return SDValue();
12075
12076 unsigned NumElts = Mask.size();
12077 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12078 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12079 return SDValue();
12080
12081 unsigned Scale = VT.getScalarSizeInBits() / 8;
12082 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12083 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12084 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12085 return SDValue();
12086
12087 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12088 Res = DAG.getBitcast(MVT::v16i8, Res);
12089
12090 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12091 // inner sequential set of elements, possibly offset:
12092 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12093 // 01234567 --> 4567zzzz --> zzzzz456
12094 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12095 if (ZeroLo == 0) {
12096 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12097 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12098 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12099 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12100 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12101 } else if (ZeroHi == 0) {
12102 unsigned Shift = Mask[ZeroLo] % NumElts;
12103 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12104 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12105 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12106 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12107 } else if (!Subtarget.hasSSSE3()) {
12108 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12109 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12110 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12111 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12112 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12113 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12114 Shift += Mask[ZeroLo] % NumElts;
12115 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12116 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12117 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12118 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12119 } else
12120 return SDValue();
12121
12122 return DAG.getBitcast(VT, Res);
12123}
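// Illustrative walk-through of the last pattern in the comment above
// (zz3456zz: NumElts == 8, ZeroLo == ZeroHi == 2, Len == 4): the pre-SSSE3
// branch emits VSHLDQ by 1 * Scale, VSRLDQ by 4 * Scale and VSHLDQ by
// 2 * Scale, matching the z0123456 -> 3456zzzz -> zz3456zz steps.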
12124
12125/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12126///
12127/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12128/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12129/// matches elements from one of the input vectors shuffled to the left or
12130/// right with zeroable elements 'shifted in'. It handles both the strictly
12131/// bit-wise element shifts and the byte shift across an entire 128-bit double
12132/// quad word lane.
12133///
12134/// PSHL : (little-endian) left bit shift.
12135/// [ zz, 0, zz, 2 ]
12136/// [ -1, 4, zz, -1 ]
12137/// PSRL : (little-endian) right bit shift.
12138/// [ 1, zz, 3, zz]
12139/// [ -1, -1, 7, zz]
12140/// PSLLDQ : (little-endian) left byte shift
12141/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12142/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12143/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12144/// PSRLDQ : (little-endian) right byte shift
12145/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12146/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12147/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12148static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12149 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12150 int MaskOffset, const APInt &Zeroable,
12151 const X86Subtarget &Subtarget) {
12152 int Size = Mask.size();
12153 unsigned SizeInBits = Size * ScalarSizeInBits;
12154
12155 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12156 for (int i = 0; i < Size; i += Scale)
12157 for (int j = 0; j < Shift; ++j)
12158 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12159 return false;
12160
12161 return true;
12162 };
12163
12164 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12165 for (int i = 0; i != Size; i += Scale) {
12166 unsigned Pos = Left ? i + Shift : i;
12167 unsigned Low = Left ? i : i + Shift;
12168 unsigned Len = Scale - Shift;
12169 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12170 return -1;
12171 }
12172
12173 int ShiftEltBits = ScalarSizeInBits * Scale;
12174 bool ByteShift = ShiftEltBits > 64;
12175 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12176 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12177 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12178
12179 // Normalize the scale for byte shifts to still produce an i64 element
12180 // type.
12181 Scale = ByteShift ? Scale / 2 : Scale;
12182
12183 // We need to round trip through the appropriate type for the shift.
12184 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12185 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12186 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12187 return (int)ShiftAmt;
12188 };
12189
12190 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12191 // keep doubling the size of the integer elements up to that. We can
12192 // then shift the elements of the integer vector by whole multiples of
12193 // their width within the elements of the larger integer vector. Test each
12194 // multiple to see if we can find a match with the moved element indices
12195 // and that the shifted in elements are all zeroable.
12196 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12197 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12198 for (int Shift = 1; Shift != Scale; ++Shift)
12199 for (bool Left : {true, false})
12200 if (CheckZeros(Shift, Scale, Left)) {
12201 int ShiftAmt = MatchShift(Shift, Scale, Left);
12202 if (0 < ShiftAmt)
12203 return ShiftAmt;
12204 }
12205
12206 // no match
12207 return -1;
12208}
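// Illustrative example: a v8i16 mask <zz,0,zz,2,zz,4,zz,6> (all odd result
// elements zeroable) matches with Scale == 2, Shift == 1 and Left == true,
// so Opcode == X86ISD::VSHLI, ShiftVT == v4i32 and ShiftAmt == 16.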
12209
12210static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12211 SDValue V2, ArrayRef<int> Mask,
12212 const APInt &Zeroable,
12213 const X86Subtarget &Subtarget,
12214 SelectionDAG &DAG, bool BitwiseOnly) {
12215 int Size = Mask.size();
12216 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12217
12218 MVT ShiftVT;
12219 SDValue V = V1;
12220 unsigned Opcode;
12221
12222 // Try to match shuffle against V1 shift.
12223 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12224 Mask, 0, Zeroable, Subtarget);
12225
12226 // If V1 failed, try to match shuffle against V2 shift.
12227 if (ShiftAmt < 0) {
12228 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12229 Mask, Size, Zeroable, Subtarget);
12230 V = V2;
12231 }
12232
12233 if (ShiftAmt < 0)
12234 return SDValue();
12235
12236 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12237 return SDValue();
12238
12239 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12240 "Illegal integer vector type");
12241 V = DAG.getBitcast(ShiftVT, V);
12242 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12243 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12244 return DAG.getBitcast(VT, V);
12245}
12246
12247// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12248// Remainder of lower half result is zero and upper half is all undef.
12249static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12250 ArrayRef<int> Mask, uint64_t &BitLen,
12251 uint64_t &BitIdx, const APInt &Zeroable) {
12252 int Size = Mask.size();
12253 int HalfSize = Size / 2;
12254 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12255 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12256
12257 // Upper half must be undefined.
12258 if (!isUndefUpperHalf(Mask))
12259 return false;
12260
12261 // Determine the extraction length from the part of the
12262 // lower half that isn't zeroable.
12263 int Len = HalfSize;
12264 for (; Len > 0; --Len)
12265 if (!Zeroable[Len - 1])
12266 break;
12267 assert(Len > 0 && "Zeroable shuffle mask");
12268
12269 // Attempt to match first Len sequential elements from the lower half.
12270 SDValue Src;
12271 int Idx = -1;
12272 for (int i = 0; i != Len; ++i) {
12273 int M = Mask[i];
12274 if (M == SM_SentinelUndef)
12275 continue;
12276 SDValue &V = (M < Size ? V1 : V2);
12277 M = M % Size;
12278
12279 // The extracted elements must start at a valid index and all mask
12280 // elements must be in the lower half.
12281 if (i > M || M >= HalfSize)
12282 return false;
12283
12284 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12285 Src = V;
12286 Idx = M - i;
12287 continue;
12288 }
12289 return false;
12290 }
12291
12292 if (!Src || Idx < 0)
12293 return false;
12294
12295 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12296 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12297 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12298 V1 = Src;
12299 return true;
12300}
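// Illustrative example: a v8i16 mask whose upper half is undef, whose
// element 3 is zeroable and whose first three elements are <1,2,3> matches
// with Len == 3 and Idx == 1, giving BitLen == 48 and BitIdx == 16 for the
// resulting EXTRQI node.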
12301
12302// INSERTQ: Extract lowest Len elements from lower half of second source and
12303// insert over first source, starting at Idx.
12304// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12305static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12306 ArrayRef<int> Mask, uint64_t &BitLen,
12307 uint64_t &BitIdx) {
12308 int Size = Mask.size();
12309 int HalfSize = Size / 2;
12310 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12311
12312 // Upper half must be undefined.
12313 if (!isUndefUpperHalf(Mask))
12314 return false;
12315
12316 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12317 SDValue Base;
12318
12319 // Attempt to match first source from mask before insertion point.
12320 if (isUndefInRange(Mask, 0, Idx)) {
12321 /* EMPTY */
12322 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12323 Base = V1;
12324 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12325 Base = V2;
12326 } else {
12327 continue;
12328 }
12329
12330 // Extend the extraction length looking to match both the insertion of
12331 // the second source and the remaining elements of the first.
12332 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12333 SDValue Insert;
12334 int Len = Hi - Idx;
12335
12336 // Match insertion.
12337 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12338 Insert = V1;
12339 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12340 Insert = V2;
12341 } else {
12342 continue;
12343 }
12344
12345 // Match the remaining elements of the lower half.
12346 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12347 /* EMPTY */
12348 } else if ((!Base || (Base == V1)) &&
12349 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12350 Base = V1;
12351 } else if ((!Base || (Base == V2)) &&
12352 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12353 Size + Hi)) {
12354 Base = V2;
12355 } else {
12356 continue;
12357 }
12358
12359 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12360 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12361 V1 = Base;
12362 V2 = Insert;
12363 return true;
12364 }
12365 }
12366
12367 return false;
12368}
12369
12370/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12371static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12372 SDValue V2, ArrayRef<int> Mask,
12373 const APInt &Zeroable, SelectionDAG &DAG) {
12374 uint64_t BitLen, BitIdx;
12375 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12376 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12377 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12378 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12379
12380 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12381 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12382 V2 ? V2 : DAG.getUNDEF(VT),
12383 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12384 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12385
12386 return SDValue();
12387}
12388
12389/// Lower a vector shuffle as an any/signed/zero extension.
12390///
12391/// Given a specific number of elements, element bit width, and extension
12392/// stride, produce an extension based on the available
12393/// features of the subtarget. The extended elements are consecutive and
12394/// can start at an offset element index in the input; to
12395/// avoid excess shuffling the offset must either be in the bottom lane
12396/// or at the start of a higher lane. All extended elements must be from
12397/// the same lane.
12398static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12399 int Scale, int Offset,
12400 unsigned ExtOpc, SDValue InputV,
12401 ArrayRef<int> Mask,
12402 const X86Subtarget &Subtarget,
12403 SelectionDAG &DAG) {
12404 assert(Scale > 1 && "Need a scale to extend.");
12405 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12406 int EltBits = VT.getScalarSizeInBits();
12407 int NumElements = VT.getVectorNumElements();
12408 int NumEltsPerLane = 128 / EltBits;
12409 int OffsetLane = Offset / NumEltsPerLane;
12410 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12411 "Only 8, 16, and 32 bit elements can be extended.");
12412 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12413 assert(0 <= Offset && "Extension offset must be positive.");
12414 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12415 "Extension offset must be in the first lane or start an upper lane.");
12416
12417 // Check that an index is in same lane as the base offset.
12418 auto SafeOffset = [&](int Idx) {
12419 return OffsetLane == (Idx / NumEltsPerLane);
12420 };
12421
12422 // Shift along an input so that the offset base moves to the first element.
12423 auto ShuffleOffset = [&](SDValue V) {
12424 if (!Offset)
12425 return V;
12426
12427 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12428 for (int i = 0; i * Scale < NumElements; ++i) {
12429 int SrcIdx = i + Offset;
12430 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12431 }
12432 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12433 };
12434
12435 // Found a valid a/zext mask! Try various lowering strategies based on the
12436 // input type and available ISA extensions.
12437 if (Subtarget.hasSSE41()) {
12438 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12439 // PUNPCK will catch this in a later shuffle match.
12440 if (Offset && Scale == 2 && VT.is128BitVector())
12441 return SDValue();
12442 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12443 NumElements / Scale);
12444 InputV = DAG.getBitcast(VT, InputV);
12445 InputV = ShuffleOffset(InputV);
12446 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12447 return DAG.getBitcast(VT, InputV);
12448 }
12449
12450 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12451 InputV = DAG.getBitcast(VT, InputV);
12452 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12453
12454 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12455 if (ExtOpc == ISD::SIGN_EXTEND)
12456 return SDValue();
12457
12458 // For any extends we can cheat for larger element sizes and use shuffle
12459 // instructions that can fold with a load and/or copy.
12460 if (AnyExt && EltBits == 32) {
12461 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12462 -1};
12463 return DAG.getBitcast(
12464 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12465 DAG.getBitcast(MVT::v4i32, InputV),
12466 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12467 }
12468 if (AnyExt && EltBits == 16 && Scale > 2) {
12469 int PSHUFDMask[4] = {Offset / 2, -1,
12470 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12471 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12472 DAG.getBitcast(MVT::v4i32, InputV),
12473 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12474 int PSHUFWMask[4] = {1, -1, -1, -1};
12475 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12476 return DAG.getBitcast(
12477 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12478 DAG.getBitcast(MVT::v8i16, InputV),
12479 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12480 }
12481
12482 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12483 // to 64-bits.
12484 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12485 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12486 assert(VT.is128BitVector() && "Unexpected vector width!");
12487
12488 int LoIdx = Offset * EltBits;
12489 SDValue Lo = DAG.getBitcast(
12490 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12491 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12492 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12493
12494 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12495 return DAG.getBitcast(VT, Lo);
12496
12497 int HiIdx = (Offset + 1) * EltBits;
12498 SDValue Hi = DAG.getBitcast(
12499 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12500 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12501 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12502 return DAG.getBitcast(VT,
12503 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12504 }
12505
12506 // If this would require more than 2 unpack instructions to expand, use
12507 // pshufb when available. We can only use more than 2 unpack instructions
12508 // when zero extending i8 elements which also makes it easier to use pshufb.
12509 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12510 assert(NumElements == 16 && "Unexpected byte vector width!");
12511 SDValue PSHUFBMask[16];
12512 for (int i = 0; i < 16; ++i) {
12513 int Idx = Offset + (i / Scale);
12514 if ((i % Scale == 0 && SafeOffset(Idx))) {
12515 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12516 continue;
12517 }
12518 PSHUFBMask[i] =
12519 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12520 }
12521 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12522 return DAG.getBitcast(
12523 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12524 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12525 }
12526
12527 // If we are extending from an offset, ensure we start on a boundary that
12528 // we can unpack from.
12529 int AlignToUnpack = Offset % (NumElements / Scale);
12530 if (AlignToUnpack) {
12531 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12532 for (int i = AlignToUnpack; i < NumElements; ++i)
12533 ShMask[i - AlignToUnpack] = i;
12534 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12535 Offset -= AlignToUnpack;
12536 }
12537
12538 // Otherwise emit a sequence of unpacks.
12539 do {
12540 unsigned UnpackLoHi = X86ISD::UNPCKL;
12541 if (Offset >= (NumElements / 2)) {
12542 UnpackLoHi = X86ISD::UNPCKH;
12543 Offset -= (NumElements / 2);
12544 }
12545
12546 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12547 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12548 : getZeroVector(InputVT, Subtarget, DAG, DL);
12549 InputV = DAG.getBitcast(InputVT, InputV);
12550 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12551 Scale /= 2;
12552 EltBits *= 2;
12553 NumElements /= 2;
12554 } while (Scale > 1);
12555 return DAG.getBitcast(VT, InputV);
12556}
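// For illustration: zero-extending v16i8 elements to i32 (Scale == 4) on a
// plain SSE2 target takes two rounds of the unpack loop above - PUNPCKLBW
// with a zero vector followed by PUNPCKLWD with a zero vector - while
// Scale == 8 (i8 to i64) would take three.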
12557
12558/// Try to lower a vector shuffle as a zero extension on any microarch.
12559///
12560/// This routine will try to do everything in its power to cleverly lower
12561/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12562/// check for the profitability of this lowering, it tries to aggressively
12563/// match this pattern. It will use all of the micro-architectural details it
12564/// can to emit an efficient lowering. It handles both blends with all-zero
12565/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12566/// masking out later).
12567///
12568/// The reason we have dedicated lowering for zext-style shuffles is that they
12569/// are both incredibly common and often quite performance sensitive.
12570static SDValue lowerShuffleAsZeroOrAnyExtend(
12571 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12572 const APInt &Zeroable, const X86Subtarget &Subtarget,
12573 SelectionDAG &DAG) {
12574 int Bits = VT.getSizeInBits();
12575 int NumLanes = Bits / 128;
12576 int NumElements = VT.getVectorNumElements();
12577 int NumEltsPerLane = NumElements / NumLanes;
12578 assert(VT.getScalarSizeInBits() <= 32 &&
12579 "Exceeds 32-bit integer zero extension limit");
12580 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12581
12582 // Define a helper function to check a particular ext-scale and lower to it if
12583 // valid.
12584 auto Lower = [&](int Scale) -> SDValue {
12585 SDValue InputV;
12586 bool AnyExt = true;
12587 int Offset = 0;
12588 int Matches = 0;
12589 for (int i = 0; i < NumElements; ++i) {
12590 int M = Mask[i];
12591 if (M < 0)
12592 continue; // Valid anywhere but doesn't tell us anything.
12593 if (i % Scale != 0) {
12594 // Each of the extended elements needs to be zeroable.
12595 if (!Zeroable[i])
12596 return SDValue();
12597
12598 // We are no longer in the anyext case.
12599 AnyExt = false;
12600 continue;
12601 }
12602
12603 // Each of the base elements needs to be consecutive indices into the
12604 // same input vector.
12605 SDValue V = M < NumElements ? V1 : V2;
12606 M = M % NumElements;
12607 if (!InputV) {
12608 InputV = V;
12609 Offset = M - (i / Scale);
12610 } else if (InputV != V)
12611 return SDValue(); // Flip-flopping inputs.
12612
12613 // Offset must start in the lowest 128-bit lane or at the start of an
12614 // upper lane.
12615 // FIXME: Is it ever worth allowing a negative base offset?
12616 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12617 (Offset % NumEltsPerLane) == 0))
12618 return SDValue();
12619
12620 // If we are offsetting, all referenced entries must come from the same
12621 // lane.
12622 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12623 return SDValue();
12624
12625 if ((M % NumElements) != (Offset + (i / Scale)))
12626 return SDValue(); // Non-consecutive strided elements.
12627 Matches++;
12628 }
12629
12630 // If we fail to find an input, we have a zero-shuffle which should always
12631 // have already been handled.
12632 // FIXME: Maybe handle this here in case during blending we end up with one?
12633 if (!InputV)
12634 return SDValue();
12635
12636 // If we are offsetting, don't extend if we only match a single input, we
12637 // can always do better by using a basic PSHUF or PUNPCK.
12638 if (Offset != 0 && Matches < 2)
12639 return SDValue();
12640
12641 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12642 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12643 InputV, Mask, Subtarget, DAG);
12644 };
12645
12646 // The widest scale possible for extending is to a 64-bit integer.
12647 assert(Bits % 64 == 0 &&
12648 "The number of bits in a vector must be divisible by 64 on x86!");
12649 int NumExtElements = Bits / 64;
12650
12651 // Each iteration, try extending the elements half as much, but into twice as
12652 // many elements.
12653 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12654 assert(NumElements % NumExtElements == 0 &&
12655 "The input vector size must be divisible by the extended size.");
12656 if (SDValue V = Lower(NumElements / NumExtElements))
12657 return V;
12658 }
12659
12660 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12661 if (Bits != 128)
12662 return SDValue();
12663
12664 // Returns one of the source operands if the shuffle can be reduced to a
12665 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12666 auto CanZExtLowHalf = [&]() {
12667 for (int i = NumElements / 2; i != NumElements; ++i)
12668 if (!Zeroable[i])
12669 return SDValue();
12670 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12671 return V1;
12672 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12673 return V2;
12674 return SDValue();
12675 };
12676
12677 if (SDValue V = CanZExtLowHalf()) {
12678 V = DAG.getBitcast(MVT::v2i64, V);
12679 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12680 return DAG.getBitcast(VT, V);
12681 }
12682
12683 // No viable ext lowering found.
12684 return SDValue();
12685}
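// For illustration: a v8i16 shuffle whose even result elements are V1[0..3]
// in order and whose odd result elements are zeroable is matched at
// Scale == 2 with Offset == 0 and handed to lowerShuffleAsSpecificExtension;
// on SSE4.1 that becomes an extend_vector_inreg of V1 from v8i16 to v4i32
// (a zero extension when the odd elements are known zero rather than undef).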
12686
12687/// Try to get a scalar value for a specific element of a vector.
12688///
12689/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12690static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12691 SelectionDAG &DAG) {
12692 MVT VT = V.getSimpleValueType();
12693 MVT EltVT = VT.getVectorElementType();
12694 V = peekThroughBitcasts(V);
12695
12696 // If the bitcasts shift the element size, we can't extract an equivalent
12697 // element from it.
12698 MVT NewVT = V.getSimpleValueType();
12699 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12700 return SDValue();
12701
12702 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12703 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12704 // Ensure the scalar operand is the same size as the destination.
12705 // FIXME: Add support for scalar truncation where possible.
12706 SDValue S = V.getOperand(Idx);
12707 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12708 return DAG.getBitcast(EltVT, S);
12709 }
12710
12711 return SDValue();
12712}
12713
12714/// Helper to test for a load that can be folded with x86 shuffles.
12715///
12716/// This is particularly important because the set of instructions varies
12717/// significantly based on whether the operand is a load or not.
12718static bool isShuffleFoldableLoad(SDValue V) {
12719 return V.hasOneUse() &&
12720 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12721}
12722
12723template<typename T>
12724static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12725 T EltVT = VT.getScalarType();
12726 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12727 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12728}
12729
12730/// Try to lower insertion of a single element into a zero vector.
12731///
12732/// This is a common pattern for which we have especially efficient lowerings
12733/// across all subtarget feature sets.
12734static SDValue lowerShuffleAsElementInsertion(
12735 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12736 const APInt &Zeroable, const X86Subtarget &Subtarget,
12737 SelectionDAG &DAG) {
12738 MVT ExtVT = VT;
12739 MVT EltVT = VT.getVectorElementType();
12740 unsigned NumElts = VT.getVectorNumElements();
12741 unsigned EltBits = VT.getScalarSizeInBits();
12742
12743 if (isSoftF16(EltVT, Subtarget))
12744 return SDValue();
12745
12746 int V2Index =
12747 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12748 Mask.begin();
12749 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12750 bool IsV1Zeroable = true;
12751 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12752 if (i != V2Index && !Zeroable[i]) {
12753 IsV1Zeroable = false;
12754 break;
12755 }
12756
12757 // Bail if a non-zero V1 isn't used in place.
12758 if (!IsV1Zeroable) {
12759 SmallVector<int, 8> V1Mask(Mask);
12760 V1Mask[V2Index] = -1;
12761 if (!isNoopShuffleMask(V1Mask))
12762 return SDValue();
12763 }
12764
12765 // Check for a single input from a SCALAR_TO_VECTOR node.
12766 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12767 // all the smarts here sunk into that routine. However, the current
12768 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12769 // vector shuffle lowering is dead.
12770 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12771 DAG);
12772 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12773 // We need to zext the scalar if it is smaller than an i32.
12774 V2S = DAG.getBitcast(EltVT, V2S);
12775 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12776 // Using zext to expand a narrow element won't work for non-zero
12777 // insertions. But we can use a masked constant vector if we're
12778 // inserting V2 into the bottom of V1.
12779 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12780 return SDValue();
12781
12782 // Zero-extend directly to i32.
12783 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12784 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12785
12786 // If we're inserting into a constant, mask off the inserted index
12787 // and OR with the zero-extended scalar.
12788 if (!IsV1Zeroable) {
12789 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12790 Bits[V2Index] = APInt::getZero(EltBits);
12791 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12792 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12793 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12794 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12795 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12796 }
12797 }
12798 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12799 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12800 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12801 // Either not inserting from the low element of the input or the input
12802 // element size is too small to use VZEXT_MOVL to clear the high bits.
12803 return SDValue();
12804 }
12805
12806 if (!IsV1Zeroable) {
12807 // If V1 can't be treated as a zero vector we have fewer options to lower
12808 // this. We can't support integer vectors or non-zero targets cheaply.
12809 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12810 if (!VT.isFloatingPoint() || V2Index != 0)
12811 return SDValue();
12812 if (!VT.is128BitVector())
12813 return SDValue();
12814
12815 // Otherwise, use MOVSD, MOVSS or MOVSH.
12816 unsigned MovOpc = 0;
12817 if (EltVT == MVT::f16)
12818 MovOpc = X86ISD::MOVSH;
12819 else if (EltVT == MVT::f32)
12820 MovOpc = X86ISD::MOVSS;
12821 else if (EltVT == MVT::f64)
12822 MovOpc = X86ISD::MOVSD;
12823 else
12824 llvm_unreachable("Unsupported floating point element type to handle!");
12825 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12826 }
12827
12828 // This lowering only works for the low element with floating point vectors.
12829 if (VT.isFloatingPoint() && V2Index != 0)
12830 return SDValue();
12831
12832 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12833 if (ExtVT != VT)
12834 V2 = DAG.getBitcast(VT, V2);
12835
12836 if (V2Index != 0) {
12837 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12838 // the desired position. Otherwise it is more efficient to do a vector
12839 // shift left. We know that we can do a vector shift left because all
12840 // the inputs are zero.
12841 if (VT.isFloatingPoint() || NumElts <= 4) {
12842 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12843 V2Shuffle[V2Index] = 0;
12844 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12845 } else {
12846 V2 = DAG.getBitcast(MVT::v16i8, V2);
12847 V2 = DAG.getNode(
12848 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12849 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12850 V2 = DAG.getBitcast(VT, V2);
12851 }
12852 }
12853 return V2;
12854}
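// For illustration: a v4f32 shuffle with Mask = <4,1,2,3> (insert the low
// element of V2 into lane 0 of a non-zeroable V1) has V2Index == 0, the
// remaining V1 elements are already in place, and the lowering above selects
// MOVSS(V1, V2).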
12855
12856/// Try to lower broadcast of a single - truncated - integer element,
12857/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12858///
12859/// This assumes we have AVX2.
12860static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12861 int BroadcastIdx,
12862 const X86Subtarget &Subtarget,
12863 SelectionDAG &DAG) {
12864 assert(Subtarget.hasAVX2() &&
12865 "We can only lower integer broadcasts with AVX2!");
12866
12867 MVT EltVT = VT.getVectorElementType();
12868 MVT V0VT = V0.getSimpleValueType();
12869
12870 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12871 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12872
12873 MVT V0EltVT = V0VT.getVectorElementType();
12874 if (!V0EltVT.isInteger())
12875 return SDValue();
12876
12877 const unsigned EltSize = EltVT.getSizeInBits();
12878 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12879
12880 // This is only a truncation if the original element type is larger.
12881 if (V0EltSize <= EltSize)
12882 return SDValue();
12883
12884 assert(((V0EltSize % EltSize) == 0) &&
12885 "Scalar type sizes must all be powers of 2 on x86!");
12886
12887 const unsigned V0Opc = V0.getOpcode();
12888 const unsigned Scale = V0EltSize / EltSize;
12889 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12890
12891 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12892 V0Opc != ISD::BUILD_VECTOR)
12893 return SDValue();
12894
12895 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12896
12897 // If we're extracting non-least-significant bits, shift so we can truncate.
12898 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12899 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12900 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12901 if (const int OffsetIdx = BroadcastIdx % Scale)
12902 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12903 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12904
12905 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12906 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12907}
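// For illustration: broadcasting byte 5 of a v4i32 BUILD_VECTOR into a v16i8
// result gives Scale == 4, V0BroadcastIdx == 1 and OffsetIdx == 1, so operand
// 1 of the build vector is shifted right by 8 bits, truncated to i8 and then
// broadcast (VPBROADCASTB on AVX2).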
12908
12909/// Test whether this can be lowered with a single SHUFPS instruction.
12910///
12911/// This is used to disable more specialized lowerings when the shufps lowering
12912/// will happen to be efficient.
12913static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12914 // This routine only handles 128-bit shufps.
12915 assert(Mask.size() == 4 && "Unsupported mask size!");
12916 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12917 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12918 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12919 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12920
12921 // To lower with a single SHUFPS we need to have the low half and high half
12922 // each requiring a single input.
12923 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12924 return false;
12925 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12926 return false;
12927
12928 return true;
12929}
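// Illustrative example: <0,1,6,7> is a single-SHUFPS mask (the low half reads
// only V1 and the high half only V2), while <0,5,2,7> is not, because its low
// half mixes elements from both inputs.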
12930
12931/// Test whether the specified input (0 or 1) is in-place blended by the
12932/// given mask.
12933///
12934/// This returns true if the elements from a particular input are already in the
12935/// slot required by the given mask and require no permutation.
12936static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12937 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12938 int Size = Mask.size();
12939 for (int i = 0; i < Size; ++i)
12940 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12941 return false;
12942
12943 return true;
12944}
12945
12946/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12947/// the given mask.
12948///
12949static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12950 int BroadcastableElement = 0) {
12951 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12952 int Size = Mask.size();
12953 for (int i = 0; i < Size; ++i)
12954 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12955 Mask[i] % Size != BroadcastableElement)
12956 return false;
12957 return true;
12958}
12959
12960/// If we are extracting two 128-bit halves of a vector and shuffling the
12961/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12962/// multi-shuffle lowering.
12963static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12964 SDValue N1, ArrayRef<int> Mask,
12965 SelectionDAG &DAG) {
12966 MVT VT = N0.getSimpleValueType();
12967 assert((VT.is128BitVector() &&
12968 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12969 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12970
12971 // Check that both sources are extracts of the same source vector.
12972 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12973 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12974 N0.getOperand(0) != N1.getOperand(0) ||
12975 !N0.hasOneUse() || !N1.hasOneUse())
12976 return SDValue();
12977
12978 SDValue WideVec = N0.getOperand(0);
12979 MVT WideVT = WideVec.getSimpleValueType();
12980 if (!WideVT.is256BitVector())
12981 return SDValue();
12982
12983 // Match extracts of each half of the wide source vector. Commute the shuffle
12984 // if the extract of the low half is N1.
12985 unsigned NumElts = VT.getVectorNumElements();
12986 SmallVector<int, 4> NewMask(Mask);
12987 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12988 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12989 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12990 ShuffleVectorSDNode::commuteMask(NewMask);
12991 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12992 return SDValue();
12993
12994 // Final bailout: if the mask is simple, we are better off using an extract
12995 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12996 // because that avoids a constant load from memory.
12997 if (NumElts == 4 &&
12998 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12999 return SDValue();
13000
13001 // Extend the shuffle mask with undef elements.
13002 NewMask.append(NumElts, -1);
13003
13004 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13005 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13006 NewMask);
13007 // This is free: ymm -> xmm.
13008 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13009 DAG.getVectorIdxConstant(0, DL));
13010}
13011
13012/// Try to lower broadcast of a single element.
13013///
13014/// For convenience, this code also bundles all of the subtarget feature set
13015/// filtering. While a little annoying to re-dispatch on type here, there isn't
13016/// a convenient way to factor it out.
13017static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13018 SDValue V2, ArrayRef<int> Mask,
13019 const X86Subtarget &Subtarget,
13020 SelectionDAG &DAG) {
13021 MVT EltVT = VT.getVectorElementType();
13022 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13023 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13024 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13025 return SDValue();
13026
13027 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13028 // we can only broadcast from a register with AVX2.
13029 unsigned NumEltBits = VT.getScalarSizeInBits();
13030 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13031 ? X86ISD::MOVDDUP
13032 : X86ISD::VBROADCAST;
13033 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13034
13035 // Check that the mask is a broadcast.
13036 int BroadcastIdx = getSplatIndex(Mask);
13037 if (BroadcastIdx < 0) {
13038 // Check for hidden broadcast.
13039 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13040 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13041 return SDValue();
13042 BroadcastIdx = 0;
13043 }
13044 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13045 "a sorted mask where the broadcast "
13046 "comes from V1.");
13047 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13048
13049 // Go up the chain of (vector) values to find a scalar load that we can
13050 // combine with the broadcast.
13051 // TODO: Combine this logic with findEltLoadSrc() used by
13052 // EltsFromConsecutiveLoads().
13053 int BitOffset = BroadcastIdx * NumEltBits;
13054 SDValue V = V1;
13055 for (;;) {
13056 switch (V.getOpcode()) {
13057 case ISD::BITCAST: {
13058 V = V.getOperand(0);
13059 continue;
13060 }
13061 case ISD::CONCAT_VECTORS: {
13062 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13063 int OpIdx = BitOffset / OpBitWidth;
13064 V = V.getOperand(OpIdx);
13065 BitOffset %= OpBitWidth;
13066 continue;
13067 }
13068 case ISD::EXTRACT_SUBVECTOR: {
13069 // The extraction index adds to the existing offset.
13070 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13071 unsigned Idx = V.getConstantOperandVal(1);
13072 unsigned BeginOffset = Idx * EltBitWidth;
13073 BitOffset += BeginOffset;
13074 V = V.getOperand(0);
13075 continue;
13076 }
13077 case ISD::INSERT_SUBVECTOR: {
13078 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13079 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13080 int Idx = (int)V.getConstantOperandVal(2);
13081 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13082 int BeginOffset = Idx * EltBitWidth;
13083 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13084 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13085 BitOffset -= BeginOffset;
13086 V = VInner;
13087 } else {
13088 V = VOuter;
13089 }
13090 continue;
13091 }
13092 }
13093 break;
13094 }
13095 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13096 BroadcastIdx = BitOffset / NumEltBits;
13097
13098 // Do we need to bitcast the source to retrieve the original broadcast index?
13099 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13100
13101 // Check if this is a broadcast of a scalar. We special case lowering
13102 // for scalars so that we can more effectively fold with loads.
13103 // If the original value has a larger element type than the shuffle, the
13104 // broadcast element is in essence truncated. Make that explicit to ease
13105 // folding.
13106 if (BitCastSrc && VT.isInteger())
13107 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13108 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13109 return TruncBroadcast;
13110
13111 // Also check the simpler case, where we can directly reuse the scalar.
13112 if (!BitCastSrc &&
13113 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13114 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13115 V = V.getOperand(BroadcastIdx);
13116
13117 // If we can't broadcast from a register, check that the input is a load.
13118 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13119 return SDValue();
13120 } else if (ISD::isNormalLoad(V.getNode()) &&
13121 cast<LoadSDNode>(V)->isSimple()) {
13122 // We do not check for one-use of the vector load because a broadcast load
13123 // is expected to be a win for code size, register pressure, and possibly
13124 // uops even if the original vector load is not eliminated.
13125
13126 // Reduce the vector load and shuffle to a broadcasted scalar load.
13127 auto *Ld = cast<LoadSDNode>(V);
13128 SDValue BaseAddr = Ld->getBasePtr();
13129 MVT SVT = VT.getScalarType();
13130 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13131 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13132 SDValue NewAddr =
13133 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13134
13135 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13136 // than MOVDDUP.
13137 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13138 if (Opcode == X86ISD::VBROADCAST) {
13139 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13140 SDValue Ops[] = {Ld->getChain(), NewAddr};
13141 V = DAG.getMemIntrinsicNode(
13142 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13143 DAG.getMachineFunction().getMachineMemOperand(
13144 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13145 DAG.makeEquivalentMemoryOrdering(Ld, V);
13146 return DAG.getBitcast(VT, V);
13147 }
13148 assert(SVT == MVT::f64 && "Unexpected VT!");
13149 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13150 DAG.getMachineFunction().getMachineMemOperand(
13151 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13152 DAG.makeEquivalentMemoryOrdering(Ld, V);
13153 } else if (!BroadcastFromReg) {
13154 // We can't broadcast from a vector register.
13155 return SDValue();
13156 } else if (BitOffset != 0) {
13157 // We can only broadcast from the zero-element of a vector register,
13158 // but it can be advantageous to broadcast from the zero-element of a
13159 // subvector.
13160 if (!VT.is256BitVector() && !VT.is512BitVector())
13161 return SDValue();
13162
13163 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13164 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13165 return SDValue();
13166
13167 // If we are broadcasting an element from the lowest 128-bit subvector, try
13168 // to move the element in position.
13169 if (BitOffset < 128 && NumActiveElts > 1 &&
13170 V.getScalarValueSizeInBits() == NumEltBits) {
13171 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13172 "Unexpected bit-offset");
13173 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13174 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13175 V = extractSubVector(V, 0, DAG, DL, 128);
13176 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13177 } else {
13178 // Only broadcast the zero-element of a 128-bit subvector.
13179 if ((BitOffset % 128) != 0)
13180 return SDValue();
13181
13182 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13183 "Unexpected bit-offset");
13184 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13185 "Unexpected vector size");
13186 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13187 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13188 }
13189 }
13190
13191 // On AVX we can use VBROADCAST directly for scalar sources.
13192 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13193 V = DAG.getBitcast(MVT::f64, V);
13194 if (Subtarget.hasAVX()) {
13195 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13196 return DAG.getBitcast(VT, V);
13197 }
13198 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13199 }
13200
13201 // If this is a scalar, do the broadcast on this type and bitcast.
13202 if (!V.getValueType().isVector()) {
13203 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13204 "Unexpected scalar size");
13205 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13206 VT.getVectorNumElements());
13207 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13208 }
13209
13210 // We only support broadcasting from 128-bit vectors to minimize the
13211 // number of patterns we need to deal with in isel. So extract down to
13212 // 128-bits, removing as many bitcasts as possible.
13213 if (V.getValueSizeInBits() > 128)
13214 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13215
13216 // Otherwise cast V to a vector with the same element type as VT, but
13217 // possibly narrower than VT. Then perform the broadcast.
13218 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13219 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13220 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13221}
13222
13223// Check for whether we can use INSERTPS to perform the shuffle. We only use
13224// INSERTPS when the V1 elements are already in the correct locations
13225// because otherwise we can just always use two SHUFPS instructions which
13226// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13227// perform INSERTPS if a single V1 element is out of place and all V2
13228// elements are zeroable.
13229 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13230 unsigned &InsertPSMask,
13231 const APInt &Zeroable,
13232 ArrayRef<int> Mask, SelectionDAG &DAG) {
13233 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13234 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13235 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13236
13237 // Attempt to match INSERTPS with one element from VA or VB being
13238 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13239 // are updated.
13240 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13241 ArrayRef<int> CandidateMask) {
13242 unsigned ZMask = 0;
13243 int VADstIndex = -1;
13244 int VBDstIndex = -1;
13245 bool VAUsedInPlace = false;
13246
13247 for (int i = 0; i < 4; ++i) {
13248 // Synthesize a zero mask from the zeroable elements (includes undefs).
13249 if (Zeroable[i]) {
13250 ZMask |= 1 << i;
13251 continue;
13252 }
13253
13254 // Flag if we use any VA inputs in place.
13255 if (i == CandidateMask[i]) {
13256 VAUsedInPlace = true;
13257 continue;
13258 }
13259
13260 // We can only insert a single non-zeroable element.
13261 if (VADstIndex >= 0 || VBDstIndex >= 0)
13262 return false;
13263
13264 if (CandidateMask[i] < 4) {
13265 // VA input out of place for insertion.
13266 VADstIndex = i;
13267 } else {
13268 // VB input for insertion.
13269 VBDstIndex = i;
13270 }
13271 }
13272
13273 // Don't bother if we have no (non-zeroable) element for insertion.
13274 if (VADstIndex < 0 && VBDstIndex < 0)
13275 return false;
13276
13277 // Determine element insertion src/dst indices. The src index is from the
13278 // start of the inserted vector, not the start of the concatenated vector.
13279 unsigned VBSrcIndex = 0;
13280 if (VADstIndex >= 0) {
13281 // If we have a VA input out of place, we use VA as the V2 element
13282 // insertion and don't use the original V2 at all.
13283 VBSrcIndex = CandidateMask[VADstIndex];
13284 VBDstIndex = VADstIndex;
13285 VB = VA;
13286 } else {
13287 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13288 }
13289
13290 // If no V1 inputs are used in place, then the result is created only from
13291 // the zero mask and the V2 insertion - so remove V1 dependency.
13292 if (!VAUsedInPlace)
13293 VA = DAG.getUNDEF(MVT::v4f32);
13294
13295 // Update V1, V2 and InsertPSMask accordingly.
13296 V1 = VA;
13297 V2 = VB;
13298
13299 // Insert the V2 element into the desired position.
13300 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
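// For instance, inserting element 2 of VB into lane 1 of VA while zeroing
// lane 3 gives VBSrcIndex = 2, VBDstIndex = 1, ZMask = 0b1000, so
// InsertPSMask = (2 << 6) | (1 << 4) | 0x8 = 0x98, matching the INSERTPS
// immediate layout (bits [7:6] source, [5:4] destination, [3:0] zero mask).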
13301 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13302 return true;
13303 };
13304
13305 if (matchAsInsertPS(V1, V2, Mask))
13306 return true;
13307
13308 // Commute and try again.
13309 SmallVector<int, 4> CommutedMask(Mask);
13310 ShuffleVectorSDNode::commuteMask(CommutedMask);
13311 if (matchAsInsertPS(V2, V1, CommutedMask))
13312 return true;
13313
13314 return false;
13315}
13316
13317 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13318 ArrayRef<int> Mask, const APInt &Zeroable,
13319 SelectionDAG &DAG) {
13320 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13321 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13322
13323 // Attempt to match the insertps pattern.
13324 unsigned InsertPSMask = 0;
13325 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13326 return SDValue();
13327
13328 // Insert the V2 element into the desired position.
13329 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13330 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13331}
13332
13333/// Handle lowering of 2-lane 64-bit floating point shuffles.
13334///
13335/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13336/// support for floating point shuffles but not integer shuffles. These
13337/// instructions will incur a domain crossing penalty on some chips though so
13338/// it is better to avoid lowering through this for integer vectors where
13339/// possible.
13340 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13341 const APInt &Zeroable, SDValue V1, SDValue V2,
13342 const X86Subtarget &Subtarget,
13343 SelectionDAG &DAG) {
13344 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13345 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13346 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13347
13348 if (V2.isUndef()) {
13349 // Check for being able to broadcast a single element.
13350 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13351 Mask, Subtarget, DAG))
13352 return Broadcast;
13353
13354 // Straight shuffle of a single input vector. Simulate this by using the
13355 // single input as both of the "inputs" to this instruction..
13356 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
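// E.g. a splat of the high element, Mask = {1, 1}, gives SHUFPDMask = 0b11,
// so both result lanes read element 1 of the single input.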
13357
13358 if (Subtarget.hasAVX()) {
13359 // If we have AVX, we can use VPERMILPS which will allow folding a load
13360 // into the shuffle.
13361 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13362 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13363 }
13364
13365 return DAG.getNode(
13366 X86ISD::SHUFP, DL, MVT::v2f64,
13367 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13368 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13369 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13370 }
13371 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13372 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13373 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13374 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13375
13376 if (Subtarget.hasAVX2())
13377 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13378 return Extract;
13379
13380 // When loading a scalar and then shuffling it into a vector we can often do
13381 // the insertion cheaply.
13382 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13383 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13384 return Insertion;
13385 // Try inverting the insertion since for v2 masks it is easy to do and we
13386 // can't reliably sort the mask one way or the other.
13387 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13388 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13389 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13390 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13391 return Insertion;
13392
13393 // Try to use one of the special instruction patterns to handle two common
13394 // blend patterns if a zero-blend above didn't work.
13395 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13396 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13397 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13398 // We can either use a special instruction to load over the low double or
13399 // to move just the low double.
13400 return DAG.getNode(
13401 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13402 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13403
13404 if (Subtarget.hasSSE41())
13405 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13406 Zeroable, Subtarget, DAG))
13407 return Blend;
13408
13409 // Use dedicated unpack instructions for masks that match their pattern.
13410 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13411 return V;
13412
13413 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
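// E.g. Mask = {1, 2} gives SHUFPDMask = 0b01: lane 0 takes V1[1] and lane 1
// takes V2[0], exactly the requested elements 1 and 2.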
13414 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13415 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13416}
13417
13418/// Handle lowering of 2-lane 64-bit integer shuffles.
13419///
13420/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13421/// the integer unit to minimize domain crossing penalties. However, for blends
13422/// it falls back to the floating point shuffle operation with appropriate bit
13423/// casting.
13424 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13425 const APInt &Zeroable, SDValue V1, SDValue V2,
13426 const X86Subtarget &Subtarget,
13427 SelectionDAG &DAG) {
13428 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13429 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13430 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13431
13432 if (V2.isUndef()) {
13433 // Check for being able to broadcast a single element.
13434 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13435 Mask, Subtarget, DAG))
13436 return Broadcast;
13437
13438 // Straight shuffle of a single input vector. For everything from SSE2
13439 // onward this has a single fast instruction with no scary immediates.
13440 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13441 V1 = DAG.getBitcast(MVT::v4i32, V1);
13442 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13443 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13444 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13445 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
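// E.g. the qword swap Mask = {1, 0} widens to {2, 3, 0, 1}, which encodes as
// the familiar PSHUFD immediate 0x4E.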
13446 return DAG.getBitcast(
13447 MVT::v2i64,
13448 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13449 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13450 }
13451 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13452 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13453 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13454 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13455
13456 if (Subtarget.hasAVX2())
13457 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13458 return Extract;
13459
13460 // Try to use shift instructions.
13461 if (SDValue Shift =
13462 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13463 DAG, /*BitwiseOnly*/ false))
13464 return Shift;
13465
13466 // When loading a scalar and then shuffling it into a vector we can often do
13467 // the insertion cheaply.
13468 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13469 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13470 return Insertion;
13471 // Try inverting the insertion since for v2 masks it is easy to do and we
13472 // can't reliably sort the mask one way or the other.
13473 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13474 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13475 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13476 return Insertion;
13477
13478 // We have different paths for blend lowering, but they all must use the
13479 // *exact* same predicate.
13480 bool IsBlendSupported = Subtarget.hasSSE41();
13481 if (IsBlendSupported)
13482 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13483 Zeroable, Subtarget, DAG))
13484 return Blend;
13485
13486 // Use dedicated unpack instructions for masks that match their pattern.
13487 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13488 return V;
13489
13490 // Try to use byte rotation instructions.
13491 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13492 if (Subtarget.hasSSSE3()) {
13493 if (Subtarget.hasVLX())
13494 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13495 Zeroable, Subtarget, DAG))
13496 return Rotate;
13497
13498 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13499 Subtarget, DAG))
13500 return Rotate;
13501 }
13502
13503 // If we have direct support for blends, we should lower by decomposing into
13504 // a permute. That will be faster than the domain cross.
13505 if (IsBlendSupported)
13506 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13507 Zeroable, Subtarget, DAG);
13508
13509 // We implement this with SHUFPD which is pretty lame because it will likely
13510 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13511 // However, all the alternatives are still more cycles and newer chips don't
13512 // have this problem. It would be really nice if x86 had better shuffles here.
13513 V1 = DAG.getBitcast(MVT::v2f64, V1);
13514 V2 = DAG.getBitcast(MVT::v2f64, V2);
13515 return DAG.getBitcast(MVT::v2i64,
13516 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13517}
13518
13519/// Lower a vector shuffle using the SHUFPS instruction.
13520///
13521/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13522/// It makes no assumptions about whether this is the *best* lowering, it simply
13523/// uses it.
13524 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13525 ArrayRef<int> Mask, SDValue V1,
13526 SDValue V2, SelectionDAG &DAG) {
13527 SDValue LowV = V1, HighV = V2;
13528 SmallVector<int, 4> NewMask(Mask);
13529 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13530
13531 if (NumV2Elements == 1) {
13532 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13533
13534 // Compute the index adjacent to V2Index and in the same half by toggling
13535 // the low bit.
13536 int V2AdjIndex = V2Index ^ 1;
13537
13538 if (Mask[V2AdjIndex] < 0) {
13539 // Handles all the cases where we have a single V2 element and an undef.
13540 // This will only ever happen in the high lanes because we commute the
13541 // vector otherwise.
13542 if (V2Index < 2)
13543 std::swap(LowV, HighV);
13544 NewMask[V2Index] -= 4;
13545 } else {
13546 // Handle the case where the V2 element ends up adjacent to a V1 element.
13547 // To make this work, blend them together as the first step.
13548 int V1Index = V2AdjIndex;
13549 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13550 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13551 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13552
13553 // Now proceed to reconstruct the final blend as we have the necessary
13554 // high or low half formed.
13555 if (V2Index < 2) {
13556 LowV = V2;
13557 HighV = V1;
13558 } else {
13559 HighV = V2;
13560 }
13561 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13562 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
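// Worked example: for Mask = {0, 1, 2, 7} we get V2Index = 3 and V1Index = 2,
// so the blend forms V2' = shufps(V2, V1, {3, 0, 2, 0}) =
// {V2[3], V2[0], V1[2], V1[0]}, and the final shufps(V1, V2', {0, 1, 2, 0})
// produces {V1[0], V1[1], V1[2], V2[3]} as requested.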
13563 }
13564 } else if (NumV2Elements == 2) {
13565 if (Mask[0] < 4 && Mask[1] < 4) {
13566 // Handle the easy case where we have V1 in the low lanes and V2 in the
13567 // high lanes.
13568 NewMask[2] -= 4;
13569 NewMask[3] -= 4;
13570 } else if (Mask[2] < 4 && Mask[3] < 4) {
13571 // We also handle the reversed case because this utility may get called
13572 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13573 // arrange things in the right direction.
13574 NewMask[0] -= 4;
13575 NewMask[1] -= 4;
13576 HighV = V1;
13577 LowV = V2;
13578 } else {
13579 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13580 // trying to place elements directly, just blend them and set up the final
13581 // shuffle to place them.
13582
13583 // The first two blend mask elements are for V1, the second two are for
13584 // V2.
13585 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13586 Mask[2] < 4 ? Mask[2] : Mask[3],
13587 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13588 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13589 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13590 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13591
13592 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13593 // a blend.
13594 LowV = HighV = V1;
13595 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13596 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13597 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13598 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13599 }
13600 } else if (NumV2Elements == 3) {
13601 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13602 // we can get here due to other paths (e.g repeated mask matching) that we
13603 // don't want to do another round of lowerVECTOR_SHUFFLE.
13604 ShuffleVectorSDNode::commuteMask(NewMask);
13605 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13606 }
13607 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13608 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13609}
13610
13611/// Lower 4-lane 32-bit floating point shuffles.
13612///
13613/// Uses instructions exclusively from the floating point unit to minimize
13614/// domain crossing penalties, as these are sufficient to implement all v4f32
13615/// shuffles.
13616 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13617 const APInt &Zeroable, SDValue V1, SDValue V2,
13618 const X86Subtarget &Subtarget,
13619 SelectionDAG &DAG) {
13620 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13621 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13622 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13623
13624 if (Subtarget.hasSSE41())
13625 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13626 Zeroable, Subtarget, DAG))
13627 return Blend;
13628
13629 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13630
13631 if (NumV2Elements == 0) {
13632 // Check for being able to broadcast a single element.
13633 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13634 Mask, Subtarget, DAG))
13635 return Broadcast;
13636
13637 // Use even/odd duplicate instructions for masks that match their pattern.
13638 if (Subtarget.hasSSE3()) {
13639 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13640 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13641 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13642 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13643 }
13644
13645 if (Subtarget.hasAVX()) {
13646 // If we have AVX, we can use VPERMILPS which will allow folding a load
13647 // into the shuffle.
13648 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13649 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13650 }
13651
13652 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13653 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13654 if (!Subtarget.hasSSE2()) {
13655 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13656 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13657 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13658 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13659 }
13660
13661 // Otherwise, use a straight shuffle of a single input vector. We pass the
13662 // input vector to both operands to simulate this with a SHUFPS.
13663 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13664 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13665 }
13666
13667 if (Subtarget.hasSSE2())
13668 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13669 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13670 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13671 return ZExt;
13672 }
13673
13674 if (Subtarget.hasAVX2())
13675 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13676 return Extract;
13677
13678 // There are special ways we can lower some single-element blends. However, we
13679 // have custom ways we can lower more complex single-element blends below that
13680 // we defer to if both this and BLENDPS fail to match, so restrict this to
13681 // when the V2 input is targeting element 0 of the mask -- that is the fast
13682 // case here.
13683 if (NumV2Elements == 1 && Mask[0] >= 4)
13684 if (SDValue V = lowerShuffleAsElementInsertion(
13685 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13686 return V;
13687
13688 if (Subtarget.hasSSE41()) {
13689 // Use INSERTPS if we can complete the shuffle efficiently.
13690 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13691 return V;
13692
13693 if (!isSingleSHUFPSMask(Mask))
13694 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13695 V2, Mask, DAG))
13696 return BlendPerm;
13697 }
13698
13699 // Use low/high mov instructions. These are only valid in SSE1 because
13700 // otherwise they are widened to v2f64 and never get here.
13701 if (!Subtarget.hasSSE2()) {
13702 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13703 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13704 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13705 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13706 }
13707
13708 // Use dedicated unpack instructions for masks that match their pattern.
13709 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13710 return V;
13711
13712 // Otherwise fall back to a SHUFPS lowering strategy.
13713 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13714}
13715
13716/// Lower 4-lane i32 vector shuffles.
13717///
13718/// We try to handle these with integer-domain shuffles where we can, but for
13719/// blends we use the floating point domain blend instructions.
13720 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13721 const APInt &Zeroable, SDValue V1, SDValue V2,
13722 const X86Subtarget &Subtarget,
13723 SelectionDAG &DAG) {
13724 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13725 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13726 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13727
13728 // Whenever we can lower this as a zext, that instruction is strictly faster
13729 // than any alternative. It also allows us to fold memory operands into the
13730 // shuffle in many cases.
13731 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13732 Zeroable, Subtarget, DAG))
13733 return ZExt;
13734
13735 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13736
13737 // Try to use shift instructions if fast.
13738 if (Subtarget.preferLowerShuffleAsShift()) {
13739 if (SDValue Shift =
13740 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13741 Subtarget, DAG, /*BitwiseOnly*/ true))
13742 return Shift;
13743 if (NumV2Elements == 0)
13744 if (SDValue Rotate =
13745 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13746 return Rotate;
13747 }
13748
13749 if (NumV2Elements == 0) {
13750 // Try to use broadcast unless the mask only has one non-undef element.
13751 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13752 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13753 Mask, Subtarget, DAG))
13754 return Broadcast;
13755 }
13756
13757 // Straight shuffle of a single input vector. For everything from SSE2
13758 // onward this has a single fast instruction with no scary immediates.
13759 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13760 // but we aren't actually going to use the UNPCK instruction because doing
13761 // so prevents folding a load into this instruction or making a copy.
13762 const int UnpackLoMask[] = {0, 0, 1, 1};
13763 const int UnpackHiMask[] = {2, 2, 3, 3};
13764 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13765 Mask = UnpackLoMask;
13766 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13767 Mask = UnpackHiMask;
13768
13769 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13770 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13771 }
13772
13773 if (Subtarget.hasAVX2())
13774 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13775 return Extract;
13776
13777 // Try to use shift instructions.
13778 if (SDValue Shift =
13779 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13780 DAG, /*BitwiseOnly*/ false))
13781 return Shift;
13782
13783 // There are special ways we can lower some single-element blends.
13784 if (NumV2Elements == 1)
13785 if (SDValue V = lowerShuffleAsElementInsertion(
13786 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13787 return V;
13788
13789 // We have different paths for blend lowering, but they all must use the
13790 // *exact* same predicate.
13791 bool IsBlendSupported = Subtarget.hasSSE41();
13792 if (IsBlendSupported)
13793 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13794 Zeroable, Subtarget, DAG))
13795 return Blend;
13796
13797 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13798 Zeroable, Subtarget, DAG))
13799 return Masked;
13800
13801 // Use dedicated unpack instructions for masks that match their pattern.
13802 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13803 return V;
13804
13805 // Try to use byte rotation instructions.
13806 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13807 if (Subtarget.hasSSSE3()) {
13808 if (Subtarget.hasVLX())
13809 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13810 Zeroable, Subtarget, DAG))
13811 return Rotate;
13812
13813 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13814 Subtarget, DAG))
13815 return Rotate;
13816 }
13817
13818 // Assume that a single SHUFPS is faster than an alternative sequence of
13819 // multiple instructions (even if the CPU has a domain penalty).
13820 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13821 if (!isSingleSHUFPSMask(Mask)) {
13822 // If we have direct support for blends, we should lower by decomposing into
13823 // a permute. That will be faster than the domain cross.
13824 if (IsBlendSupported)
13825 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13826 Zeroable, Subtarget, DAG);
13827
13828 // Try to lower by permuting the inputs into an unpack instruction.
13829 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13830 Mask, Subtarget, DAG))
13831 return Unpack;
13832 }
13833
13834 // We implement this with SHUFPS because it can blend from two vectors.
13835 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13836 // up the inputs, bypassing domain shift penalties that we would incur if we
13837 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13838 // relevant.
13839 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13840 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13841 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13842 return DAG.getBitcast(MVT::v4i32, ShufPS);
13843}
13844
13845/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13846/// shuffle lowering, and the most complex part.
13847///
13848/// The lowering strategy is to try to form pairs of input lanes which are
13849/// targeted at the same half of the final vector, and then use a dword shuffle
13850/// to place them onto the right half, and finally unpack the paired lanes into
13851/// their final position.
13852///
13853/// The exact breakdown of how to form these dword pairs and align them on the
13854/// correct sides is really tricky. See the comments within the function for
13855/// more of the details.
13856///
13857/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13858/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13859/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13860/// vector, form the analogous 128-bit 8-element Mask.
13861 static SDValue lowerV8I16GeneralSingleInputShuffle(
13862 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13863 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13864 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13865 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13866
13867 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13868 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13869 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13870
13871 // Attempt to directly match PSHUFLW or PSHUFHW.
13872 if (isUndefOrInRange(LoMask, 0, 4) &&
13873 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13874 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13875 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13876 }
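// E.g. Mask = {2, 1, 0, 3, 4, 5, 6, 7} leaves the high words in place and is
// matched here as PSHUFLW with immediate 2 | (1 << 2) | (0 << 4) | (3 << 6)
// = 0xC6.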
13877 if (isUndefOrInRange(HiMask, 4, 8) &&
13878 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13879 for (int i = 0; i != 4; ++i)
13880 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13881 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13882 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13883 }
13884
13885 SmallVector<int, 4> LoInputs;
13886 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13887 array_pod_sort(LoInputs.begin(), LoInputs.end());
13888 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13889 SmallVector<int, 4> HiInputs;
13890 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13891 array_pod_sort(HiInputs.begin(), HiInputs.end());
13892 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13893 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13894 int NumHToL = LoInputs.size() - NumLToL;
13895 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13896 int NumHToH = HiInputs.size() - NumLToH;
13897 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13898 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13899 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13900 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13901
13902 // If we are shuffling values from one half - check how many different DWORD
13903 // pairs we need to create. If only 1 or 2 then we can perform this as a
13904 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13905 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13906 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13907 V = DAG.getNode(ShufWOp, DL, VT, V,
13908 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13909 V = DAG.getBitcast(PSHUFDVT, V);
13910 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13911 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13912 return DAG.getBitcast(VT, V);
13913 };
13914
13915 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13916 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13917 SmallVector<std::pair<int, int>, 4> DWordPairs;
13918 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13919
13920 // Collect the different DWORD pairs.
13921 for (int DWord = 0; DWord != 4; ++DWord) {
13922 int M0 = Mask[2 * DWord + 0];
13923 int M1 = Mask[2 * DWord + 1];
13924 M0 = (M0 >= 0 ? M0 % 4 : M0);
13925 M1 = (M1 >= 0 ? M1 % 4 : M1);
13926 if (M0 < 0 && M1 < 0)
13927 continue;
13928
13929 bool Match = false;
13930 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13931 auto &DWordPair = DWordPairs[j];
13932 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13933 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13934 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13935 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13936 PSHUFDMask[DWord] = DOffset + j;
13937 Match = true;
13938 break;
13939 }
13940 }
13941 if (!Match) {
13942 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13943 DWordPairs.push_back(std::make_pair(M0, M1));
13944 }
13945 }
13946
13947 if (DWordPairs.size() <= 2) {
13948 DWordPairs.resize(2, std::make_pair(-1, -1));
13949 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13950 DWordPairs[1].first, DWordPairs[1].second};
13951 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13952 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13953 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13954 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13955 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13956 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13957 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13958 }
13959 if ((NumHToL + NumHToH) == 0)
13960 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13961 if ((NumLToL + NumLToH) == 0)
13962 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13963 }
13964 }
13965
13966 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13967 // such inputs we can swap two of the dwords across the half mark and end up
13968 // with <=2 inputs to each half in each half. Once there, we can fall through
13969 // to the generic code below. For example:
13970 //
13971 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13972 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13973 //
13974 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13975 // and an existing 2-into-2 on the other half. In this case we may have to
13976 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13977 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13978 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13979 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13980 // half than the one we target for fixing) will be fixed when we re-enter this
13981 // path. We will also combine away any sequence of PSHUFD instructions that
13982 // result into a single instruction. Here is an example of the tricky case:
13983 //
13984 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13985 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13986 //
13987 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13988 //
13989 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13990 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13991 //
13992 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13993 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13994 //
13995 // The result is fine to be handled by the generic logic.
13996 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13997 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13998 int AOffset, int BOffset) {
13999 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14000 "Must call this with A having 3 or 1 inputs from the A half.");
14001 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14002 "Must call this with B having 1 or 3 inputs from the B half.");
14003 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14004 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14005
14006 bool ThreeAInputs = AToAInputs.size() == 3;
14007
14008 // Compute the index of dword with only one word among the three inputs in
14009 // a half by taking the sum of the half with three inputs and subtracting
14010 // the sum of the actual three inputs. The difference is the remaining
14011 // slot.
14012 int ADWord = 0, BDWord = 0;
14013 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14014 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14015 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14016 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14017 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14018 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14019 int TripleNonInputIdx =
14020 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14021 TripleDWord = TripleNonInputIdx / 2;
14022
14023 // We use xor with one to compute the adjacent DWord to whichever one the
14024 // OneInput is in.
14025 OneInputDWord = (OneInput / 2) ^ 1;
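// E.g. if the lone incoming input is word 5 it lives in dword 2, so
// OneInputDWord = 2 ^ 1 = 3, the adjacent dword within the same half.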
14026
14027 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14028 // and BToA inputs. If there is also such a problem with the BToB and AToB
14029 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14030 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14031 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14032 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14033 // Compute how many inputs will be flipped by swapping these DWords. We
14034 // need
14035 // to balance this to ensure we don't form a 3-1 shuffle in the other
14036 // half.
14037 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14038 llvm::count(AToBInputs, 2 * ADWord + 1);
14039 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14040 llvm::count(BToBInputs, 2 * BDWord + 1);
14041 if ((NumFlippedAToBInputs == 1 &&
14042 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14043 (NumFlippedBToBInputs == 1 &&
14044 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14045 // We choose whether to fix the A half or B half based on whether that
14046 // half has zero flipped inputs. At zero, we may not be able to fix it
14047 // with that half. We also bias towards fixing the B half because that
14048 // will more commonly be the high half, and we have to bias one way.
14049 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14050 ArrayRef<int> Inputs) {
14051 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14052 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14053 // Determine whether the free index is in the flipped dword or the
14054 // unflipped dword based on where the pinned index is. We use this bit
14055 // in an xor to conditionally select the adjacent dword.
14056 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14057 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14058 if (IsFixIdxInput == IsFixFreeIdxInput)
14059 FixFreeIdx += 1;
14060 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14061 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14062 "We need to be changing the number of flipped inputs!");
14063 int PSHUFHalfMask[] = {0, 1, 2, 3};
14064 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14065 V = DAG.getNode(
14066 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14067 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14068 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14069
14070 for (int &M : Mask)
14071 if (M >= 0 && M == FixIdx)
14072 M = FixFreeIdx;
14073 else if (M >= 0 && M == FixFreeIdx)
14074 M = FixIdx;
14075 };
14076 if (NumFlippedBToBInputs != 0) {
14077 int BPinnedIdx =
14078 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14079 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14080 } else {
14081 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14082 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14083 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14084 }
14085 }
14086 }
14087
14088 int PSHUFDMask[] = {0, 1, 2, 3};
14089 PSHUFDMask[ADWord] = BDWord;
14090 PSHUFDMask[BDWord] = ADWord;
14091 V = DAG.getBitcast(
14092 VT,
14093 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14094 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14095
14096 // Adjust the mask to match the new locations of A and B.
14097 for (int &M : Mask)
14098 if (M >= 0 && M/2 == ADWord)
14099 M = 2 * BDWord + M % 2;
14100 else if (M >= 0 && M/2 == BDWord)
14101 M = 2 * ADWord + M % 2;
14102
14103 // Recurse back into this routine to re-compute state now that this isn't
14104 // a 3 and 1 problem.
14105 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14106 };
14107 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14108 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14109 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14110 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14111
14112 // At this point there are at most two inputs to the low and high halves from
14113 // each half. That means the inputs can always be grouped into dwords and
14114 // those dwords can then be moved to the correct half with a dword shuffle.
14115 // We use at most one low and one high word shuffle to collect these paired
14116 // inputs into dwords, and finally a dword shuffle to place them.
14117 int PSHUFLMask[4] = {-1, -1, -1, -1};
14118 int PSHUFHMask[4] = {-1, -1, -1, -1};
14119 int PSHUFDMask[4] = {-1, -1, -1, -1};
14120
14121 // First fix the masks for all the inputs that are staying in their
14122 // original halves. This will then dictate the targets of the cross-half
14123 // shuffles.
14124 auto fixInPlaceInputs =
14125 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14126 MutableArrayRef<int> SourceHalfMask,
14127 MutableArrayRef<int> HalfMask, int HalfOffset) {
14128 if (InPlaceInputs.empty())
14129 return;
14130 if (InPlaceInputs.size() == 1) {
14131 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14132 InPlaceInputs[0] - HalfOffset;
14133 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14134 return;
14135 }
14136 if (IncomingInputs.empty()) {
14137 // Just fix all of the in place inputs.
14138 for (int Input : InPlaceInputs) {
14139 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14140 PSHUFDMask[Input / 2] = Input / 2;
14141 }
14142 return;
14143 }
14144
14145 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14146 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14147 InPlaceInputs[0] - HalfOffset;
14148 // Put the second input next to the first so that they are packed into
14149 // a dword. We find the adjacent index by toggling the low bit.
14150 int AdjIndex = InPlaceInputs[0] ^ 1;
14151 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14152 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14153 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14154 };
14155 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14156 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14157
14158 // Now gather the cross-half inputs and place them into a free dword of
14159 // their target half.
14160 // FIXME: This operation could almost certainly be simplified dramatically to
14161 // look more like the 3-1 fixing operation.
14162 auto moveInputsToRightHalf = [&PSHUFDMask](
14163 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14164 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14165 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14166 int DestOffset) {
14167 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14168 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14169 };
14170 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14171 int Word) {
14172 int LowWord = Word & ~1;
14173 int HighWord = Word | 1;
14174 return isWordClobbered(SourceHalfMask, LowWord) ||
14175 isWordClobbered(SourceHalfMask, HighWord);
14176 };
14177
14178 if (IncomingInputs.empty())
14179 return;
14180
14181 if (ExistingInputs.empty()) {
14182 // Map any dwords with inputs from them into the right half.
14183 for (int Input : IncomingInputs) {
14184 // If the source half mask maps over the inputs, turn those into
14185 // swaps and use the swapped lane.
14186 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14187 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14188 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14189 Input - SourceOffset;
14190 // We have to swap the uses in our half mask in one sweep.
14191 for (int &M : HalfMask)
14192 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14193 M = Input;
14194 else if (M == Input)
14195 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14196 } else {
14197 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14198 Input - SourceOffset &&
14199 "Previous placement doesn't match!");
14200 }
14201 // Note that this correctly re-maps both when we do a swap and when
14202 // we observe the other side of the swap above. We rely on that to
14203 // avoid swapping the members of the input list directly.
14204 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14205 }
14206
14207 // Map the input's dword into the correct half.
14208 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14209 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14210 else
14211 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14212 Input / 2 &&
14213 "Previous placement doesn't match!");
14214 }
14215
14216 // And just directly shift any other-half mask elements to be same-half
14217 // as we will have mirrored the dword containing the element into the
14218 // same position within that half.
14219 for (int &M : HalfMask)
14220 if (M >= SourceOffset && M < SourceOffset + 4) {
14221 M = M - SourceOffset + DestOffset;
14222 assert(M >= 0 && "This should never wrap below zero!");
14223 }
14224 return;
14225 }
14226
14227 // Ensure we have the input in a viable dword of its current half. This
14228 // is particularly tricky because the original position may be clobbered
14229 // by inputs being moved and *staying* in that half.
14230 if (IncomingInputs.size() == 1) {
14231 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14232 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14233 SourceOffset;
14234 SourceHalfMask[InputFixed - SourceOffset] =
14235 IncomingInputs[0] - SourceOffset;
14236 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14237 IncomingInputs[0] = InputFixed;
14238 }
14239 } else if (IncomingInputs.size() == 2) {
14240 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14241 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14242 // We have two non-adjacent or clobbered inputs we need to extract from
14243 // the source half. To do this, we need to map them into some adjacent
14244 // dword slot in the source mask.
14245 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14246 IncomingInputs[1] - SourceOffset};
14247
14248 // If there is a free slot in the source half mask adjacent to one of
14249 // the inputs, place the other input in it. We use (Index XOR 1) to
14250 // compute an adjacent index.
14251 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14252 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14253 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14254 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14255 InputsFixed[1] = InputsFixed[0] ^ 1;
14256 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14257 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14258 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14259 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14260 InputsFixed[0] = InputsFixed[1] ^ 1;
14261 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14262 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14263 // The two inputs are in the same DWord but it is clobbered and the
14264 // adjacent DWord isn't used at all. Move both inputs to the free
14265 // slot.
14266 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14267 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14268 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14269 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14270 } else {
14271 // The only way we hit this point is if there is no clobbering
14272 // (because there are no off-half inputs to this half) and there is no
14273 // free slot adjacent to one of the inputs. In this case, we have to
14274 // swap an input with a non-input.
14275 for (int i = 0; i < 4; ++i)
14276 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14277 "We can't handle any clobbers here!");
14278 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14279 "Cannot have adjacent inputs here!");
14280
14281 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14282 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14283
14284 // We also have to update the final source mask in this case because
14285 // it may need to undo the above swap.
14286 for (int &M : FinalSourceHalfMask)
14287 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14288 M = InputsFixed[1] + SourceOffset;
14289 else if (M == InputsFixed[1] + SourceOffset)
14290 M = (InputsFixed[0] ^ 1) + SourceOffset;
14291
14292 InputsFixed[1] = InputsFixed[0] ^ 1;
14293 }
14294
14295 // Point everything at the fixed inputs.
14296 for (int &M : HalfMask)
14297 if (M == IncomingInputs[0])
14298 M = InputsFixed[0] + SourceOffset;
14299 else if (M == IncomingInputs[1])
14300 M = InputsFixed[1] + SourceOffset;
14301
14302 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14303 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14304 }
14305 } else {
14306 llvm_unreachable("Unhandled input size!");
14307 }
14308
14309 // Now hoist the DWord down to the right half.
14310 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14311 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14312 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14313 for (int &M : HalfMask)
14314 for (int Input : IncomingInputs)
14315 if (M == Input)
14316 M = FreeDWord * 2 + Input % 2;
14317 };
14318 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14319 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14320 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14321 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14322
14323 // Now enact all the shuffles we've computed to move the inputs into their
14324 // target half.
14325 if (!isNoopShuffleMask(PSHUFLMask))
14326 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14327 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14328 if (!isNoopShuffleMask(PSHUFHMask))
14329 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14330 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14331 if (!isNoopShuffleMask(PSHUFDMask))
14332 V = DAG.getBitcast(
14333 VT,
14334 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14335 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14336
14337 // At this point, each half should contain all its inputs, and we can then
14338 // just shuffle them into their final position.
14339 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14340 "Failed to lift all the high half inputs to the low mask!");
14341 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14342 "Failed to lift all the low half inputs to the high mask!");
14343
14344 // Do a half shuffle for the low mask.
14345 if (!isNoopShuffleMask(LoMask))
14346 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14347 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14348
14349 // Do a half shuffle with the high mask after shifting its values down.
14350 for (int &M : HiMask)
14351 if (M >= 0)
14352 M -= 4;
14353 if (!isNoopShuffleMask(HiMask))
14354 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14355 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14356
14357 return V;
14358}
14359
14360/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14361/// blend if only one input is used.
14362 static SDValue lowerShuffleAsBlendOfPSHUFBs(
14363 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14364 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14366 "Lane crossing shuffle masks not supported");
14367
14368 int NumBytes = VT.getSizeInBits() / 8;
14369 int Size = Mask.size();
14370 int Scale = NumBytes / Size;
14371
14372 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14373 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14374 V1InUse = false;
14375 V2InUse = false;
14376
14377 for (int i = 0; i < NumBytes; ++i) {
14378 int M = Mask[i / Scale];
14379 if (M < 0)
14380 continue;
14381
14382 const int ZeroMask = 0x80;
14383 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14384 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14385 if (Zeroable[i / Scale])
14386 V1Idx = V2Idx = ZeroMask;
14387
14388 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14389 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14390 V1InUse |= (ZeroMask != V1Idx);
14391 V2InUse |= (ZeroMask != V2Idx);
14392 }
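// Byte-mask example: for a v8i16 shuffle (Size = 8, NumBytes = 16, Scale = 2),
// a mask element Mask[3] = 9 selects word 1 of V2, so output bytes 6 and 7
// get V2 shuffle indices 2 and 3 while the corresponding V1 bytes are set to
// 0x80 (zeroed) ahead of the OR blend below.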
14393
14394 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14395 if (V1InUse)
14396 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14397 DAG.getBuildVector(ShufVT, DL, V1Mask));
14398 if (V2InUse)
14399 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14400 DAG.getBuildVector(ShufVT, DL, V2Mask));
14401
14402 // If we need shuffled inputs from both, blend the two.
14403 SDValue V;
14404 if (V1InUse && V2InUse)
14405 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14406 else
14407 V = V1InUse ? V1 : V2;
14408
14409 // Cast the result back to the correct type.
14410 return DAG.getBitcast(VT, V);
14411}
14412
14413/// Generic lowering of 8-lane i16 shuffles.
14414///
14415/// This handles both single-input shuffles and combined shuffle/blends with
14416/// two inputs. The single input shuffles are immediately delegated to
14417/// a dedicated lowering routine.
14418///
14419/// The blends are lowered in one of three fundamental ways. If there are few
14420/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14421/// of the input is significantly cheaper when lowered as an interleaving of
14422/// the two inputs, try to interleave them. Otherwise, blend the low and high
14423/// halves of the inputs separately (making them have relatively few inputs)
14424/// and then concatenate them.
14425 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14426 const APInt &Zeroable, SDValue V1, SDValue V2,
14427 const X86Subtarget &Subtarget,
14428 SelectionDAG &DAG) {
14429 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14430 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14431 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14432
14433 // Whenever we can lower this as a zext, that instruction is strictly faster
14434 // than any alternative.
14435 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14436 Zeroable, Subtarget, DAG))
14437 return ZExt;
14438
14439 // Try to lower using a truncation.
14440 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14441 Subtarget, DAG))
14442 return V;
14443
14444 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14445
14446 if (NumV2Inputs == 0) {
14447 // Try to use shift instructions.
14448 if (SDValue Shift =
14449 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14450 Subtarget, DAG, /*BitwiseOnly*/ false))
14451 return Shift;
14452
14453 // Check for being able to broadcast a single element.
14454 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14455 Mask, Subtarget, DAG))
14456 return Broadcast;
14457
14458 // Try to use bit rotation instructions.
14459 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14460 Subtarget, DAG))
14461 return Rotate;
14462
14463 // Use dedicated unpack instructions for masks that match their pattern.
14464 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14465 return V;
14466
14467 // Use dedicated pack instructions for masks that match their pattern.
14468 if (SDValue V =
14469 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14470 return V;
14471
14472 // Try to use byte rotation instructions.
14473 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14474 Subtarget, DAG))
14475 return Rotate;
14476
14477 // Make a copy of the mask so it can be modified.
14478 SmallVector<int, 8> MutableMask(Mask);
14479 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14480 Subtarget, DAG);
14481 }
14482
14483 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14484 "All single-input shuffles should be canonicalized to be V1-input "
14485 "shuffles.");
14486
14487 // Try to use shift instructions.
14488 if (SDValue Shift =
14489 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14490 DAG, /*BitwiseOnly*/ false))
14491 return Shift;
14492
14493 // See if we can use SSE4A Extraction / Insertion.
14494 if (Subtarget.hasSSE4A())
14495 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14496 Zeroable, DAG))
14497 return V;
14498
14499 // There are special ways we can lower some single-element blends.
14500 if (NumV2Inputs == 1)
14501 if (SDValue V = lowerShuffleAsElementInsertion(
14502 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14503 return V;
14504
14505 // We have different paths for blend lowering, but they all must use the
14506 // *exact* same predicate.
14507 bool IsBlendSupported = Subtarget.hasSSE41();
14508 if (IsBlendSupported)
14509 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14510 Zeroable, Subtarget, DAG))
14511 return Blend;
14512
14513 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14514 Zeroable, Subtarget, DAG))
14515 return Masked;
14516
14517 // Use dedicated unpack instructions for masks that match their pattern.
14518 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14519 return V;
14520
14521 // Use dedicated pack instructions for masks that match their pattern.
14522 if (SDValue V =
14523 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14524 return V;
14525
14526 // Try to lower using a truncation.
14527 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14528 Subtarget, DAG))
14529 return V;
14530
14531 // Try to use byte rotation instructions.
14532 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14533 Subtarget, DAG))
14534 return Rotate;
14535
14536 if (SDValue BitBlend =
14537 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14538 return BitBlend;
14539
14540 // Try to use byte shift instructions to mask.
14541 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14542 Zeroable, Subtarget, DAG))
14543 return V;
14544
14545 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14546 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
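// e.g. the mask <0,2,4,6,8,10,12,14> takes every even i16 of the V1:V2
// concatenation (NumEvenDrops == 1); clearing the high 16 bits of each 32-bit
// lane and issuing a single PACKUSDW reproduces it exactly.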
14547 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14548 !Subtarget.hasVLX()) {
14549 // Check if this is part of a 256-bit vector truncation.
14550 unsigned PackOpc = 0;
14551 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14552 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14553 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14554 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14555 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14556 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14557 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14558 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14559 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14560 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14561 PackOpc = X86ISD::PACKUS;
14562 } else if (Subtarget.hasSSE41()) {
14563 SmallVector<SDValue, 4> DWordClearOps(4,
14564 DAG.getConstant(0, DL, MVT::i32));
14565 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14566 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14567 SDValue DWordClearMask =
14568 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14569 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14570 DWordClearMask);
14571 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14572 DWordClearMask);
14573 PackOpc = X86ISD::PACKUS;
14574 } else if (!Subtarget.hasSSSE3()) {
14575 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14576 V1 = DAG.getBitcast(MVT::v4i32, V1);
14577 V2 = DAG.getBitcast(MVT::v4i32, V2);
14578 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14579 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14580 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14581 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14582 PackOpc = X86ISD::PACKSS;
14583 }
14584 if (PackOpc) {
14585 // Now pack things back together.
14586 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14587 if (NumEvenDrops == 2) {
14588 Result = DAG.getBitcast(MVT::v4i32, Result);
14589 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14590 }
14591 return Result;
14592 }
14593 }
14594
14595 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14596 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14597 if (NumOddDrops == 1) {
14598 bool HasSSE41 = Subtarget.hasSSE41();
14599 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14600 DAG.getBitcast(MVT::v4i32, V1),
14601 DAG.getTargetConstant(16, DL, MVT::i8));
14602 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14603 DAG.getBitcast(MVT::v4i32, V2),
14604 DAG.getTargetConstant(16, DL, MVT::i8));
14605 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14606 MVT::v8i16, V1, V2);
14607 }
14608
14609 // Try to lower by permuting the inputs into an unpack instruction.
14610 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14611 Mask, Subtarget, DAG))
14612 return Unpack;
14613
14614 // If we can't directly blend but can use PSHUFB, that will be better as it
14615 // can both shuffle and set up the inefficient blend.
14616 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14617 bool V1InUse, V2InUse;
14618 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14619 Zeroable, DAG, V1InUse, V2InUse);
14620 }
14621
14622 // We can always bit-blend if we have to, so the fallback strategy is to
14623 // decompose into single-input permutes and blends/unpacks.
14624 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14625 Zeroable, Subtarget, DAG);
14626}
14627
14628/// Lower 8-lane 16-bit floating point shuffles.
14629 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14630 const APInt &Zeroable, SDValue V1, SDValue V2,
14631 const X86Subtarget &Subtarget,
14632 SelectionDAG &DAG) {
14633 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14634 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14635 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14636 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14637
14638 if (Subtarget.hasFP16()) {
14639 if (NumV2Elements == 0) {
14640 // Check for being able to broadcast a single element.
14641 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14642 Mask, Subtarget, DAG))
14643 return Broadcast;
14644 }
14645 if (NumV2Elements == 1 && Mask[0] >= 8)
14646 if (SDValue V = lowerShuffleAsElementInsertion(
14647 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14648 return V;
14649 }
14650
14651 V1 = DAG.getBitcast(MVT::v8i16, V1);
14652 V2 = DAG.getBitcast(MVT::v8i16, V2);
14653 return DAG.getBitcast(MVT::v8f16,
14654 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14655}
14656
14657 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14658 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14659 // the active subvector is extracted.
14660 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14661 ArrayRef<int> OriginalMask, SDValue V1,
14662 SDValue V2, const X86Subtarget &Subtarget,
14663 SelectionDAG &DAG) {
14664 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14665 SmallVector<int, 32> Mask(OriginalMask);
14666 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14667 !isShuffleFoldableLoad(V2)) {
14668 ShuffleVectorSDNode::commuteMask(Mask);
14669 std::swap(V1, V2);
14670 }
14671
14672 MVT MaskVT = VT.changeTypeToInteger();
14673 SDValue MaskNode;
14674 MVT ShuffleVT = VT;
14675 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14676 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14677 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14678 ShuffleVT = V1.getSimpleValueType();
14679
14680 // Adjust mask to correct indices for the second input.
14681 int NumElts = VT.getVectorNumElements();
14682 unsigned Scale = 512 / VT.getSizeInBits();
14683 SmallVector<int, 32> AdjustedMask(Mask);
14684 for (int &M : AdjustedMask)
14685 if (NumElts <= M)
14686 M += (Scale - 1) * NumElts;
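// e.g. for a v8i32 shuffle on a non-VLX target, Scale == 2 and a mask entry
// of 10 (element 2 of V2) becomes 18, which addresses element 2 of the
// widened V2 in the v16i32 VPERMV3 index space.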
14687 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14688 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14689 } else {
14690 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14691 }
14692
14693 SDValue Result;
14694 if (V2.isUndef())
14695 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14696 else
14697 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14698
14699 if (VT != ShuffleVT)
14700 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14701
14702 return Result;
14703}
14704
14705/// Generic lowering of v16i8 shuffles.
14706///
14707/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14708/// detect any complexity reducing interleaving. If that doesn't help, it uses
14709/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14710/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14711/// back together.
14712 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14713 const APInt &Zeroable, SDValue V1, SDValue V2,
14714 const X86Subtarget &Subtarget,
14715 SelectionDAG &DAG) {
14716 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14717 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14718 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14719
14720 // Try to use shift instructions.
14721 if (SDValue Shift =
14722 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14723 DAG, /*BitwiseOnly*/ false))
14724 return Shift;
14725
14726 // Try to use byte rotation instructions.
14727 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14728 Subtarget, DAG))
14729 return Rotate;
14730
14731 // Use dedicated pack instructions for masks that match their pattern.
14732 if (SDValue V =
14733 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14734 return V;
14735
14736 // Try to use a zext lowering.
14737 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14738 Zeroable, Subtarget, DAG))
14739 return ZExt;
14740
14741 // Try to lower using a truncation.
14742 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14743 Subtarget, DAG))
14744 return V;
14745
14746 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14747 Subtarget, DAG))
14748 return V;
14749
14750 // See if we can use SSE4A Extraction / Insertion.
14751 if (Subtarget.hasSSE4A())
14752 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14753 Zeroable, DAG))
14754 return V;
14755
14756 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14757
14758 // For single-input shuffles, there are some nicer lowering tricks we can use.
14759 if (NumV2Elements == 0) {
14760 // Check for being able to broadcast a single element.
14761 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14762 Mask, Subtarget, DAG))
14763 return Broadcast;
14764
14765 // Try to use bit rotation instructions.
14766 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14767 Subtarget, DAG))
14768 return Rotate;
14769
14770 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14771 return V;
14772
14773 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14774 // Notably, this handles splat and partial-splat shuffles more efficiently.
14775 // However, it only makes sense if the pre-duplication shuffle simplifies
14776 // things significantly. Currently, this means we need to be able to
14777 // express the pre-duplication shuffle as an i16 shuffle.
14778 //
14779 // FIXME: We should check for other patterns which can be widened into an
14780 // i16 shuffle as well.
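// e.g. the splat-like mask <0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7> passes the check
// below: the pre-duplication i16 shuffle is an identity and the whole thing
// reduces to a single PUNPCKLBW of V1 with itself.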
14781 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14782 for (int i = 0; i < 16; i += 2)
14783 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14784 return false;
14785
14786 return true;
14787 };
14788 auto tryToWidenViaDuplication = [&]() -> SDValue {
14789 if (!canWidenViaDuplication(Mask))
14790 return SDValue();
14791 SmallVector<int, 4> LoInputs;
14792 copy_if(Mask, std::back_inserter(LoInputs),
14793 [](int M) { return M >= 0 && M < 8; });
14794 array_pod_sort(LoInputs.begin(), LoInputs.end());
14795 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14796 SmallVector<int, 4> HiInputs;
14797 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14798 array_pod_sort(HiInputs.begin(), HiInputs.end());
14799 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14800
14801 bool TargetLo = LoInputs.size() >= HiInputs.size();
14802 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14803 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14804
14805 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14806 SmallDenseMap<int, int, 8> LaneMap;
14807 for (int I : InPlaceInputs) {
14808 PreDupI16Shuffle[I/2] = I/2;
14809 LaneMap[I] = I;
14810 }
14811 int j = TargetLo ? 0 : 4, je = j + 4;
14812 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14813 // Check if j is already a shuffle of this input. This happens when
14814 // there are two adjacent bytes after we move the low one.
14815 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14816 // If we haven't yet mapped the input, search for a slot into which
14817 // we can map it.
14818 while (j < je && PreDupI16Shuffle[j] >= 0)
14819 ++j;
14820
14821 if (j == je)
14822 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14823 return SDValue();
14824
14825 // Map this input with the i16 shuffle.
14826 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14827 }
14828
14829 // Update the lane map based on the mapping we ended up with.
14830 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14831 }
14832 V1 = DAG.getBitcast(
14833 MVT::v16i8,
14834 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14835 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14836
14837 // Unpack the bytes to form the i16s that will be shuffled into place.
14838 bool EvenInUse = false, OddInUse = false;
14839 for (int i = 0; i < 16; i += 2) {
14840 EvenInUse |= (Mask[i + 0] >= 0);
14841 OddInUse |= (Mask[i + 1] >= 0);
14842 if (EvenInUse && OddInUse)
14843 break;
14844 }
14845 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14846 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14847 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14848
14849 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14850 for (int i = 0; i < 16; ++i)
14851 if (Mask[i] >= 0) {
14852 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14853 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14854 if (PostDupI16Shuffle[i / 2] < 0)
14855 PostDupI16Shuffle[i / 2] = MappedMask;
14856 else
14857 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14858 "Conflicting entries in the original shuffle!");
14859 }
14860 return DAG.getBitcast(
14861 MVT::v16i8,
14862 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14863 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14864 };
14865 if (SDValue V = tryToWidenViaDuplication())
14866 return V;
14867 }
14868
14869 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14870 Zeroable, Subtarget, DAG))
14871 return Masked;
14872
14873 // Use dedicated unpack instructions for masks that match their pattern.
14874 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14875 return V;
14876
14877 // Try to use byte shift instructions to mask.
14878 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14879 Zeroable, Subtarget, DAG))
14880 return V;
14881
14882 // Check for compaction patterns.
14883 bool IsSingleInput = V2.isUndef();
14884 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14885
14886 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14887 // with PSHUFB. It is important to do this before we attempt to generate any
14888 // blends but after all of the single-input lowerings. If the single input
14889 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14890 // want to preserve that and we can DAG combine any longer sequences into
14891 // a PSHUFB in the end. But once we start blending from multiple inputs,
14892 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14893 // and there are *very* few patterns that would actually be faster than the
14894 // PSHUFB approach because of its ability to zero lanes.
14895 //
14896 // If the mask is a binary compaction, we can more efficiently perform this
14897 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14898 //
14899 // FIXME: The only exceptions to the above are blends which are exact
14900 // interleavings with direct instructions supporting them. We currently don't
14901 // handle those well here.
14902 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14903 bool V1InUse = false;
14904 bool V2InUse = false;
14905
14906 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14907 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14908
14909 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14910 // do so. This avoids using them to handle blends-with-zero which is
14911 // important as a single pshufb is significantly faster for that.
14912 if (V1InUse && V2InUse) {
14913 if (Subtarget.hasSSE41())
14914 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14915 Zeroable, Subtarget, DAG))
14916 return Blend;
14917
14918 // We can use an unpack to do the blending rather than an or in some
14919 // cases. Even though the or may be (very minorly) more efficient, we
14920 // preference this lowering because there are common cases where part of
14921 // the complexity of the shuffles goes away when we do the final blend as
14922 // an unpack.
14923 // FIXME: It might be worth trying to detect if the unpack-feeding
14924 // shuffles will both be pshufb, in which case we shouldn't bother with
14925 // this.
14926 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14927 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14928 return Unpack;
14929
14930 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14931 if (Subtarget.hasVBMI())
14932 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14933 DAG);
14934
14935 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14936 if (Subtarget.hasXOP()) {
14937 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14938 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14939 }
14940
14941 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14942 // PALIGNR will be cheaper than the second PSHUFB+OR.
14943 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14944 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14945 return V;
14946 }
14947
14948 return PSHUFB;
14949 }
14950
14951 // There are special ways we can lower some single-element blends.
14952 if (NumV2Elements == 1)
14953 if (SDValue V = lowerShuffleAsElementInsertion(
14954 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14955 return V;
14956
14957 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14958 return Blend;
14959
14960 // Check whether a compaction lowering can be done. This handles shuffles
14961 // which take every Nth element for some even N. See the helper function for
14962 // details.
14963 //
14964 // We special case these as they can be particularly efficiently handled with
14965 // the PACKUSWB instruction on x86, and they show up in common patterns of
14966 // rearranging bytes to truncate wide elements.
14967 if (NumEvenDrops) {
14968 // NumEvenDrops is the power of two stride of the elements. Another way of
14969 // thinking about it is that we need to drop the even elements this many
14970 // times to get the original input.
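// e.g. the mask <0,2,4,...,30> takes every even byte of the V1:V2
// concatenation (NumEvenDrops == 1): ANDing each word with 0x00FF and a
// single PACKUSWB recreates it; a stride-4 mask (NumEvenDrops == 2) needs a
// second PACKUS round, as in the loop below.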
14971
14972 // First we need to zero all the dropped bytes.
14973 assert(NumEvenDrops <= 3 &&
14974 "No support for dropping even elements more than 3 times.");
14975 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14976 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14977 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14978 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14979 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14980 WordClearMask);
14981 if (!IsSingleInput)
14982 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14983 WordClearMask);
14984
14985 // Now pack things back together.
14986 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14987 IsSingleInput ? V1 : V2);
14988 for (int i = 1; i < NumEvenDrops; ++i) {
14989 Result = DAG.getBitcast(MVT::v8i16, Result);
14990 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14991 }
14992 return Result;
14993 }
14994
14995 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14996 if (NumOddDrops == 1) {
14997 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14998 DAG.getBitcast(MVT::v8i16, V1),
14999 DAG.getTargetConstant(8, DL, MVT::i8));
15000 if (!IsSingleInput)
15001 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15002 DAG.getBitcast(MVT::v8i16, V2),
15003 DAG.getTargetConstant(8, DL, MVT::i8));
15004 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15005 IsSingleInput ? V1 : V2);
15006 }
15007
15008 // Handle multi-input cases by blending/unpacking single-input shuffles.
15009 if (NumV2Elements > 0)
15010 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15011 Zeroable, Subtarget, DAG);
15012
15013 // The fallback path for single-input shuffles widens this into two v8i16
15014 // vectors with unpacks, shuffles those, and then pulls them back together
15015 // with a pack.
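// e.g. for output byte 5 with Mask[5] == 13, LoBlendMask[5] becomes 13; after
// the unpack below, word 5 of VHiHalf holds byte 13 of V zero-extended, the
// v8i16 shuffle moves it into word 5 of LoV, and the final PACKUS writes its
// low byte back into byte 5 of the result.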
15016 SDValue V = V1;
15017
15018 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15019 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15020 for (int i = 0; i < 16; ++i)
15021 if (Mask[i] >= 0)
15022 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15023
15024 SDValue VLoHalf, VHiHalf;
15025 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15026 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15027 // i16s.
15028 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15029 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15030 // Use a mask to drop the high bytes.
15031 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15032 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15033 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15034
15035 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15036 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15037
15038 // Squash the masks to point directly into VLoHalf.
15039 for (int &M : LoBlendMask)
15040 if (M >= 0)
15041 M /= 2;
15042 for (int &M : HiBlendMask)
15043 if (M >= 0)
15044 M /= 2;
15045 } else {
15046 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15047 // VHiHalf so that we can blend them as i16s.
15048 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15049
15050 VLoHalf = DAG.getBitcast(
15051 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15052 VHiHalf = DAG.getBitcast(
15053 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15054 }
15055
15056 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15057 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15058
15059 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15060}
15061
15062/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15063///
15064/// This routine breaks down the specific type of 128-bit shuffle and
15065/// dispatches to the lowering routines accordingly.
15066 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15067 MVT VT, SDValue V1, SDValue V2,
15068 const APInt &Zeroable,
15069 const X86Subtarget &Subtarget,
15070 SelectionDAG &DAG) {
15071 if (VT == MVT::v8bf16) {
15072 V1 = DAG.getBitcast(MVT::v8i16, V1);
15073 V2 = DAG.getBitcast(MVT::v8i16, V2);
15074 return DAG.getBitcast(VT,
15075 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15076 }
15077
15078 switch (VT.SimpleTy) {
15079 case MVT::v2i64:
15080 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15081 case MVT::v2f64:
15082 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15083 case MVT::v4i32:
15084 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15085 case MVT::v4f32:
15086 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15087 case MVT::v8i16:
15088 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15089 case MVT::v8f16:
15090 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15091 case MVT::v16i8:
15092 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15093
15094 default:
15095 llvm_unreachable("Unimplemented!");
15096 }
15097}
15098
15099/// Generic routine to split vector shuffle into half-sized shuffles.
15100///
15101/// This routine just extracts two subvectors, shuffles them independently, and
15102/// then concatenates them back together. This should work effectively with all
15103/// AVX vector shuffle types.
15104 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15105 SDValue V2, ArrayRef<int> Mask,
15106 SelectionDAG &DAG, bool SimpleOnly) {
15107 assert(VT.getSizeInBits() >= 256 &&
15108 "Only for 256-bit or wider vector shuffles!");
15109 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15110 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15111
15112 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15113 if (VT == MVT::v8f32) {
15114 SDValue BC1 = peekThroughBitcasts(V1);
15115 SDValue BC2 = peekThroughBitcasts(V2);
15116 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15117 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15118 DAG, SimpleOnly))
15119 return DAG.getBitcast(VT, Split);
15120 }
15121 }
15122
15123 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15124 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
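// e.g. splitting a v8i32 shuffle with mask <0,8,1,9,6,14,7,15>: LoMask is
// <0,8,1,9> and becomes shuffle(LoV1, LoV2, <0,4,1,5>), HiMask is <6,14,7,15>
// and becomes shuffle(HiV1, HiV2, <2,6,3,7>), and the two v4i32 results are
// concatenated at the end.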
15125
15126 int NumElements = VT.getVectorNumElements();
15127 int SplitNumElements = NumElements / 2;
15128 MVT ScalarVT = VT.getVectorElementType();
15129 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15130
15131 // Use splitVector/extractSubVector so that split build-vectors just build two
15132 // narrower build vectors. This helps shuffling with splats and zeros.
15133 auto SplitVector = [&](SDValue V) {
15134 SDValue LoV, HiV;
15135 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15136 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15137 DAG.getBitcast(SplitVT, HiV));
15138 };
15139
15140 SDValue LoV1, HiV1, LoV2, HiV2;
15141 std::tie(LoV1, HiV1) = SplitVector(V1);
15142 std::tie(LoV2, HiV2) = SplitVector(V2);
15143
15144 // Now create two 4-way blends of these half-width vectors.
15145 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15146 bool &UseHiV1, bool &UseLoV2,
15147 bool &UseHiV2) {
15148 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15149 for (int i = 0; i < SplitNumElements; ++i) {
15150 int M = HalfMask[i];
15151 if (M >= NumElements) {
15152 if (M >= NumElements + SplitNumElements)
15153 UseHiV2 = true;
15154 else
15155 UseLoV2 = true;
15156 } else if (M >= 0) {
15157 if (M >= SplitNumElements)
15158 UseHiV1 = true;
15159 else
15160 UseLoV1 = true;
15161 }
15162 }
15163 };
15164
15165 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15166 if (!SimpleOnly)
15167 return true;
15168
15169 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15170 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15171
15172 return !(UseHiV1 || UseHiV2);
15173 };
15174
15175 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15176 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15177 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15178 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15179 for (int i = 0; i < SplitNumElements; ++i) {
15180 int M = HalfMask[i];
15181 if (M >= NumElements) {
15182 V2BlendMask[i] = M - NumElements;
15183 BlendMask[i] = SplitNumElements + i;
15184 } else if (M >= 0) {
15185 V1BlendMask[i] = M;
15186 BlendMask[i] = i;
15187 }
15188 }
15189
15190 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15191 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15192
15193 // Because the lowering happens after all combining takes place, we need to
15194 // manually combine these blend masks as much as possible so that we create
15195 // a minimal number of high-level vector shuffle nodes.
15196 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15197
15198 // First try just blending the halves of V1 or V2.
15199 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15200 return DAG.getUNDEF(SplitVT);
15201 if (!UseLoV2 && !UseHiV2)
15202 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15203 if (!UseLoV1 && !UseHiV1)
15204 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15205
15206 SDValue V1Blend, V2Blend;
15207 if (UseLoV1 && UseHiV1) {
15208 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15209 } else {
15210 // We only use half of V1 so map the usage down into the final blend mask.
15211 V1Blend = UseLoV1 ? LoV1 : HiV1;
15212 for (int i = 0; i < SplitNumElements; ++i)
15213 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15214 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15215 }
15216 if (UseLoV2 && UseHiV2) {
15217 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15218 } else {
15219 // We only use half of V2 so map the usage down into the final blend mask.
15220 V2Blend = UseLoV2 ? LoV2 : HiV2;
15221 for (int i = 0; i < SplitNumElements; ++i)
15222 if (BlendMask[i] >= SplitNumElements)
15223 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15224 }
15225 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15226 };
15227
15228 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15229 return SDValue();
15230
15231 SDValue Lo = HalfBlend(LoMask);
15232 SDValue Hi = HalfBlend(HiMask);
15233 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15234}
15235
15236/// Either split a vector in halves or decompose the shuffles and the
15237/// blend/unpack.
15238///
15239/// This is provided as a good fallback for many lowerings of non-single-input
15240/// shuffles with more than one 128-bit lane. In those cases, we want to select
15241/// between splitting the shuffle into 128-bit components and stitching those
15242/// back together vs. extracting the single-input shuffles and blending those
15243/// results.
15244 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15245 SDValue V2, ArrayRef<int> Mask,
15246 const APInt &Zeroable,
15247 const X86Subtarget &Subtarget,
15248 SelectionDAG &DAG) {
15249 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15250 "shuffles as it could then recurse on itself.");
15251 int Size = Mask.size();
15252
15253 // If this can be modeled as a broadcast of two elements followed by a blend,
15254 // prefer that lowering. This is especially important because broadcasts can
15255 // often fold with memory operands.
15256 auto DoBothBroadcast = [&] {
15257 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15258 for (int M : Mask)
15259 if (M >= Size) {
15260 if (V2BroadcastIdx < 0)
15261 V2BroadcastIdx = M - Size;
15262 else if ((M - Size) != V2BroadcastIdx &&
15263 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15264 return false;
15265 } else if (M >= 0) {
15266 if (V1BroadcastIdx < 0)
15267 V1BroadcastIdx = M;
15268 else if (M != V1BroadcastIdx &&
15269 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15270 return false;
15271 }
15272 return true;
15273 };
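// e.g. a v4f64 shuffle with mask <0,6,0,6> only ever reads V1[0] and V2[2],
// so DoBothBroadcast() is true and this lowers as two broadcasts plus a blend.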
15274 if (DoBothBroadcast())
15275 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15276 Subtarget, DAG);
15277
15278 // If the inputs all stem from a single 128-bit lane of each input, then we
15279 // split them rather than blending because the split will decompose to
15280 // unusually few instructions.
15281 int LaneCount = VT.getSizeInBits() / 128;
15282 int LaneSize = Size / LaneCount;
15283 SmallBitVector LaneInputs[2];
15284 LaneInputs[0].resize(LaneCount, false);
15285 LaneInputs[1].resize(LaneCount, false);
15286 for (int i = 0; i < Size; ++i)
15287 if (Mask[i] >= 0)
15288 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15289 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15290 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15291 /*SimpleOnly*/ false);
15292
15293 // Without AVX2, if we can freely split the subvectors then we're better off
15294 // performing half width shuffles.
15295 if (!Subtarget.hasAVX2()) {
15296 SDValue BC1 = peekThroughBitcasts(V1);
15297 SDValue BC2 = peekThroughBitcasts(V2);
15298 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15299 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15300 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15301 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15302 if (SplatOrSplitV1 && SplatOrSplitV2)
15303 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15304 /*SimpleOnly*/ false);
15305 }
15306
15307 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15308 // requires that the decomposed single-input shuffles don't end up here.
15309 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15310 Subtarget, DAG);
15311}
15312
15313// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15314// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15315 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15316 SDValue V1, SDValue V2,
15317 ArrayRef<int> Mask,
15318 SelectionDAG &DAG) {
15319 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15320
15321 int LHSMask[4] = {-1, -1, -1, -1};
15322 int RHSMask[4] = {-1, -1, -1, -1};
15323 int SHUFPDMask[4] = {-1, -1, -1, -1};
15324
15325 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15326 // perform the shuffle once the lanes have been shuffled in place.
15327 for (int i = 0; i != 4; ++i) {
15328 int M = Mask[i];
15329 if (M < 0)
15330 continue;
15331 int LaneBase = i & ~1;
15332 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15333 LaneMask[LaneBase + (M & 1)] = M;
15334 SHUFPDMask[i] = M & 1;
15335 }
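// e.g. Mask <1,6,3,4> produces LHSMask <u,1,u,3>, RHSMask <6,u,4,u> and a
// SHUFPD selection of <1,0,1,0>: even result elements come from LHS and odd
// ones from RHS within each 128-bit lane.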
15336
15337 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15338 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15339 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15340 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15341}
15342
15343/// Lower a vector shuffle crossing multiple 128-bit lanes as
15344/// a lane permutation followed by a per-lane permutation.
15345///
15346/// This is mainly for cases where we can have non-repeating permutes
15347/// in each lane.
15348///
15349/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15350/// we should investigate merging them.
15351 static SDValue lowerShuffleAsLanePermuteAndPermute(
15352 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15353 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15354 int NumElts = VT.getVectorNumElements();
15355 int NumLanes = VT.getSizeInBits() / 128;
15356 int NumEltsPerLane = NumElts / NumLanes;
15357 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15358
15359 /// Attempts to find a sublane permute with the given size
15360 /// that gets all elements into their target lanes.
15361 ///
15362 /// If successful, fills CrossLaneMask and InLaneMask and returns the shuffle.
15363 /// If unsuccessful, returns SDValue() and may overwrite InLaneMask.
15364 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15365 int NumSublanesPerLane = NumSublanes / NumLanes;
15366 int NumEltsPerSublane = NumElts / NumSublanes;
15367
15368 SmallVector<int, 16> CrossLaneMask;
15369 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15370 // CrossLaneMask but one entry == one sublane.
15371 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15372 APInt DemandedCrossLane = APInt::getZero(NumElts);
15373
15374 for (int i = 0; i != NumElts; ++i) {
15375 int M = Mask[i];
15376 if (M < 0)
15377 continue;
15378
15379 int SrcSublane = M / NumEltsPerSublane;
15380 int DstLane = i / NumEltsPerLane;
15381
15382 // We only need to get the elements into the right lane, not sublane.
15383 // So search all sublanes that make up the destination lane.
15384 bool Found = false;
15385 int DstSubStart = DstLane * NumSublanesPerLane;
15386 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15387 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15388 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15389 continue;
15390
15391 Found = true;
15392 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15393 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15394 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15395 DemandedCrossLane.setBit(InLaneMask[i]);
15396 break;
15397 }
15398 if (!Found)
15399 return SDValue();
15400 }
15401
15402 // Fill CrossLaneMask using CrossLaneMaskLarge.
15403 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15404
15405 if (!CanUseSublanes) {
15406 // If we're only shuffling a single lowest lane and the rest are identity
15407 // then don't bother.
15408 // TODO - isShuffleMaskInputInPlace could be extended to something like
15409 // this.
15410 int NumIdentityLanes = 0;
15411 bool OnlyShuffleLowestLane = true;
15412 for (int i = 0; i != NumLanes; ++i) {
15413 int LaneOffset = i * NumEltsPerLane;
15414 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15415 i * NumEltsPerLane))
15416 NumIdentityLanes++;
15417 else if (CrossLaneMask[LaneOffset] != 0)
15418 OnlyShuffleLowestLane = false;
15419 }
15420 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15421 return SDValue();
15422 }
15423
15424 // Simplify CrossLaneMask based on the actual demanded elements.
15425 if (V1.hasOneUse())
15426 for (int i = 0; i != NumElts; ++i)
15427 if (!DemandedCrossLane[i])
15428 CrossLaneMask[i] = SM_SentinelUndef;
15429
15430 // Avoid returning the same shuffle operation. For example,
15431 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15432 // undef:v16i16
15433 if (CrossLaneMask == Mask || InLaneMask == Mask)
15434 return SDValue();
15435
15436 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15437 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15438 InLaneMask);
15439 };
15440
15441 // First attempt a solution with full lanes.
15442 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15443 return V;
15444
15445 // The rest of the solutions use sublanes.
15446 if (!CanUseSublanes)
15447 return SDValue();
15448
15449 // Then attempt a solution with 64-bit sublanes (vpermq).
15450 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15451 return V;
15452
15453 // If that doesn't work and we have fast variable cross-lane shuffle,
15454 // attempt 32-bit sublanes (vpermd).
15455 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15456 return SDValue();
15457
15458 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15459}
15460
15461 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15462static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15463 SmallVector<int> &InLaneMask) {
15464 int Size = Mask.size();
15465 InLaneMask.assign(Mask.begin(), Mask.end());
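// e.g. for a v8f32 lane-crossing mask <6,7,4,5,2,3,0,1> with LaneSize == 4,
// InLaneMask becomes <10,11,8,9,14,15,12,13>: every crossing reference is
// rewritten as an in-lane index into the second operand (>= Size), which the
// caller below passes as a lane-flipped copy of the input.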
15466 for (int i = 0; i < Size; ++i) {
15467 int &M = InLaneMask[i];
15468 if (M < 0)
15469 continue;
15470 if (((M % Size) / LaneSize) != (i / LaneSize))
15471 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15472 }
15473}
15474
15475/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15476/// source with a lane permutation.
15477///
15478/// This lowering strategy results in four instructions in the worst case for a
15479/// single-input cross lane shuffle which is lower than any other fully general
15480/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15481/// shuffle pattern should be handled prior to trying this lowering.
15482 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15483 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15484 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15485 // FIXME: This should probably be generalized for 512-bit vectors as well.
15486 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15487 int Size = Mask.size();
15488 int LaneSize = Size / 2;
15489
15490 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15491 // Only do this if the elements aren't all from the lower lane,
15492 // otherwise we're (probably) better off doing a split.
15493 if (VT == MVT::v4f64 &&
15494 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15495 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15496
15497 // If there are only inputs from one 128-bit lane, splitting will in fact be
15498 // less expensive. The flags track whether the given lane contains an element
15499 // that crosses to another lane.
15500 bool AllLanes;
15501 if (!Subtarget.hasAVX2()) {
15502 bool LaneCrossing[2] = {false, false};
15503 for (int i = 0; i < Size; ++i)
15504 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15505 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15506 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15507 } else {
15508 bool LaneUsed[2] = {false, false};
15509 for (int i = 0; i < Size; ++i)
15510 if (Mask[i] >= 0)
15511 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15512 AllLanes = LaneUsed[0] && LaneUsed[1];
15513 }
15514
15515 // TODO - we could support shuffling V2 in the Flipped input.
15516 assert(V2.isUndef() &&
15517 "This last part of this routine only works on single input shuffles");
15518
15519 SmallVector<int> InLaneMask;
15520 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15521
15522 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15523 "In-lane shuffle mask expected");
15524
15525 // If we're not using both lanes in each lane and the inlane mask is not
15526 // repeating, then we're better off splitting.
15527 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15528 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15529 /*SimpleOnly*/ false);
15530
15531 // Flip the lanes, and shuffle the results which should now be in-lane.
15532 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15533 SDValue Flipped = DAG.getBitcast(PVT, V1);
15534 Flipped =
15535 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15536 Flipped = DAG.getBitcast(VT, Flipped);
15537 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15538}
15539
15540/// Handle lowering 2-lane 128-bit shuffles.
15541 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15542 SDValue V2, ArrayRef<int> Mask,
15543 const APInt &Zeroable,
15544 const X86Subtarget &Subtarget,
15545 SelectionDAG &DAG) {
15546 if (V2.isUndef()) {
15547 // Attempt to match VBROADCAST*128 subvector broadcast load.
15548 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15549 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15550 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15551 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15552 MVT MemVT = VT.getHalfNumVectorElementsVT();
15553 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15554 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15555 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15556 VT, MemVT, Ld, Ofs, DAG))
15557 return BcstLd;
15558 }
15559
15560 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15561 if (Subtarget.hasAVX2())
15562 return SDValue();
15563 }
15564
15565 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15566
15567 SmallVector<int, 4> WidenedMask;
15568 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15569 return SDValue();
15570
15571 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15572 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15573
15574 // Try to use an insert into a zero vector.
15575 if (WidenedMask[0] == 0 && IsHighZero) {
15576 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15577 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15578 DAG.getVectorIdxConstant(0, DL));
15579 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15580 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15581 DAG.getVectorIdxConstant(0, DL));
15582 }
15583
15584 // TODO: If minimizing size and one of the inputs is a zero vector and the
15585 // zero vector has only one use, we could use a VPERM2X128 to save the
15586 // instruction bytes needed to explicitly generate the zero vector.
15587
15588 // Blends are faster and handle all the non-lane-crossing cases.
15589 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15590 Subtarget, DAG))
15591 return Blend;
15592
15593 // If either input operand is a zero vector, use VPERM2X128 because its mask
15594 // allows us to replace the zero input with an implicit zero.
15595 if (!IsLowZero && !IsHighZero) {
15596 // Check for patterns which can be matched with a single insert of a 128-bit
15597 // subvector.
15598 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15599 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15600
15601 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15602 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15603 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15604 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15605 SDValue SubVec =
15606 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15607 DAG.getVectorIdxConstant(0, DL));
15608 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15609 DAG.getVectorIdxConstant(2, DL));
15610 }
15611 }
15612
15613 // Try to use SHUF128 if possible.
15614 if (Subtarget.hasVLX()) {
15615 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15616 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15617 ((WidenedMask[1] % 2) << 1);
15618 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15619 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15620 }
15621 }
15622 }
15623
15624 // Otherwise form a 128-bit permutation. After accounting for undefs,
15625 // convert the 64-bit shuffle mask selection values into 128-bit
15626 // selection bits by dividing the indexes by 2 and shifting into positions
15627 // defined by a vperm2*128 instruction's immediate control byte.
15628
15629 // The immediate permute control byte looks like this:
15630 // [1:0] - select 128 bits from sources for low half of destination
15631 // [2] - ignore
15632 // [3] - zero low half of destination
15633 // [5:4] - select 128 bits from sources for high half of destination
15634 // [6] - ignore
15635 // [7] - zero high half of destination
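// e.g. a widened mask <1,2> (low half from V1's upper 128 bits, high half
// from V2's lower 128 bits) encodes as 0x21; if the high half were zeroable
// instead, bit 7 would be set, giving 0x81.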
15636
15637 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15638 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15639
15640 unsigned PermMask = 0;
15641 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15642 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15643
15644 // Check the immediate mask and replace unused sources with undef.
15645 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15646 V1 = DAG.getUNDEF(VT);
15647 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15648 V2 = DAG.getUNDEF(VT);
15649
15650 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15651 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15652}
15653
15654/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15655/// shuffling each lane.
15656///
15657/// This attempts to create a repeated lane shuffle where each lane uses one
15658/// or two of the lanes of the inputs. The lanes of the input vectors are
15659/// shuffled in one or two independent shuffles to get the lanes into the
15660/// position needed by the final shuffle.
15661 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15662 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15663 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15664 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15665
15666 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15667 return SDValue();
15668
15669 int NumElts = Mask.size();
15670 int NumLanes = VT.getSizeInBits() / 128;
15671 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15672 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15673 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15674
15675 // First pass will try to fill in the RepeatMask from lanes that need two
15676 // sources.
15677 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15678 int Srcs[2] = {-1, -1};
15679 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15680 for (int i = 0; i != NumLaneElts; ++i) {
15681 int M = Mask[(Lane * NumLaneElts) + i];
15682 if (M < 0)
15683 continue;
15684 // Determine which of the possible input lanes (NumLanes from each source)
15685 // this element comes from. Assign that as one of the sources for this
15686 // lane. We can assign up to 2 sources for this lane. If we run out
15687 // of sources we can't do anything.
15688 int LaneSrc = M / NumLaneElts;
15689 int Src;
15690 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15691 Src = 0;
15692 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15693 Src = 1;
15694 else
15695 return SDValue();
15696
15697 Srcs[Src] = LaneSrc;
15698 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15699 }
15700
15701 // If this lane has two sources, see if it fits with the repeat mask so far.
15702 if (Srcs[1] < 0)
15703 continue;
15704
15705 LaneSrcs[Lane][0] = Srcs[0];
15706 LaneSrcs[Lane][1] = Srcs[1];
15707
15708 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15709 assert(M1.size() == M2.size() && "Unexpected mask size");
15710 for (int i = 0, e = M1.size(); i != e; ++i)
15711 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15712 return false;
15713 return true;
15714 };
15715
15716 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15717 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15718 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15719 int M = Mask[i];
15720 if (M < 0)
15721 continue;
15722 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15723 "Unexpected mask element");
15724 MergedMask[i] = M;
15725 }
15726 };
15727
15728 if (MatchMasks(InLaneMask, RepeatMask)) {
15729 // Merge this lane mask into the final repeat mask.
15730 MergeMasks(InLaneMask, RepeatMask);
15731 continue;
15732 }
15733
15734 // Didn't find a match. Swap the operands and try again.
15735 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15736 ShuffleVectorSDNode::commuteMask(InLaneMask);
15737
15738 if (MatchMasks(InLaneMask, RepeatMask)) {
15739 // Merge this lane mask into the final repeat mask.
15740 MergeMasks(InLaneMask, RepeatMask);
15741 continue;
15742 }
15743
15744 // Couldn't find a match with the operands in either order.
15745 return SDValue();
15746 }
15747
15748 // Now handle any lanes with only one source.
15749 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15750 // If this lane has already been processed, skip it.
15751 if (LaneSrcs[Lane][0] >= 0)
15752 continue;
15753
15754 for (int i = 0; i != NumLaneElts; ++i) {
15755 int M = Mask[(Lane * NumLaneElts) + i];
15756 if (M < 0)
15757 continue;
15758
15759 // If RepeatMask isn't defined yet, we can define it ourselves.
15760 if (RepeatMask[i] < 0)
15761 RepeatMask[i] = M % NumLaneElts;
15762
15763 if (RepeatMask[i] < NumElts) {
15764 if (RepeatMask[i] != M % NumLaneElts)
15765 return SDValue();
15766 LaneSrcs[Lane][0] = M / NumLaneElts;
15767 } else {
15768 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15769 return SDValue();
15770 LaneSrcs[Lane][1] = M / NumLaneElts;
15771 }
15772 }
15773
15774 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15775 return SDValue();
15776 }
15777
15778 SmallVector<int, 16> NewMask(NumElts, -1);
15779 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15780 int Src = LaneSrcs[Lane][0];
15781 for (int i = 0; i != NumLaneElts; ++i) {
15782 int M = -1;
15783 if (Src >= 0)
15784 M = Src * NumLaneElts + i;
15785 NewMask[Lane * NumLaneElts + i] = M;
15786 }
15787 }
15788 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15789 // Ensure we didn't get back the shuffle we started with.
15790 // FIXME: This is a hack to make up for some splat handling code in
15791 // getVectorShuffle.
15792 if (isa<ShuffleVectorSDNode>(NewV1) &&
15793 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15794 return SDValue();
15795
15796 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15797 int Src = LaneSrcs[Lane][1];
15798 for (int i = 0; i != NumLaneElts; ++i) {
15799 int M = -1;
15800 if (Src >= 0)
15801 M = Src * NumLaneElts + i;
15802 NewMask[Lane * NumLaneElts + i] = M;
15803 }
15804 }
15805 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15806 // Ensure we didn't get back the shuffle we started with.
15807 // FIXME: This is a hack to make up for some splat handling code in
15808 // getVectorShuffle.
15809 if (isa<ShuffleVectorSDNode>(NewV2) &&
15810 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15811 return SDValue();
15812
15813 for (int i = 0; i != NumElts; ++i) {
15814 if (Mask[i] < 0) {
15815 NewMask[i] = -1;
15816 continue;
15817 }
15818 NewMask[i] = RepeatMask[i % NumLaneElts];
15819 if (NewMask[i] < 0)
15820 continue;
15821
15822 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15823 }
15824 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15825}
15826
15827/// If the input shuffle mask results in a vector that is undefined in all upper
15828/// or lower half elements and that mask accesses only 2 halves of the
15829/// shuffle's operands, return true. A mask of half the width with mask indexes
15830/// adjusted to access the extracted halves of the original shuffle operands is
15831/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15832/// lower half of each input operand is accessed.
15833static bool
15834 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15835 int &HalfIdx1, int &HalfIdx2) {
15836 assert((Mask.size() == HalfMask.size() * 2) &&
15837 "Expected input mask to be twice as long as output");
15838
15839 // Exactly one half of the result must be undef to allow narrowing.
15840 bool UndefLower = isUndefLowerHalf(Mask);
15841 bool UndefUpper = isUndefUpperHalf(Mask);
15842 if (UndefLower == UndefUpper)
15843 return false;
15844
15845 unsigned HalfNumElts = HalfMask.size();
15846 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15847 HalfIdx1 = -1;
15848 HalfIdx2 = -1;
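// e.g. a v8f32 mask <u,u,u,u,0,1,8,9> has an undef lower half; it yields
// HalfMask <0,1,4,5> with HalfIdx1 == 0 (low half of V1) and HalfIdx2 == 2
// (low half of V2).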
15849 for (unsigned i = 0; i != HalfNumElts; ++i) {
15850 int M = Mask[i + MaskIndexOffset];
15851 if (M < 0) {
15852 HalfMask[i] = M;
15853 continue;
15854 }
15855
15856 // Determine which of the 4 half vectors this element is from.
15857 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15858 int HalfIdx = M / HalfNumElts;
15859
15860 // Determine the element index into its half vector source.
15861 int HalfElt = M % HalfNumElts;
15862
15863 // We can shuffle with up to 2 half vectors, set the new 'half'
15864 // shuffle mask accordingly.
15865 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15866 HalfMask[i] = HalfElt;
15867 HalfIdx1 = HalfIdx;
15868 continue;
15869 }
15870 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15871 HalfMask[i] = HalfElt + HalfNumElts;
15872 HalfIdx2 = HalfIdx;
15873 continue;
15874 }
15875
15876 // Too many half vectors referenced.
15877 return false;
15878 }
15879
15880 return true;
15881}
15882
15883/// Given the output values from getHalfShuffleMask(), create a half width
15884/// shuffle of extracted vectors followed by an insert back to full width.
15885 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15886 ArrayRef<int> HalfMask, int HalfIdx1,
15887 int HalfIdx2, bool UndefLower,
15888 SelectionDAG &DAG, bool UseConcat = false) {
15889 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15890 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15891
15892 MVT VT = V1.getSimpleValueType();
15893 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15894 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15895
15896 auto getHalfVector = [&](int HalfIdx) {
15897 if (HalfIdx < 0)
15898 return DAG.getUNDEF(HalfVT);
15899 SDValue V = (HalfIdx < 2 ? V1 : V2);
15900 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15901 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15902 DAG.getVectorIdxConstant(HalfIdx, DL));
15903 };
15904
15905 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15906 SDValue Half1 = getHalfVector(HalfIdx1);
15907 SDValue Half2 = getHalfVector(HalfIdx2);
15908 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15909 if (UseConcat) {
15910 SDValue Op0 = V;
15911 SDValue Op1 = DAG.getUNDEF(HalfVT);
15912 if (UndefLower)
15913 std::swap(Op0, Op1);
15914 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15915 }
15916
15917 unsigned Offset = UndefLower ? HalfNumElts : 0;
15918 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15919 DAG.getVectorIdxConstant(Offset, DL));
15920}
15921
15922/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15923/// This allows for fast cases such as subvector extraction/insertion
15924/// or shuffling smaller vector types which can lower more efficiently.
15925 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15926 SDValue V2, ArrayRef<int> Mask,
15927 const X86Subtarget &Subtarget,
15928 SelectionDAG &DAG) {
15929 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15930 "Expected 256-bit or 512-bit vector");
15931
15932 bool UndefLower = isUndefLowerHalf(Mask);
15933 if (!UndefLower && !isUndefUpperHalf(Mask))
15934 return SDValue();
15935
15936 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15937 "Completely undef shuffle mask should have been simplified already");
15938
15939 // Upper half is undef and lower half is whole upper subvector.
15940 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15941 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15942 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15943 if (!UndefLower &&
15944 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15945 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15946 DAG.getVectorIdxConstant(HalfNumElts, DL));
15947 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15948 DAG.getVectorIdxConstant(0, DL));
15949 }
15950
15951 // Lower half is undef and upper half is whole lower subvector.
15952 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15953 if (UndefLower &&
15954 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15955 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15956 DAG.getVectorIdxConstant(0, DL));
15957 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15958 DAG.getVectorIdxConstant(HalfNumElts, DL));
15959 }
15960
15961 int HalfIdx1, HalfIdx2;
15962 SmallVector<int, 8> HalfMask(HalfNumElts);
15963 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15964 return SDValue();
15965
15966 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15967
15968 // Only shuffle the halves of the inputs when useful.
15969 unsigned NumLowerHalves =
15970 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15971 unsigned NumUpperHalves =
15972 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15973 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15974
15975 // Determine the larger pattern of undef/halves, then decide if it's worth
15976 // splitting the shuffle based on subtarget capabilities and types.
15977 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15978 if (!UndefLower) {
15979 // XXXXuuuu: no insert is needed.
15980 // Always extract lowers when setting lower - these are all free subreg ops.
15981 if (NumUpperHalves == 0)
15982 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15983 UndefLower, DAG);
15984
15985 if (NumUpperHalves == 1) {
15986 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15987 if (Subtarget.hasAVX2()) {
15988 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
15989 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15990 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15991 (!isSingleSHUFPSMask(HalfMask) ||
15992 Subtarget.hasFastVariableCrossLaneShuffle()))
15993 return SDValue();
15994 // If this is a unary shuffle (assume that the 2nd operand is
15995 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15996 // are better off extracting the upper half of 1 operand and using a
15997 // narrow shuffle.
15998 if (EltWidth == 64 && V2.isUndef())
15999 return SDValue();
16000 // If this is a unary vXi8 shuffle with in-place halves, then perform as
16001 // full width pshufb, and then merge.
16002 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16003 return SDValue();
16004 }
16005 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16006 if (Subtarget.hasAVX512() && VT.is512BitVector())
16007 return SDValue();
16008 // Extract + narrow shuffle is better than the wide alternative.
16009 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16010 UndefLower, DAG);
16011 }
16012
16013 // Don't extract both uppers, instead shuffle and then extract.
16014 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16015 return SDValue();
16016 }
16017
16018 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16019 if (NumUpperHalves == 0) {
16020 // AVX2 has efficient 64-bit element cross-lane shuffles.
16021 // TODO: Refine to account for unary shuffle, splat, and other masks?
16022 if (Subtarget.hasAVX2() && EltWidth == 64)
16023 return SDValue();
16024 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16025 if (Subtarget.hasAVX512() && VT.is512BitVector())
16026 return SDValue();
16027 // Narrow shuffle + insert is better than the wide alternative.
16028 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16029 UndefLower, DAG);
16030 }
16031
16032 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16033 return SDValue();
16034}
16035
16036/// Handle case where shuffle sources are coming from the same 128-bit lane and
16037/// every lane can be represented as the same repeating mask - allowing us to
16038/// shuffle the sources with the repeating shuffle and then permute the result
16039/// to the destination lanes.
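/// For example, on AVX2 targets the v4f64 mask <1, 3, 0, 2> becomes the
/// in-lane repeating shuffle <1, 0, 3, 2> followed by the 64-bit sub-lane
/// permute <0, 2, 1, 3>.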
16040 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16041 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16042 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16043 int NumElts = VT.getVectorNumElements();
16044 int NumLanes = VT.getSizeInBits() / 128;
16045 int NumLaneElts = NumElts / NumLanes;
16046
16047 // On AVX2 we may be able to just shuffle the lowest elements and then
16048 // broadcast the result.
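// e.g. for v8i32 <1, 0, 1, 0, 1, 0, 1, 0> we shuffle <1, 0> into the lowest
// 64 bits and then broadcast that 64-bit element across the vector.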
16049 if (Subtarget.hasAVX2()) {
16050 for (unsigned BroadcastSize : {16, 32, 64}) {
16051 if (BroadcastSize <= VT.getScalarSizeInBits())
16052 continue;
16053 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16054
16055 // Attempt to match a repeating pattern every NumBroadcastElts,
16056 // accounting for UNDEFs but only referencing the lowest 128-bit
16057 // lane of the inputs.
16058 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16059 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16060 for (int j = 0; j != NumBroadcastElts; ++j) {
16061 int M = Mask[i + j];
16062 if (M < 0)
16063 continue;
16064 int &R = RepeatMask[j];
16065 if (0 != ((M % NumElts) / NumLaneElts))
16066 return false;
16067 if (0 <= R && R != M)
16068 return false;
16069 R = M;
16070 }
16071 return true;
16072 };
16073
16074 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16075 if (!FindRepeatingBroadcastMask(RepeatMask))
16076 continue;
16077
16078 // Shuffle the (lowest) repeated elements in place for broadcast.
16079 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16080
16081 // Shuffle the actual broadcast.
16082 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16083 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16084 for (int j = 0; j != NumBroadcastElts; ++j)
16085 BroadcastMask[i + j] = j;
16086
16087 // Avoid returning the same shuffle operation. For example,
16088 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16089 if (BroadcastMask == Mask)
16090 return SDValue();
16091
16092 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16093 BroadcastMask);
16094 }
16095 }
16096
16097 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16098 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16099 return SDValue();
16100
16101 // Bail if we already have a repeated lane shuffle mask.
16102 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16103 return SDValue();
16104
16105 // Helper to look for a repeated mask in each split sub-lane, checking that
16106 // those sub-lanes can then be permuted into place.
16107 auto ShuffleSubLanes = [&](int SubLaneScale) {
16108 int NumSubLanes = NumLanes * SubLaneScale;
16109 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16110
16111 // Check that all the sources are coming from the same lane and see if we
16112 // can form a repeating shuffle mask (local to each sub-lane). At the same
16113 // time, determine the source sub-lane for each destination sub-lane.
16114 int TopSrcSubLane = -1;
16115 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16116 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16117 SubLaneScale,
16118 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16119
16120 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16121 // Extract the sub-lane mask, check that it all comes from the same lane
16122 // and normalize the mask entries to come from the first lane.
16123 int SrcLane = -1;
16124 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16125 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16126 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16127 if (M < 0)
16128 continue;
16129 int Lane = (M % NumElts) / NumLaneElts;
16130 if ((0 <= SrcLane) && (SrcLane != Lane))
16131 return SDValue();
16132 SrcLane = Lane;
16133 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16134 SubLaneMask[Elt] = LocalM;
16135 }
16136
16137 // Whole sub-lane is UNDEF.
16138 if (SrcLane < 0)
16139 continue;
16140
16141 // Attempt to match against the candidate repeated sub-lane masks.
16142 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16143 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16144 for (int i = 0; i != NumSubLaneElts; ++i) {
16145 if (M1[i] < 0 || M2[i] < 0)
16146 continue;
16147 if (M1[i] != M2[i])
16148 return false;
16149 }
16150 return true;
16151 };
16152
16153 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16154 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16155 continue;
16156
16157 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16158 for (int i = 0; i != NumSubLaneElts; ++i) {
16159 int M = SubLaneMask[i];
16160 if (M < 0)
16161 continue;
16162 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16163 "Unexpected mask element");
16164 RepeatedSubLaneMask[i] = M;
16165 }
16166
16167 // Track the top most source sub-lane - by setting the remaining to
16168 // UNDEF we can greatly simplify shuffle matching.
16169 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16170 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16171 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16172 break;
16173 }
16174
16175 // Bail if we failed to find a matching repeated sub-lane mask.
16176 if (Dst2SrcSubLanes[DstSubLane] < 0)
16177 return SDValue();
16178 }
16179 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16180 "Unexpected source lane");
16181
16182 // Create a repeating shuffle mask for the entire vector.
16183 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16184 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16185 int Lane = SubLane / SubLaneScale;
16186 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16187 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16188 int M = RepeatedSubLaneMask[Elt];
16189 if (M < 0)
16190 continue;
16191 int Idx = (SubLane * NumSubLaneElts) + Elt;
16192 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16193 }
16194 }
16195
16196 // Shuffle each source sub-lane to its destination.
16197 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16198 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16199 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16200 if (SrcSubLane < 0)
16201 continue;
16202 for (int j = 0; j != NumSubLaneElts; ++j)
16203 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16204 }
16205
16206 // Avoid returning the same shuffle operation.
16207 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16208 if (RepeatedMask == Mask || SubLaneMask == Mask)
16209 return SDValue();
16210
16211 SDValue RepeatedShuffle =
16212 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16213
16214 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16215 SubLaneMask);
16216 };
16217
16218 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16219 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16220 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16221 // Otherwise we can only permute whole 128-bit lanes.
16222 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16223 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16224 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16225 MinSubLaneScale = 2;
16226 MaxSubLaneScale =
16227 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16228 }
16229 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16230 MinSubLaneScale = MaxSubLaneScale = 4;
16231
16232 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16233 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16234 return Shuffle;
16235
16236 return SDValue();
16237}
16238
16239 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16240 bool &ForceV1Zero, bool &ForceV2Zero,
16241 unsigned &ShuffleImm, ArrayRef<int> Mask,
16242 const APInt &Zeroable) {
16243 int NumElts = VT.getVectorNumElements();
16244 assert(VT.getScalarSizeInBits() == 64 &&
16245 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16246 "Unexpected data type for VSHUFPD");
16247 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16248 "Illegal shuffle mask");
16249
16250 bool ZeroLane[2] = { true, true };
16251 for (int i = 0; i < NumElts; ++i)
16252 ZeroLane[i & 1] &= Zeroable[i];
16253
16254 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
16255 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ...
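// e.g. the v4f64 mask <0, 5, 2, 7> takes even elements from V1 and odd
// elements from V2 and produces SHUFPDMask <0, 1, 0, 1>, one selection bit
// per element.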
16256 bool IsSHUFPD = true;
16257 bool IsCommutable = true;
16258 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16259 for (int i = 0; i < NumElts; ++i) {
16260 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16261 continue;
16262 if (Mask[i] < 0)
16263 return false;
16264 int Val = (i & 6) + NumElts * (i & 1);
16265 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16266 if (Mask[i] < Val || Mask[i] > Val + 1)
16267 IsSHUFPD = false;
16268 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16269 IsCommutable = false;
16270 SHUFPDMask[i] = Mask[i] % 2;
16271 }
16272
16273 if (!IsSHUFPD && !IsCommutable)
16274 return false;
16275
16276 if (!IsSHUFPD && IsCommutable)
16277 std::swap(V1, V2);
16278
16279 ForceV1Zero = ZeroLane[0];
16280 ForceV2Zero = ZeroLane[1];
16281 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16282 return true;
16283}
16284
16285 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16286 SDValue V2, ArrayRef<int> Mask,
16287 const APInt &Zeroable,
16288 const X86Subtarget &Subtarget,
16289 SelectionDAG &DAG) {
16290 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16291 "Unexpected data type for VSHUFPD");
16292
16293 unsigned Immediate = 0;
16294 bool ForceV1Zero = false, ForceV2Zero = false;
16295 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16296 Mask, Zeroable))
16297 return SDValue();
16298
16299 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16300 if (ForceV1Zero)
16301 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16302 if (ForceV2Zero)
16303 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16304
16305 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16306 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16307}
16308
16309 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16310// by zeroable elements in the remaining 24 elements. Turn this into two
16311// vmovqb instructions shuffled together.
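// e.g. the 8 defined bytes are byte 0 of each qword of V1 (indices 0,8,16,24)
// and of V2 (indices 32,40,48,56); the remaining 24 bytes must be zeroable.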
16312 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16313 SDValue V1, SDValue V2,
16314 ArrayRef<int> Mask,
16315 const APInt &Zeroable,
16316 SelectionDAG &DAG) {
16317 assert(VT == MVT::v32i8 && "Unexpected type!");
16318
16319 // The first 8 indices should be every 8th element.
16320 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16321 return SDValue();
16322
16323 // Remaining elements need to be zeroable.
16324 if (Zeroable.countl_one() < (Mask.size() - 8))
16325 return SDValue();
16326
16327 V1 = DAG.getBitcast(MVT::v4i64, V1);
16328 V2 = DAG.getBitcast(MVT::v4i64, V2);
16329
16330 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16331 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16332
16333 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16334 // the upper bits of the result using an unpckldq.
16335 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16336 { 0, 1, 2, 3, 16, 17, 18, 19,
16337 4, 5, 6, 7, 20, 21, 22, 23 });
16338 // Insert the unpckldq into a zero vector to widen to v32i8.
16339 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16340 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16341 DAG.getVectorIdxConstant(0, DL));
16342}
16343
16344// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16345// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16346// =>
16347// ul = unpckl v1, v2
16348// uh = unpckh v1, v2
16349// a = vperm ul, uh
16350// b = vperm ul, uh
16351//
16352// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16353// and permute. We cannot directly match v3 because it is split into two
16354// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16355// pair of 256-bit shuffles and makes sure the masks are consecutive.
16356//
16357// Once unpck and permute nodes are created, the permute corresponding to this
16358// shuffle is returned, while the other permute replaces the other half of the
16359// shuffle in the selection dag.
16360 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16361 SDValue V1, SDValue V2,
16362 ArrayRef<int> Mask,
16363 SelectionDAG &DAG) {
16364 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16365 VT != MVT::v32i8)
16366 return SDValue();
16367 // <B0, B1, B0+1, B1+1, ..., >
16368 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16369 unsigned Begin1) {
16370 size_t Size = Mask.size();
16371 assert(Size % 2 == 0 && "Expected even mask size");
16372 for (unsigned I = 0; I < Size; I += 2) {
16373 if (Mask[I] != (int)(Begin0 + I / 2) ||
16374 Mask[I + 1] != (int)(Begin1 + I / 2))
16375 return false;
16376 }
16377 return true;
16378 };
16379 // Check which half this shuffle node is
16380 int NumElts = VT.getVectorNumElements();
16381 size_t FirstQtr = NumElts / 2;
16382 size_t ThirdQtr = NumElts + NumElts / 2;
16383 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16384 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16385 if (!IsFirstHalf && !IsSecondHalf)
16386 return SDValue();
16387
16388 // Find the intersection between shuffle users of V1 and V2.
16389 SmallVector<SDNode *, 2> Shuffles;
16390 for (SDNode *User : V1->users())
16391 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16392 User->getOperand(1) == V2)
16393 Shuffles.push_back(User);
16394 // Limit user size to two for now.
16395 if (Shuffles.size() != 2)
16396 return SDValue();
16397 // Find out which half of the 512-bit shuffle each smaller shuffle is
16398 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16399 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16400 SDNode *FirstHalf;
16401 SDNode *SecondHalf;
16402 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16403 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16404 FirstHalf = Shuffles[0];
16405 SecondHalf = Shuffles[1];
16406 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16407 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16408 FirstHalf = Shuffles[1];
16409 SecondHalf = Shuffles[0];
16410 } else {
16411 return SDValue();
16412 }
16413 // Lower into unpck and perm. Return the perm of this shuffle and replace
16414 // the other.
16415 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16416 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16417 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16418 DAG.getTargetConstant(0x20, DL, MVT::i8));
16419 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16420 DAG.getTargetConstant(0x31, DL, MVT::i8));
16421 if (IsFirstHalf) {
16422 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16423 return Perm1;
16424 }
16425 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16426 return Perm2;
16427}
16428
16429/// Handle lowering of 4-lane 64-bit floating point shuffles.
16430///
16431/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16432/// isn't available.
16433 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16434 const APInt &Zeroable, SDValue V1, SDValue V2,
16435 const X86Subtarget &Subtarget,
16436 SelectionDAG &DAG) {
16437 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16438 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16439 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16440
16441 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16442 Subtarget, DAG))
16443 return V;
16444
16445 if (V2.isUndef()) {
16446 // Check for being able to broadcast a single element.
16447 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16448 Mask, Subtarget, DAG))
16449 return Broadcast;
16450
16451 // Use low duplicate instructions for masks that match their pattern.
16452 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16453 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16454
16455 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16456 // Non-half-crossing single input shuffles can be lowered with an
16457 // interleaved permutation.
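// e.g. <1, 0, 3, 2> sets bits 0 and 2, giving a VPERMILPD immediate of
// 0b0101.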
16458 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16459 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16460 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16461 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16462 }
16463
16464 // With AVX2 we have direct support for this permutation.
16465 if (Subtarget.hasAVX2())
16466 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16467 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16468
16469 // Try to create an in-lane repeating shuffle mask and then shuffle the
16470 // results into the target lanes.
16471 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16472 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16473 return V;
16474
16475 // Try to permute the lanes and then use a per-lane permute.
16476 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16477 Mask, DAG, Subtarget))
16478 return V;
16479
16480 // Otherwise, fall back.
16481 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16482 DAG, Subtarget);
16483 }
16484
16485 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16486 Zeroable, Subtarget, DAG))
16487 return Blend;
16488
16489 // Use dedicated unpack instructions for masks that match their pattern.
16490 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16491 return V;
16492
16493 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16494 Zeroable, Subtarget, DAG))
16495 return Op;
16496
16497 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16498 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16499 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16500 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16501
16502 // If we have lane crossing shuffles AND they don't all come from the lower
16503 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16504 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16505 // canonicalizes to a blend of splats, which isn't necessary for this combine.
16506 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16507 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16508 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16509 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16510 (!Subtarget.hasAVX2() ||
16511 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16512 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16513
16514 // If we have one input in place, then we can permute the other input and
16515 // blend the result.
16516 if (V1IsInPlace || V2IsInPlace)
16517 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16518 Zeroable, Subtarget, DAG);
16519
16520 // Try to create an in-lane repeating shuffle mask and then shuffle the
16521 // results into the target lanes.
16522 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16523 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16524 return V;
16525
16526 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16527 // shuffle. However, if we have AVX2 and either input is already in place,
16528 // we will be able to shuffle the other input even across lanes in a single
16529 // instruction, so skip this pattern.
16530 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16531 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16532 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16533 return V;
16534
16535 // If we have VLX support, we can use VEXPAND.
16536 if (Subtarget.hasVLX())
16537 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16538 Zeroable, Subtarget, DAG))
16539 return V;
16540
16541 // If we have AVX2 then we always want to lower with a blend because at v4 we
16542 // can fully permute the elements.
16543 if (Subtarget.hasAVX2())
16544 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16545 Zeroable, Subtarget, DAG);
16546
16547 // Otherwise fall back on generic lowering.
16548 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16549 Subtarget, DAG);
16550}
16551
16552/// Handle lowering of 4-lane 64-bit integer shuffles.
16553///
16554/// This routine is only called when we have AVX2 and thus a reasonable
16555 /// instruction set for v4i64 shuffling.
16556 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16557 const APInt &Zeroable, SDValue V1, SDValue V2,
16558 const X86Subtarget &Subtarget,
16559 SelectionDAG &DAG) {
16560 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16561 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16562 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16563 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16564
16565 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16566 Subtarget, DAG))
16567 return V;
16568
16569 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16570 Zeroable, Subtarget, DAG))
16571 return Blend;
16572
16573 // Check for being able to broadcast a single element.
16574 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16575 Subtarget, DAG))
16576 return Broadcast;
16577
16578 // Try to use shift instructions if fast.
16579 if (Subtarget.preferLowerShuffleAsShift())
16580 if (SDValue Shift =
16581 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16582 Subtarget, DAG, /*BitwiseOnly*/ true))
16583 return Shift;
16584
16585 if (V2.isUndef()) {
16586 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16587 // can use lower latency instructions that will operate on both lanes.
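// e.g. <1, 0, 3, 2> repeats as <1, 0> in each 128-bit lane, giving the 32-bit
// PSHUFD mask <2, 3, 0, 1>.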
16588 SmallVector<int, 2> RepeatedMask;
16589 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16590 SmallVector<int, 4> PSHUFDMask;
16591 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16592 return DAG.getBitcast(
16593 MVT::v4i64,
16594 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16595 DAG.getBitcast(MVT::v8i32, V1),
16596 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16597 }
16598
16599 // AVX2 provides a direct instruction for permuting a single input across
16600 // lanes.
16601 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16602 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16603 }
16604
16605 // Try to use shift instructions.
16606 if (SDValue Shift =
16607 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16608 DAG, /*BitwiseOnly*/ false))
16609 return Shift;
16610
16611 // If we have VLX support, we can use VALIGN or VEXPAND.
16612 if (Subtarget.hasVLX()) {
16613 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16614 Zeroable, Subtarget, DAG))
16615 return Rotate;
16616
16617 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16618 Zeroable, Subtarget, DAG))
16619 return V;
16620 }
16621
16622 // Try to use PALIGNR.
16623 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16624 Subtarget, DAG))
16625 return Rotate;
16626
16627 // Use dedicated unpack instructions for masks that match their pattern.
16628 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16629 return V;
16630
16631 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16632 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16633
16634 // If we have one input in place, then we can permute the other input and
16635 // blend the result.
16636 if (V1IsInPlace || V2IsInPlace)
16637 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16638 Zeroable, Subtarget, DAG);
16639
16640 // Try to create an in-lane repeating shuffle mask and then shuffle the
16641 // results into the target lanes.
16642 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16643 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16644 return V;
16645
16646 // Try to lower to PERMQ(BLENDD(V1,V2)).
16647 if (SDValue V =
16648 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16649 return V;
16650
16651 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16652 // shuffle. However, if we have AVX2 and either input is already in place,
16653 // we will be able to shuffle the other input even across lanes in a single
16654 // instruction, so skip this pattern.
16655 if (!V1IsInPlace && !V2IsInPlace)
16656 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16657 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16658 return Result;
16659
16660 // Otherwise fall back on generic blend lowering.
16661 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16662 Zeroable, Subtarget, DAG);
16663}
16664
16665/// Handle lowering of 8-lane 32-bit floating point shuffles.
16666///
16667/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16668/// isn't available.
16669 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16670 const APInt &Zeroable, SDValue V1, SDValue V2,
16671 const X86Subtarget &Subtarget,
16672 SelectionDAG &DAG) {
16673 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16674 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16675 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16676
16677 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16678 Zeroable, Subtarget, DAG))
16679 return Blend;
16680
16681 // Check for being able to broadcast a single element.
16682 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16683 Subtarget, DAG))
16684 return Broadcast;
16685
16686 if (!Subtarget.hasAVX2()) {
16687 SmallVector<int> InLaneMask;
16688 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16689
16690 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16691 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16692 /*SimpleOnly*/ true))
16693 return R;
16694 }
16695 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16696 Zeroable, Subtarget, DAG))
16697 return DAG.getBitcast(MVT::v8f32, ZExt);
16698
16699 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16700 // options to efficiently lower the shuffle.
16701 SmallVector<int, 4> RepeatedMask;
16702 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16703 assert(RepeatedMask.size() == 4 &&
16704 "Repeated masks must be half the mask width!");
16705
16706 // Use even/odd duplicate instructions for masks that match their pattern.
16707 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16708 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16709 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16710 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16711
16712 if (V2.isUndef())
16713 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16714 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16715
16716 // Use dedicated unpack instructions for masks that match their pattern.
16717 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16718 return V;
16719
16720 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16721 // have already handled any direct blends.
16722 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16723 }
16724
16725 // Try to create an in-lane repeating shuffle mask and then shuffle the
16726 // results into the target lanes.
16727 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16728 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16729 return V;
16730
16731 // If we have a single input shuffle with different shuffle patterns in the
16732 // two 128-bit lanes use the variable mask to VPERMILPS.
16733 if (V2.isUndef()) {
16734 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16735 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16736 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16737 }
16738 if (Subtarget.hasAVX2()) {
16739 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16740 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16741 }
16742 // Otherwise, fall back.
16743 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16744 DAG, Subtarget);
16745 }
16746
16747 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16748 // shuffle.
16749 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16750 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16751 return Result;
16752
16753 // If we have VLX support, we can use VEXPAND.
16754 if (Subtarget.hasVLX())
16755 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16756 Zeroable, Subtarget, DAG))
16757 return V;
16758
16759 // Try to match an interleave of two v8f32s and lower them as unpck and
16760 // permutes using ymms. This needs to go before we try to split the vectors.
16761 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16762 if ((Subtarget.hasAVX2() ||
16765 !Subtarget.hasAVX512())
16766 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16767 Mask, DAG))
16768 return V;
16769
16770 // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16771 // split, since after splitting we get more efficient code using vpunpcklwd
16772 // and vpunpckhwd instructions rather than vblend.
16773 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16774 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16775 Subtarget, DAG);
16776
16777 // If we have AVX2 then we always want to lower with a blend because at v8 we
16778 // can fully permute the elements.
16779 if (Subtarget.hasAVX2())
16780 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16781 Zeroable, Subtarget, DAG);
16782
16783 // Otherwise fall back on generic lowering.
16784 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16785 Subtarget, DAG);
16786}
16787
16788/// Handle lowering of 8-lane 32-bit integer shuffles.
16789///
16790/// This routine is only called when we have AVX2 and thus a reasonable
16791 /// instruction set for v8i32 shuffling.
16792 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16793 const APInt &Zeroable, SDValue V1, SDValue V2,
16794 const X86Subtarget &Subtarget,
16795 SelectionDAG &DAG) {
16796 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16797 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16798 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16799 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16800
16801 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16802
16803 // Whenever we can lower this as a zext, that instruction is strictly faster
16804 // than any alternative. It also allows us to fold memory operands into the
16805 // shuffle in many cases.
16806 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16807 Zeroable, Subtarget, DAG))
16808 return ZExt;
16809
16810 // Try to match an interleave of two v8i32s and lower them as unpck and
16811 // permutes using ymms. This needs to go before we try to split the vectors.
16812 if (!Subtarget.hasAVX512())
16813 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16814 Mask, DAG))
16815 return V;
16816
16817 // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16818 // split, since after splitting we get more efficient code than vblend by
16819 // using vpunpcklwd and vpunpckhwd instructions.
16820 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16821 !Subtarget.hasAVX512())
16822 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16823 Subtarget, DAG);
16824
16825 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16826 Zeroable, Subtarget, DAG))
16827 return Blend;
16828
16829 // Check for being able to broadcast a single element.
16830 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16831 Subtarget, DAG))
16832 return Broadcast;
16833
16834 // Try to use shift instructions if fast.
16835 if (Subtarget.preferLowerShuffleAsShift()) {
16836 if (SDValue Shift =
16837 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16838 Subtarget, DAG, /*BitwiseOnly*/ true))
16839 return Shift;
16840 if (NumV2Elements == 0)
16841 if (SDValue Rotate =
16842 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16843 return Rotate;
16844 }
16845
16846 // If the shuffle mask is repeated in each 128-bit lane we can use more
16847 // efficient instructions that mirror the shuffles across the two 128-bit
16848 // lanes.
16849 SmallVector<int, 4> RepeatedMask;
16850 bool Is128BitLaneRepeatedShuffle =
16851 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16852 if (Is128BitLaneRepeatedShuffle) {
16853 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16854 if (V2.isUndef())
16855 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16856 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16857
16858 // Use dedicated unpack instructions for masks that match their pattern.
16859 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16860 return V;
16861 }
16862
16863 // Try to use shift instructions.
16864 if (SDValue Shift =
16865 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16866 DAG, /*BitwiseOnly*/ false))
16867 return Shift;
16868
16869 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16870 if (SDValue Rotate =
16871 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16872 return Rotate;
16873
16874 // If we have VLX support, we can use VALIGN or EXPAND.
16875 if (Subtarget.hasVLX()) {
16876 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16877 Zeroable, Subtarget, DAG))
16878 return Rotate;
16879
16880 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16881 Zeroable, Subtarget, DAG))
16882 return V;
16883 }
16884
16885 // Try to use byte rotation instructions.
16886 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16887 Subtarget, DAG))
16888 return Rotate;
16889
16890 // Try to create an in-lane repeating shuffle mask and then shuffle the
16891 // results into the target lanes.
16892 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16893 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16894 return V;
16895
16896 if (V2.isUndef()) {
16897 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16898 // because that should be faster than the variable permute alternatives.
16899 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16900 return V;
16901
16902 // If the shuffle patterns aren't repeated but it's a single input, directly
16903 // generate a cross-lane VPERMD instruction.
16904 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16905 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16906 }
16907
16908 // Assume that a single SHUFPS is faster than an alternative sequence of
16909 // multiple instructions (even if the CPU has a domain penalty).
16910 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16911 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16912 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16913 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16914 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16915 CastV1, CastV2, DAG);
16916 return DAG.getBitcast(MVT::v8i32, ShufPS);
16917 }
16918
16919 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16920 // shuffle.
16921 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16922 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16923 return Result;
16924
16925 // Otherwise fall back on generic blend lowering.
16926 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16927 Zeroable, Subtarget, DAG);
16928}
16929
16930/// Handle lowering of 16-lane 16-bit integer shuffles.
16931///
16932/// This routine is only called when we have AVX2 and thus a reasonable
16933 /// instruction set for v16i16 shuffling.
16934 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16935 const APInt &Zeroable, SDValue V1, SDValue V2,
16936 const X86Subtarget &Subtarget,
16937 SelectionDAG &DAG) {
16938 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16939 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16940 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16941 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16942
16943 // Whenever we can lower this as a zext, that instruction is strictly faster
16944 // than any alternative. It also allows us to fold memory operands into the
16945 // shuffle in many cases.
16946 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16947 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16948 return ZExt;
16949
16950 // Check for being able to broadcast a single element.
16951 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16952 Subtarget, DAG))
16953 return Broadcast;
16954
16955 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16956 Zeroable, Subtarget, DAG))
16957 return Blend;
16958
16959 // Use dedicated unpack instructions for masks that match their pattern.
16960 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16961 return V;
16962
16963 // Use dedicated pack instructions for masks that match their pattern.
16964 if (SDValue V =
16965 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16966 return V;
16967
16968 // Try to lower using a truncation.
16969 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16970 Subtarget, DAG))
16971 return V;
16972
16973 // Try to use shift instructions.
16974 if (SDValue Shift =
16975 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16976 Subtarget, DAG, /*BitwiseOnly*/ false))
16977 return Shift;
16978
16979 // Try to use byte rotation instructions.
16980 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16981 Subtarget, DAG))
16982 return Rotate;
16983
16984 // Try to create an in-lane repeating shuffle mask and then shuffle the
16985 // results into the target lanes.
16986 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16987 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16988 return V;
16989
16990 if (V2.isUndef()) {
16991 // Try to use bit rotation instructions.
16992 if (SDValue Rotate =
16993 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16994 return Rotate;
16995
16996 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16997 // because that should be faster than the variable permute alternatives.
16998 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
16999 return V;
17000
17001 // There are no generalized cross-lane shuffle operations available on i16
17002 // element types.
17003 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17004 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17005 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17006 return V;
17007
17008 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17009 DAG, Subtarget);
17010 }
17011
17012 SmallVector<int, 8> RepeatedMask;
17013 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17014 // As this is a single-input shuffle, the repeated mask should be
17015 // a strictly valid v8i16 mask that we can pass through to the v8i16
17016 // lowering to handle even the v16 case.
17017 return lowerV8I16GeneralSingleInputShuffle(
17018 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17019 }
17020 }
17021
17022 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17023 Zeroable, Subtarget, DAG))
17024 return PSHUFB;
17025
17026 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17027 if (Subtarget.hasBWI())
17028 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17029
17030 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17031 // shuffle.
17032 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17033 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17034 return Result;
17035
17036 // Try to permute the lanes and then use a per-lane permute.
17037 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17038 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17039 return V;
17040
17041 // Try to match an interleave of two v16i16s and lower them as unpck and
17042 // permutes using ymms.
17043 if (!Subtarget.hasAVX512())
17044 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17045 Mask, DAG))
17046 return V;
17047
17048 // Otherwise fall back on generic lowering.
17049 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17050 Subtarget, DAG);
17051}
17052
17053/// Handle lowering of 32-lane 8-bit integer shuffles.
17054///
17055/// This routine is only called when we have AVX2 and thus a reasonable
17056 /// instruction set for v32i8 shuffling.
17057 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17058 const APInt &Zeroable, SDValue V1, SDValue V2,
17059 const X86Subtarget &Subtarget,
17060 SelectionDAG &DAG) {
17061 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17062 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17063 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17064 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17065
17066 // Whenever we can lower this as a zext, that instruction is strictly faster
17067 // than any alternative. It also allows us to fold memory operands into the
17068 // shuffle in many cases.
17069 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17070 Zeroable, Subtarget, DAG))
17071 return ZExt;
17072
17073 // Check for being able to broadcast a single element.
17074 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17075 Subtarget, DAG))
17076 return Broadcast;
17077
17078 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17079 Zeroable, Subtarget, DAG))
17080 return Blend;
17081
17082 // Use dedicated unpack instructions for masks that match their pattern.
17083 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17084 return V;
17085
17086 // Use dedicated pack instructions for masks that match their pattern.
17087 if (SDValue V =
17088 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17089 return V;
17090
17091 // Try to lower using a truncation.
17092 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17093 Subtarget, DAG))
17094 return V;
17095
17096 // Try to use shift instructions.
17097 if (SDValue Shift =
17098 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17099 DAG, /*BitwiseOnly*/ false))
17100 return Shift;
17101
17102 // Try to use byte rotation instructions.
17103 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17104 Subtarget, DAG))
17105 return Rotate;
17106
17107 // Try to use bit rotation instructions.
17108 if (V2.isUndef())
17109 if (SDValue Rotate =
17110 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17111 return Rotate;
17112
17113 // Try to create an in-lane repeating shuffle mask and then shuffle the
17114 // results into the target lanes.
17115 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17116 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17117 return V;
17118
17119 // There are no generalized cross-lane shuffle operations available on i8
17120 // element types.
17121 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17122 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17123 // because that should be faster than the variable permute alternatives.
17124 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17125 return V;
17126
17127 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17128 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17129 return V;
17130
17131 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17132 DAG, Subtarget);
17133 }
17134
17135 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17136 Zeroable, Subtarget, DAG))
17137 return PSHUFB;
17138
17139 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17140 if (Subtarget.hasVBMI())
17141 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17142
17143 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17144 // shuffle.
17145 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17146 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17147 return Result;
17148
17149 // Try to permute the lanes and then use a per-lane permute.
17150 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17151 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17152 return V;
17153
17154 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17155 // by zeroable elements in the remaining 24 elements. Turn this into two
17156 // vmovqb instructions shuffled together.
17157 if (Subtarget.hasVLX())
17158 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17159 Mask, Zeroable, DAG))
17160 return V;
17161
17162 // Try to match an interleave of two v32i8s and lower them as unpck and
17163 // permutes using ymms.
17164 if (!Subtarget.hasAVX512())
17165 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17166 Mask, DAG))
17167 return V;
17168
17169 // Otherwise fall back on generic lowering.
17170 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17171 Subtarget, DAG);
17172}
17173
17174/// High-level routine to lower various 256-bit x86 vector shuffles.
17175///
17176/// This routine either breaks down the specific type of a 256-bit x86 vector
17177/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17178/// together based on the available instructions.
17179 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17180 SDValue V1, SDValue V2, const APInt &Zeroable,
17181 const X86Subtarget &Subtarget,
17182 SelectionDAG &DAG) {
17183 // If we have a single input to the zero element, insert that into V1 if we
17184 // can do so cheaply.
17185 int NumElts = VT.getVectorNumElements();
17186 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17187
17188 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17189 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17190 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17191 return Insertion;
17192
17193 // Handle special cases where the lower or upper half is UNDEF.
17194 if (SDValue V =
17195 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17196 return V;
17197
17198 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17199 // can check for those subtargets here and avoid much of the subtarget
17200 // querying in the per-vector-type lowering routines. With AVX1 we have
17201 // essentially *zero* ability to manipulate a 256-bit vector with integer
17202 // types. Since we'll use floating point types there eventually, just
17203 // immediately cast everything to a float and operate entirely in that domain.
17204 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17205 int ElementBits = VT.getScalarSizeInBits();
17206 if (ElementBits < 32) {
17207 // No floating point type available, if we can't use the bit operations
17208 // for masking/blending then decompose into 128-bit vectors.
17209 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17210 Subtarget, DAG))
17211 return V;
17212 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17213 return V;
17214 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17215 }
17216
17217 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17218 VT.getVectorNumElements());
17219 V1 = DAG.getBitcast(FpVT, V1);
17220 V2 = DAG.getBitcast(FpVT, V2);
17221 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17222 }
17223
17224 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17225 V1 = DAG.getBitcast(MVT::v16i16, V1);
17226 V2 = DAG.getBitcast(MVT::v16i16, V2);
17227 return DAG.getBitcast(VT,
17228 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17229 }
17230
17231 switch (VT.SimpleTy) {
17232 case MVT::v4f64:
17233 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17234 case MVT::v4i64:
17235 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17236 case MVT::v8f32:
17237 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17238 case MVT::v8i32:
17239 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17240 case MVT::v16i16:
17241 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17242 case MVT::v32i8:
17243 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17244
17245 default:
17246 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17247 }
17248}
17249
17250 /// Try to lower a vector shuffle as 128-bit shuffles.
17251 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17252 const APInt &Zeroable, SDValue V1, SDValue V2,
17253 const X86Subtarget &Subtarget,
17254 SelectionDAG &DAG) {
17255 assert(VT.getScalarSizeInBits() == 64 &&
17256 "Unexpected element type size for 128bit shuffle.");
17257
17258 // Handling a 256-bit vector would require VLX, and the function
17259 // lowerV2X128Shuffle() is most probably a better solution for that case.
17260 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17261
17262 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17263 SmallVector<int, 4> Widened128Mask;
17264 if (!canWidenShuffleElements(Mask, Widened128Mask))
17265 return SDValue();
17266 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17267
17268 // Try to use an insert into a zero vector.
17269 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17270 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17271 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17272 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17273 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17274 DAG.getVectorIdxConstant(0, DL));
17275 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17276 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17277 DAG.getVectorIdxConstant(0, DL));
17278 }
17279
17280 // Check for patterns which can be matched with a single insert of a 256-bit
17281 // subvector.
17282 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17283 if (OnlyUsesV1 ||
17284 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17285 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17286 SDValue SubVec =
17287 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17288 DAG.getVectorIdxConstant(0, DL));
17289 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17290 DAG.getVectorIdxConstant(4, DL));
17291 }
17292
17293 // See if this is an insertion of the lower 128-bits of V2 into V1.
17294 bool IsInsert = true;
17295 int V2Index = -1;
17296 for (int i = 0; i < 4; ++i) {
17297 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17298 if (Widened128Mask[i] < 0)
17299 continue;
17300
17301 // Make sure all V1 subvectors are in place.
17302 if (Widened128Mask[i] < 4) {
17303 if (Widened128Mask[i] != i) {
17304 IsInsert = false;
17305 break;
17306 }
17307 } else {
17308 // Make sure we only have a single V2 index and that it's the lowest 128 bits.
17309 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17310 IsInsert = false;
17311 break;
17312 }
17313 V2Index = i;
17314 }
17315 }
17316 if (IsInsert && V2Index >= 0) {
17317 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17318 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17319 DAG.getVectorIdxConstant(0, DL));
17320 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17321 }
17322
17323 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
17324 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17325 // possible we at least ensure the lanes stay sequential to help later
17326 // combines.
17327 SmallVector<int, 2> Widened256Mask;
17328 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17329 Widened128Mask.clear();
17330 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17331 }
17332
17333 // Try to lower to vshuf64x2/vshuf32x4.
17334 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17335 int PermMask[4] = {-1, -1, -1, -1};
17336 // Ensure elements came from the same Op.
17337 for (int i = 0; i < 4; ++i) {
17338 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17339 if (Widened128Mask[i] < 0)
17340 continue;
17341
17342 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17343 unsigned OpIndex = i / 2;
17344 if (Ops[OpIndex].isUndef())
17345 Ops[OpIndex] = Op;
17346 else if (Ops[OpIndex] != Op)
17347 return SDValue();
17348
17349 PermMask[i] = Widened128Mask[i] % 4;
17350 }
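// e.g. for a v8f64 mask {2,3,0,1,12,13,10,11} the widened 128-bit mask is
// {1,0,6,5}; lanes 0-1 come only from V1 and lanes 2-3 only from V2, giving
// Ops = {V1, V2} and PermMask = {1,0,2,1}.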
17351
17352 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17353 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17354}
17355
17356/// Handle lowering of 8-lane 64-bit floating point shuffles.
17357static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17358 const APInt &Zeroable, SDValue V1, SDValue V2,
17359 const X86Subtarget &Subtarget,
17360 SelectionDAG &DAG) {
17361 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17362 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17363 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17364
17365 if (V2.isUndef()) {
17366 // Use low duplicate instructions for masks that match their pattern.
17367 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17368 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17369
17370 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17371 // Non-half-crossing single input shuffles can be lowered with an
17372 // interleaved permutation.
17373 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17374 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17375 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17376 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17377 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17378 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17379 }
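// e.g. the mask {1,0,3,2,5,4,7,6} sets bits 0, 2, 4 and 6, giving a VPERMILPD
// immediate of 0x55 that swaps each pair of doubles within every 128-bit lane.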
17380
17381 SmallVector<int, 4> RepeatedMask;
17382 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17383 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17384 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17385 }
17386
17387 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17388 V2, Subtarget, DAG))
17389 return Shuf128;
17390
17391 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17392 return Unpck;
17393
17394 // Check if the blend happens to exactly fit the pattern of SHUFPD.
17395 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17396 Zeroable, Subtarget, DAG))
17397 return Op;
17398
17399 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17400 Subtarget, DAG))
17401 return V;
17402
17403 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17404 Zeroable, Subtarget, DAG))
17405 return Blend;
17406
17407 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17408}
17409
17410/// Handle lowering of 16-lane 32-bit floating point shuffles.
17411static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17412 const APInt &Zeroable, SDValue V1, SDValue V2,
17413 const X86Subtarget &Subtarget,
17414 SelectionDAG &DAG) {
17415 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17416 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17417 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17418
17419 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17420 // options to efficiently lower the shuffle.
17421 SmallVector<int, 4> RepeatedMask;
17422 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17423 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17424
17425 // Use even/odd duplicate instructions for masks that match their pattern.
17426 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17427 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17428 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17429 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17430
17431 if (V2.isUndef())
17432 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17433 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17434
17435 // Use dedicated unpack instructions for masks that match their pattern.
17436 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17437 return V;
17438
17439 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17440 Zeroable, Subtarget, DAG))
17441 return Blend;
17442
17443 // Otherwise, fall back to a SHUFPS sequence.
17444 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17445 }
17446
17447 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17448 Zeroable, Subtarget, DAG))
17449 return Blend;
17450
17451 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17452 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17453 return DAG.getBitcast(MVT::v16f32, ZExt);
17454
17455 // Try to create an in-lane repeating shuffle mask and then shuffle the
17456 // results into the target lanes.
17457 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17458 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17459 return V;
17460
17461 // If we have a single input shuffle with different shuffle patterns in the
17462 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17463 if (V2.isUndef() &&
17464 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17465 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17466 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17467 }
17468
17469 // If we have AVX512F support, we can use VEXPAND.
17470 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17471 Zeroable, Subtarget, DAG))
17472 return V;
17473
17474 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17475}
17476
17477/// Handle lowering of 8-lane 64-bit integer shuffles.
17478static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17479 const APInt &Zeroable, SDValue V1, SDValue V2,
17480 const X86Subtarget &Subtarget,
17481 SelectionDAG &DAG) {
17482 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17483 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17484 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17485
17486 // Try to use shift instructions if fast.
17487 if (Subtarget.preferLowerShuffleAsShift())
17488 if (SDValue Shift =
17489 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17490 Subtarget, DAG, /*BitwiseOnly*/ true))
17491 return Shift;
17492
17493 if (V2.isUndef()) {
17494 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17495 // can use lower latency instructions that will operate on all four
17496 // 128-bit lanes.
17497 SmallVector<int, 2> Repeated128Mask;
17498 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17499 SmallVector<int, 4> PSHUFDMask;
17500 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17501 return DAG.getBitcast(
17502 MVT::v8i64,
17503 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17504 DAG.getBitcast(MVT::v16i32, V1),
17505 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17506 }
17507
17508 SmallVector<int, 4> Repeated256Mask;
17509 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17510 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17511 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17512 }
17513
17514 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17515 V2, Subtarget, DAG))
17516 return Shuf128;
17517
17518 // Try to use shift instructions.
17519 if (SDValue Shift =
17520 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17521 DAG, /*BitwiseOnly*/ false))
17522 return Shift;
17523
17524 // Try to use VALIGN.
17525 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17526 Zeroable, Subtarget, DAG))
17527 return Rotate;
17528
17529 // Try to use PALIGNR.
17530 if (Subtarget.hasBWI())
17531 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17532 Subtarget, DAG))
17533 return Rotate;
17534
17535 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17536 return Unpck;
17537
17538 // If we have AVX512F support, we can use VEXPAND.
17539 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17540 Subtarget, DAG))
17541 return V;
17542
17543 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17544 Zeroable, Subtarget, DAG))
17545 return Blend;
17546
17547 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17548}
17549
17550/// Handle lowering of 16-lane 32-bit integer shuffles.
17551static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17552 const APInt &Zeroable, SDValue V1, SDValue V2,
17553 const X86Subtarget &Subtarget,
17554 SelectionDAG &DAG) {
17555 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17556 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17557 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17558
17559 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17560
17561 // Whenever we can lower this as a zext, that instruction is strictly faster
17562 // than any alternative. It also allows us to fold memory operands into the
17563 // shuffle in many cases.
17564 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17565 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17566 return ZExt;
17567
17568 // Try to use shift instructions if fast.
17569 if (Subtarget.preferLowerShuffleAsShift()) {
17570 if (SDValue Shift =
17571 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17572 Subtarget, DAG, /*BitwiseOnly*/ true))
17573 return Shift;
17574 if (NumV2Elements == 0)
17575 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17576 Subtarget, DAG))
17577 return Rotate;
17578 }
17579
17580 // If the shuffle mask is repeated in each 128-bit lane we can use more
17581 // efficient instructions that mirror the shuffles across the four 128-bit
17582 // lanes.
17583 SmallVector<int, 4> RepeatedMask;
17584 bool Is128BitLaneRepeatedShuffle =
17585 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17586 if (Is128BitLaneRepeatedShuffle) {
17587 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17588 if (V2.isUndef())
17589 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17590 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17591
17592 // Use dedicated unpack instructions for masks that match their pattern.
17593 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17594 return V;
17595 }
17596
17597 // Try to use shift instructions.
17598 if (SDValue Shift =
17599 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17600 Subtarget, DAG, /*BitwiseOnly*/ false))
17601 return Shift;
17602
17603 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17604 if (SDValue Rotate =
17605 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17606 return Rotate;
17607
17608 // Try to use VALIGN.
17609 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17610 Zeroable, Subtarget, DAG))
17611 return Rotate;
17612
17613 // Try to use byte rotation instructions.
17614 if (Subtarget.hasBWI())
17615 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17616 Subtarget, DAG))
17617 return Rotate;
17618
17619 // Assume that a single SHUFPS is faster than using a permv shuffle.
17620 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17621 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17622 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17623 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17624 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17625 CastV1, CastV2, DAG);
17626 return DAG.getBitcast(MVT::v16i32, ShufPS);
17627 }
17628
17629 // Try to create an in-lane repeating shuffle mask and then shuffle the
17630 // results into the target lanes.
17631 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17632 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17633 return V;
17634
17635 // If we have AVX512F support, we can use VEXPAND.
17636 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17637 Zeroable, Subtarget, DAG))
17638 return V;
17639
17640 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17641 Zeroable, Subtarget, DAG))
17642 return Blend;
17643
17644 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17645}
17646
17647/// Handle lowering of 32-lane 16-bit integer shuffles.
17648static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17649 const APInt &Zeroable, SDValue V1, SDValue V2,
17650 const X86Subtarget &Subtarget,
17651 SelectionDAG &DAG) {
17652 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17653 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17654 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17655 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17656
17657 // Whenever we can lower this as a zext, that instruction is strictly faster
17658 // than any alternative. It also allows us to fold memory operands into the
17659 // shuffle in many cases.
17660 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17661 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17662 return ZExt;
17663
17664 // Use dedicated unpack instructions for masks that match their pattern.
17665 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17666 return V;
17667
17668 // Use dedicated pack instructions for masks that match their pattern.
17669 if (SDValue V =
17670 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17671 return V;
17672
17673 // Try to use shift instructions.
17674 if (SDValue Shift =
17675 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17676 Subtarget, DAG, /*BitwiseOnly*/ false))
17677 return Shift;
17678
17679 // Try to use byte rotation instructions.
17680 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17681 Subtarget, DAG))
17682 return Rotate;
17683
17684 if (V2.isUndef()) {
17685 // Try to use bit rotation instructions.
17686 if (SDValue Rotate =
17687 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17688 return Rotate;
17689
17690 SmallVector<int, 8> RepeatedMask;
17691 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17692 // As this is a single-input shuffle, the repeated mask should be
17693 // a strictly valid v8i16 mask that we can pass through to the v8i16
17694 // lowering to handle even the v32 case.
17695 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17696 RepeatedMask, Subtarget, DAG);
17697 }
17698 }
17699
17700 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17701 Zeroable, Subtarget, DAG))
17702 return Blend;
17703
17704 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17705 Zeroable, Subtarget, DAG))
17706 return PSHUFB;
17707
17708 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17709 // shuffle.
17710 if (!V2.isUndef())
17711 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17712 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17713 return Result;
17714
17715 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17716}
17717
17718/// Handle lowering of 64-lane 8-bit integer shuffles.
17719static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17720 const APInt &Zeroable, SDValue V1, SDValue V2,
17721 const X86Subtarget &Subtarget,
17722 SelectionDAG &DAG) {
17723 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17724 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17725 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17726 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17727
17728 // Whenever we can lower this as a zext, that instruction is strictly faster
17729 // than any alternative. It also allows us to fold memory operands into the
17730 // shuffle in many cases.
17731 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17732 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17733 return ZExt;
17734
17735 // Use dedicated unpack instructions for masks that match their pattern.
17736 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17737 return V;
17738
17739 // Use dedicated pack instructions for masks that match their pattern.
17740 if (SDValue V =
17741 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17742 return V;
17743
17744 // Try to use shift instructions.
17745 if (SDValue Shift =
17746 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17747 DAG, /*BitwiseOnly*/ false))
17748 return Shift;
17749
17750 // Try to use byte rotation instructions.
17751 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17752 Subtarget, DAG))
17753 return Rotate;
17754
17755 // Try to use bit rotation instructions.
17756 if (V2.isUndef())
17757 if (SDValue Rotate =
17758 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17759 return Rotate;
17760
17761 // Lower as AND if possible.
17762 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17763 Zeroable, Subtarget, DAG))
17764 return Masked;
17765
17766 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17767 Zeroable, Subtarget, DAG))
17768 return PSHUFB;
17769
17770 // Try to create an in-lane repeating shuffle mask and then shuffle the
17771 // results into the target lanes.
17772 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17773 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17774 return V;
17775
17776 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17777 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17778 return Result;
17779
17780 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17781 Zeroable, Subtarget, DAG))
17782 return Blend;
17783
17784 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17785 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17786 // PALIGNR will be cheaper than the second PSHUFB+OR.
17787 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17788 Mask, Subtarget, DAG))
17789 return V;
17790
17791 // If we can't directly blend but can use PSHUFB, that will be better as it
17792 // can both shuffle and set up the inefficient blend.
17793 bool V1InUse, V2InUse;
17794 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17795 DAG, V1InUse, V2InUse);
17796 }
17797
17798 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17799 // shuffle.
17800 if (!V2.isUndef())
17801 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17802 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17803 return Result;
17804
17805 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17806 if (Subtarget.hasVBMI())
17807 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17808
17809 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17810}
17811
17812/// High-level routine to lower various 512-bit x86 vector shuffles.
17813///
17814/// This routine either breaks down the specific type of a 512-bit x86 vector
17815/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17816/// together based on the available instructions.
17817static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17818 MVT VT, SDValue V1, SDValue V2,
17819 const APInt &Zeroable,
17820 const X86Subtarget &Subtarget,
17821 SelectionDAG &DAG) {
17822 assert(Subtarget.hasAVX512() &&
17823 "Cannot lower 512-bit vectors w/ basic ISA!");
17824
17825 // If we have a single input to the zero element, insert that into V1 if we
17826 // can do so cheaply.
17827 int NumElts = Mask.size();
17828 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17829
17830 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17831 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17832 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17833 return Insertion;
17834
17835 // Handle special cases where the lower or upper half is UNDEF.
17836 if (SDValue V =
17837 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17838 return V;
17839
17840 // Check for being able to broadcast a single element.
17841 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17842 Subtarget, DAG))
17843 return Broadcast;
17844
17845 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17846 // Try using bit ops for masking and blending before falling back to
17847 // splitting.
17848 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17849 Subtarget, DAG))
17850 return V;
17851 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17852 return V;
17853
17854 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17855 }
17856
17857 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17858 if (!Subtarget.hasBWI())
17859 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17860 /*SimpleOnly*/ false);
17861
17862 V1 = DAG.getBitcast(MVT::v32i16, V1);
17863 V2 = DAG.getBitcast(MVT::v32i16, V2);
17864 return DAG.getBitcast(VT,
17865 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17866 }
17867
17868 // Dispatch to each element type for lowering. If we don't have support for
17869 // specific element type shuffles at 512 bits, immediately split them and
17870 // lower them. Each lowering routine of a given type is allowed to assume that
17871 // the requisite ISA extensions for that element type are available.
17872 switch (VT.SimpleTy) {
17873 case MVT::v8f64:
17874 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17875 case MVT::v16f32:
17876 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17877 case MVT::v8i64:
17878 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17879 case MVT::v16i32:
17880 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17881 case MVT::v32i16:
17882 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17883 case MVT::v64i8:
17884 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17885
17886 default:
17887 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17888 }
17889}
17890
17891static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17892 MVT VT, SDValue V1, SDValue V2,
17893 const X86Subtarget &Subtarget,
17894 SelectionDAG &DAG) {
17895 // Shuffle should be unary.
17896 if (!V2.isUndef())
17897 return SDValue();
17898
17899 int ShiftAmt = -1;
17900 int NumElts = Mask.size();
17901 for (int i = 0; i != NumElts; ++i) {
17902 int M = Mask[i];
17903 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17904 "Unexpected mask index.");
17905 if (M < 0)
17906 continue;
17907
17908 // The first non-undef element determines our shift amount.
17909 if (ShiftAmt < 0) {
17910 ShiftAmt = M - i;
17911 // Need to be shifting right.
17912 if (ShiftAmt <= 0)
17913 return SDValue();
17914 }
17915 // All non-undef elements must shift by the same amount.
17916 if (ShiftAmt != M - i)
17917 return SDValue();
17918 }
17919 assert(ShiftAmt >= 0 && "All undef?");
17920
17921 // Great we found a shift right.
17922 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17923 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17924 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17925 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17926 DAG.getVectorIdxConstant(0, DL));
17927}
17928
17929// Determine if this shuffle can be implemented with a KSHIFT instruction.
17930// Returns the shift amount if possible or -1 if not. This is a simplified
17931// version of matchShuffleAsShift.
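// e.g. Mask = {1,2,3,-1} with the top element zeroable matches KSHIFTR by 1,
// while Mask = {-1,0,1,2} with element 0 zeroable matches KSHIFTL by 1.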
17932static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17933 int MaskOffset, const APInt &Zeroable) {
17934 int Size = Mask.size();
17935
17936 auto CheckZeros = [&](int Shift, bool Left) {
17937 for (int j = 0; j < Shift; ++j)
17938 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17939 return false;
17940
17941 return true;
17942 };
17943
17944 auto MatchShift = [&](int Shift, bool Left) {
17945 unsigned Pos = Left ? Shift : 0;
17946 unsigned Low = Left ? 0 : Shift;
17947 unsigned Len = Size - Shift;
17948 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17949 };
17950
17951 for (int Shift = 1; Shift != Size; ++Shift)
17952 for (bool Left : {true, false})
17953 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17954 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17955 return Shift;
17956 }
17957
17958 return -1;
17959}
17960
17961
17962// Lower vXi1 vector shuffles.
17963// There is no dedicated instruction on AVX-512 that shuffles the masks.
17964// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17965// vector, shuffle it, and then truncate it back.
17966static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17967 MVT VT, SDValue V1, SDValue V2,
17968 const APInt &Zeroable,
17969 const X86Subtarget &Subtarget,
17970 SelectionDAG &DAG) {
17971 assert(Subtarget.hasAVX512() &&
17972 "Cannot lower 512-bit vectors w/o basic ISA!");
17973
17974 int NumElts = Mask.size();
17975 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17976
17977 // Try to recognize shuffles that are just padding a subvector with zeros.
17978 int SubvecElts = 0;
17979 int Src = -1;
17980 for (int i = 0; i != NumElts; ++i) {
17981 if (Mask[i] >= 0) {
17982 // Grab the source from the first valid mask. All subsequent elements need
17983 // to use this same source.
17984 if (Src < 0)
17985 Src = Mask[i] / NumElts;
17986 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17987 break;
17988 }
17989
17990 ++SubvecElts;
17991 }
17992 assert(SubvecElts != NumElts && "Identity shuffle?");
17993
17994 // Clip to a power of 2.
17995 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17996
17997 // Make sure the number of zeroable bits in the top at least covers the bits
17998 // not covered by the subvector.
17999 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18000 assert(Src >= 0 && "Expected a source!");
18001 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18002 SDValue Extract =
18003 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18004 DAG.getVectorIdxConstant(0, DL));
18005 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18006 DAG.getConstant(0, DL, VT), Extract,
18007 DAG.getVectorIdxConstant(0, DL));
18008 }
18009
18010 // Try a simple shift right with undef elements. Later we'll try with zeros.
18011 if (SDValue Shift =
18012 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18013 return Shift;
18014
18015 // Try to match KSHIFTs.
18016 unsigned Offset = 0;
18017 for (SDValue V : {V1, V2}) {
18018 unsigned Opcode;
18019 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18020 if (ShiftAmt >= 0) {
18021 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18022 MVT WideVT = Res.getSimpleValueType();
18023 // Widened right shifts need two shifts to ensure we shift in zeroes.
18024 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18025 int WideElts = WideVT.getVectorNumElements();
18026 // Shift left to put the original vector in the MSBs of the new size.
18027 Res =
18028 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18029 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18030 // Increase the shift amount to account for the left shift.
18031 ShiftAmt += WideElts - NumElts;
18032 }
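// e.g. a v8i1 shuffle widened to v16i1 that needs a right shift by 2 becomes
// KSHIFTL by 8 followed by KSHIFTR by 10, so zeroes are shifted in from the top.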
18033
18034 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18035 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18036 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18037 DAG.getVectorIdxConstant(0, DL));
18038 }
18039 Offset += NumElts; // Increment for next iteration.
18040 }
18041
18042 // If we're performing an unary shuffle on a SETCC result, try to shuffle the
18043 // ops instead.
18044 // TODO: What other unary shuffles would benefit from this?
18045 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18046 SDValue Op0 = V1.getOperand(0);
18047 SDValue Op1 = V1.getOperand(1);
18048 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18049 EVT OpVT = Op0.getValueType();
18050 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18051 return DAG.getSetCC(
18052 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18053 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18054 }
18055
18056 MVT ExtVT;
18057 switch (VT.SimpleTy) {
18058 default:
18059 llvm_unreachable("Expected a vector of i1 elements");
18060 case MVT::v2i1:
18061 ExtVT = MVT::v2i64;
18062 break;
18063 case MVT::v4i1:
18064 ExtVT = MVT::v4i32;
18065 break;
18066 case MVT::v8i1:
18067 // Take a 512-bit type; there are more shuffle options on KNL. If we have
18068 // VLX, use a 256-bit shuffle.
18069 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18070 break;
18071 case MVT::v16i1:
18072 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18073 // 256-bit operation available.
18074 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18075 break;
18076 case MVT::v32i1:
18077 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18078 // 256-bit operation available.
18079 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18080 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18081 break;
18082 case MVT::v64i1:
18083 // Fall back to scalarization. FIXME: We can do better if the shuffle
18084 // can be partitioned cleanly.
18085 if (!Subtarget.useBWIRegs())
18086 return SDValue();
18087 ExtVT = MVT::v64i8;
18088 break;
18089 }
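// For example, a v16i1 shuffle on an AVX512F-only target is sign-extended to
// v16i32, shuffled as v16i32, and then truncated (or compared against zero)
// back to v16i1.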
18090
18091 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18092 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18093
18094 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18095 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
18096 int NumElems = VT.getVectorNumElements();
18097 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18098 (Subtarget.hasDQI() && (NumElems < 32)))
18099 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18100 Shuffle, ISD::SETGT);
18101
18102 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18103}
18104
18105/// Helper function that returns true if the shuffle mask should be
18106/// commuted to improve canonicalization.
18107static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18108 int NumElements = Mask.size();
18109
18110 int NumV1Elements = 0, NumV2Elements = 0;
18111 for (int M : Mask)
18112 if (M < 0)
18113 continue;
18114 else if (M < NumElements)
18115 ++NumV1Elements;
18116 else
18117 ++NumV2Elements;
18118
18119 // Commute the shuffle as needed such that more elements come from V1 than
18120 // V2. This allows us to match the shuffle pattern strictly on how many
18121 // elements come from V1 without handling the symmetric cases.
18122 if (NumV2Elements > NumV1Elements)
18123 return true;
18124
18125 assert(NumV1Elements > 0 && "No V1 indices");
18126
18127 if (NumV2Elements == 0)
18128 return false;
18129
18130 // When the number of V1 and V2 elements are the same, try to minimize the
18131 // number of uses of V2 in the low half of the vector. When that is tied,
18132 // ensure that the sum of indices for V1 is equal to or lower than the sum
18133 // of indices for V2. When those are equal, try to ensure that the number of odd
18134 // indices for V1 is lower than the number of odd indices for V2.
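// e.g. for Mask = {4,1,6,3} both inputs supply two elements and the low half is
// tied, but the V2 index sum (0+2) is lower than the V1 index sum (1+3), so the
// shuffle is commuted.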
18135 if (NumV1Elements == NumV2Elements) {
18136 int LowV1Elements = 0, LowV2Elements = 0;
18137 for (int M : Mask.slice(0, NumElements / 2))
18138 if (M >= NumElements)
18139 ++LowV2Elements;
18140 else if (M >= 0)
18141 ++LowV1Elements;
18142 if (LowV2Elements > LowV1Elements)
18143 return true;
18144 if (LowV2Elements == LowV1Elements) {
18145 int SumV1Indices = 0, SumV2Indices = 0;
18146 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18147 if (Mask[i] >= NumElements)
18148 SumV2Indices += i;
18149 else if (Mask[i] >= 0)
18150 SumV1Indices += i;
18151 if (SumV2Indices < SumV1Indices)
18152 return true;
18153 if (SumV2Indices == SumV1Indices) {
18154 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18155 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18156 if (Mask[i] >= NumElements)
18157 NumV2OddIndices += i % 2;
18158 else if (Mask[i] >= 0)
18159 NumV1OddIndices += i % 2;
18160 if (NumV2OddIndices < NumV1OddIndices)
18161 return true;
18162 }
18163 }
18164 }
18165
18166 return false;
18167}
18168
18169static bool canCombineAsMaskOperation(SDValue V,
18170 const X86Subtarget &Subtarget) {
18171 if (!Subtarget.hasAVX512())
18172 return false;
18173
18174 if (!V.getValueType().isSimple())
18175 return false;
18176
18177 MVT VT = V.getSimpleValueType().getScalarType();
18178 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18179 return false;
18180
18181 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18182 // are preferable to blendw/blendvb/masked-mov.
18183 if ((VT == MVT::i16 || VT == MVT::i8) &&
18184 V.getSimpleValueType().getSizeInBits() < 512)
18185 return false;
18186
18187 auto HasMaskOperation = [&](SDValue V) {
18188 // TODO: Currently we only check a limited set of opcodes. We could probably
18189 // extend this to all binary operations by checking TLI.isBinOp().
18190 switch (V->getOpcode()) {
18191 default:
18192 return false;
18193 case ISD::ADD:
18194 case ISD::SUB:
18195 case ISD::AND:
18196 case ISD::XOR:
18197 case ISD::OR:
18198 case ISD::SMAX:
18199 case ISD::SMIN:
18200 case ISD::UMAX:
18201 case ISD::UMIN:
18202 case ISD::ABS:
18203 case ISD::SHL:
18204 case ISD::SRL:
18205 case ISD::SRA:
18206 case ISD::MUL:
18207 break;
18208 }
18209 if (!V->hasOneUse())
18210 return false;
18211
18212 return true;
18213 };
18214
18215 if (HasMaskOperation(V))
18216 return true;
18217
18218 return false;
18219}
18220
18221// Forward declaration.
18222static SDValue canonicalizeShuffleMaskWithHorizOp(
18223 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18224 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18225 const X86Subtarget &Subtarget);
18226
18227 /// Top-level lowering for x86 vector shuffles.
18228///
18229/// This handles decomposition, canonicalization, and lowering of all x86
18230/// vector shuffles. Most of the specific lowering strategies are encapsulated
18231/// above in helper routines. The canonicalization attempts to widen shuffles
18232/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18233/// s.t. only one of the two inputs needs to be tested, etc.
18234static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18235 SelectionDAG &DAG) {
18236 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18237 ArrayRef<int> OrigMask = SVOp->getMask();
18238 SDValue V1 = Op.getOperand(0);
18239 SDValue V2 = Op.getOperand(1);
18240 MVT VT = Op.getSimpleValueType();
18241 int NumElements = VT.getVectorNumElements();
18242 SDLoc DL(Op);
18243 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18244
18245 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18246 "Can't lower MMX shuffles");
18247
18248 bool V1IsUndef = V1.isUndef();
18249 bool V2IsUndef = V2.isUndef();
18250 if (V1IsUndef && V2IsUndef)
18251 return DAG.getUNDEF(VT);
18252
18253 // When we create a shuffle node we put the UNDEF node to second operand,
18254 // but in some cases the first operand may be transformed to UNDEF.
18255 // In this case we should just commute the node.
18256 if (V1IsUndef)
18257 return DAG.getCommutedVectorShuffle(*SVOp);
18258
18259 // Check for non-undef masks pointing at an undef vector and make the masks
18260 // undef as well. This makes it easier to match the shuffle based solely on
18261 // the mask.
18262 if (V2IsUndef &&
18263 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18264 SmallVector<int, 8> NewMask(OrigMask);
18265 for (int &M : NewMask)
18266 if (M >= NumElements)
18267 M = -1;
18268 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18269 }
18270
18271 // Check for illegal shuffle mask element index values.
18272 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18273 (void)MaskUpperLimit;
18274 assert(llvm::all_of(OrigMask,
18275 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18276 "Out of bounds shuffle index");
18277
18278 // We actually see shuffles that are entirely re-arrangements of a set of
18279 // zero inputs. This mostly happens while decomposing complex shuffles into
18280 // simple ones. Directly lower these as a buildvector of zeros.
18281 APInt KnownUndef, KnownZero;
18282 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18283
18284 APInt Zeroable = KnownUndef | KnownZero;
18285 if (Zeroable.isAllOnes())
18286 return getZeroVector(VT, Subtarget, DAG, DL);
18287
18288 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18289
18290 // Try to collapse shuffles into using a vector type with fewer elements but
18291 // wider element types. We cap this to not form integers or floating point
18292 // elements wider than 64 bits. It does not seem beneficial to form i128
18293 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
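// e.g. a v4i32 shuffle with mask {0,1,6,7} can be widened to a v2i64 shuffle
// with mask {0,3}.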
18294 SmallVector<int, 16> WidenedMask;
18295 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18296 !canCombineAsMaskOperation(V1, Subtarget) &&
18297 !canCombineAsMaskOperation(V2, Subtarget) &&
18298 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18299 // Shuffle mask widening should not interfere with a broadcast opportunity
18300 // by obfuscating the operands with bitcasts.
18301 // TODO: Avoid lowering directly from this top-level function: make this
18302 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18303 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18304 Subtarget, DAG))
18305 return Broadcast;
18306
18307 MVT NewEltVT = VT.isFloatingPoint()
18308 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18309 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18310 int NewNumElts = NumElements / 2;
18311 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18312 // Make sure that the new vector type is legal. For example, v2f64 isn't
18313 // legal on SSE1.
18314 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18315 if (V2IsZero) {
18316 // Modify the new Mask to take all zeros from the all-zero vector.
18317 // Choose indices that are blend-friendly.
18318 bool UsedZeroVector = false;
18319 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18320 "V2's non-undef elements are used?!");
18321 for (int i = 0; i != NewNumElts; ++i)
18322 if (WidenedMask[i] == SM_SentinelZero) {
18323 WidenedMask[i] = i + NewNumElts;
18324 UsedZeroVector = true;
18325 }
18326 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18327 // some elements to be undef.
18328 if (UsedZeroVector)
18329 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18330 }
18331 V1 = DAG.getBitcast(NewVT, V1);
18332 V2 = DAG.getBitcast(NewVT, V2);
18333 return DAG.getBitcast(
18334 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18335 }
18336 }
18337
18338 SmallVector<SDValue> Ops = {V1, V2};
18339 SmallVector<int> Mask(OrigMask);
18340
18341 // Canonicalize the shuffle with any horizontal ops inputs.
18342 // NOTE: This may update Ops and Mask.
18343 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18344 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18345 return DAG.getBitcast(VT, HOp);
18346
18347 V1 = DAG.getBitcast(VT, Ops[0]);
18348 V2 = DAG.getBitcast(VT, Ops[1]);
18349 assert(NumElements == (int)Mask.size() &&
18350 "canonicalizeShuffleMaskWithHorizOp "
18351 "shouldn't alter the shuffle mask size");
18352
18353 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18354 // These will be materialized uniformly anyway, so make splat matching easier.
18355 // TODO: Allow all int constants?
18356 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18357 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18358 BitVector Undefs;
18359 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18360 if (Undefs.any() &&
18361 (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18362 isa<ConstantFPSDNode>(Splat))) {
18363 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18364 }
18365 }
18366 }
18367 return V;
18368 };
18369 V1 = CanonicalizeConstant(V1);
18370 V2 = CanonicalizeConstant(V2);
18371
18372 // Commute the shuffle if it will improve canonicalization.
18373 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18374 ShuffleVectorSDNode::commuteMask(Mask);
18375 std::swap(V1, V2);
18376 }
18377
18378 // For each vector width, delegate to a specialized lowering routine.
18379 if (VT.is128BitVector())
18380 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18381
18382 if (VT.is256BitVector())
18383 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18384
18385 if (VT.is512BitVector())
18386 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18387
18388 if (Is1BitVector)
18389 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18390
18391 llvm_unreachable("Unimplemented!");
18392}
18393
18394// As legal vpcompress instructions depend on various AVX512 extensions, try to
18395// convert illegal vector sizes to legal ones to avoid expansion.
18396static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18397 SelectionDAG &DAG) {
18398 assert(Subtarget.hasAVX512() &&
18399 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18400
18401 SDLoc DL(Op);
18402 SDValue Vec = Op.getOperand(0);
18403 SDValue Mask = Op.getOperand(1);
18404 SDValue Passthru = Op.getOperand(2);
18405
18406 EVT VecVT = Vec.getValueType();
18407 EVT ElementVT = VecVT.getVectorElementType();
18408 unsigned NumElements = VecVT.getVectorNumElements();
18409 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18410 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18411
18412 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18413 // compressed as 512-bit vectors in AVX512F.
18414 if (NumVecBits != 128 && NumVecBits != 256)
18415 return SDValue();
18416
18417 if (NumElementBits == 32 || NumElementBits == 64) {
18418 unsigned NumLargeElements = 512 / NumElementBits;
18419 MVT LargeVecVT =
18420 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18421 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18422
18423 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18424 DAG, DL);
18425 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18426 Subtarget, DAG, DL);
18427 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18428 : widenSubVector(LargeVecVT, Passthru,
18429 /*ZeroNewElements=*/false,
18430 Subtarget, DAG, DL);
18431
18432 SDValue Compressed =
18433 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18434 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18435 DAG.getConstant(0, DL, MVT::i64));
18436 }
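// e.g. a v4i32 VECTOR_COMPRESS is widened to v16i32: the vector and passthru are
// widened with undef upper elements, the mask is widened with zeros, the 512-bit
// compress is emitted, and the low 128 bits of the result are extracted.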
18437
18438 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18439 VecVT == MVT::v16i16) {
18440 MVT LageElementVT = MVT::getIntegerVT(512 / NumElements);
18441 EVT LargeVecVT = MVT::getVectorVT(LageElementVT, NumElements);
18442
18443 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18444 Passthru = Passthru.isUndef()
18445 ? DAG.getUNDEF(LargeVecVT)
18446 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18447
18448 SDValue Compressed =
18449 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18450 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18451 }
18452
18453 return SDValue();
18454}
18455
18456/// Try to lower a VSELECT instruction to a vector shuffle.
18457static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18458 const X86Subtarget &Subtarget,
18459 SelectionDAG &DAG) {
18460 SDValue Cond = Op.getOperand(0);
18461 SDValue LHS = Op.getOperand(1);
18462 SDValue RHS = Op.getOperand(2);
18463 MVT VT = Op.getSimpleValueType();
18464
18465 // Only non-legal VSELECTs reach this lowering; convert those into generic
18466 // shuffles and re-use the shuffle lowering path for blends.
18467 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18468 SmallVector<int, 32> Mask;
18469 if (createShuffleMaskFromVSELECT(Mask, Cond))
18470 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18471 }
18472
18473 return SDValue();
18474}
18475
18476SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18477 SDValue Cond = Op.getOperand(0);
18478 SDValue LHS = Op.getOperand(1);
18479 SDValue RHS = Op.getOperand(2);
18480
18481 SDLoc dl(Op);
18482 MVT VT = Op.getSimpleValueType();
18483 if (isSoftF16(VT, Subtarget)) {
18484 MVT NVT = VT.changeVectorElementTypeToInteger();
18485 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18486 DAG.getBitcast(NVT, LHS),
18487 DAG.getBitcast(NVT, RHS)));
18488 }
18489
18490 // A vselect where all conditions and data are constants can be optimized into
18491 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18492 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18493 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18494 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18495 return SDValue();
18496
18497 // Try to lower this to a blend-style vector shuffle. This can handle all
18498 // constant condition cases.
18499 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18500 return BlendOp;
18501
18502 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18503 // with patterns on the mask registers on AVX-512.
18504 MVT CondVT = Cond.getSimpleValueType();
18505 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18506 if (CondEltSize == 1)
18507 return Op;
18508
18509 // Variable blends are only legal from SSE4.1 onward.
18510 if (!Subtarget.hasSSE41())
18511 return SDValue();
18512
18513 unsigned EltSize = VT.getScalarSizeInBits();
18514 unsigned NumElts = VT.getVectorNumElements();
18515
18516 // Expand v32i16/v64i8 without BWI.
18517 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18518 return SDValue();
18519
18520 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18521 // into an i1 condition so that we can use the mask-based 512-bit blend
18522 // instructions.
18523 if (VT.getSizeInBits() == 512) {
18524 // Build a mask by testing the condition against zero.
18525 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18526 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18527 DAG.getConstant(0, dl, CondVT),
18528 ISD::SETNE);
18529 // Now return a new VSELECT using the mask.
18530 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18531 }
18532
18533 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18534 if (CondEltSize != EltSize) {
18535 // If we don't have a sign splat, rely on the expansion.
18536 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18537 return SDValue();
18538
18539 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18540 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18541 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18542 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18543 }
18544
18545 // For v16i16/v32i8 selects without AVX2, if the condition and another operand
18546 // are free to split, then it's better to split before expanding the
18547 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18548 // TODO: This is very similar to narrowVectorSelect.
18549 // TODO: Add Load splitting to isFreeToSplitVector ?
18550 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18551 !Subtarget.hasXOP()) {
18552 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18553 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18554 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18555 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18556 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18557 if (FreeCond && (FreeLHS || FreeRHS))
18558 return splitVectorOp(Op, DAG, dl);
18559 }
18560
18561 // Only some types will be legal on some subtargets. If we can emit a legal
18562 // VSELECT-matching blend, return Op, but if we need to expand, return
18563 // a null value.
18564 switch (VT.SimpleTy) {
18565 default:
18566 // Most of the vector types have blends past SSE4.1.
18567 return Op;
18568
18569 case MVT::v32i8:
18570 // The byte blends for AVX vectors were introduced only in AVX2.
18571 if (Subtarget.hasAVX2())
18572 return Op;
18573
18574 return SDValue();
18575
18576 case MVT::v8i16:
18577 case MVT::v16i16:
18578 case MVT::v8f16:
18579 case MVT::v16f16: {
18580 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18581 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18582 Cond = DAG.getBitcast(CastVT, Cond);
18583 LHS = DAG.getBitcast(CastVT, LHS);
18584 RHS = DAG.getBitcast(CastVT, RHS);
18585 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18586 return DAG.getBitcast(VT, Select);
18587 }
18588 }
18589}
18590
18591static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18592 MVT VT = Op.getSimpleValueType();
18593 SDValue Vec = Op.getOperand(0);
18594 SDValue Idx = Op.getOperand(1);
18595 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18596 SDLoc dl(Op);
18597
18598 if (!Vec.getSimpleValueType().is128BitVector())
18599 return SDValue();
18600
18601 if (VT.getSizeInBits() == 8) {
18602 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18603 // we're going to zero extend the register or fold the store.
18604 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18605 !X86::mayFoldIntoStore(Op))
18606 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18607 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18608 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18609
18610 unsigned IdxVal = Idx->getAsZExtVal();
18611 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18612 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18613 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18614 }
18615
18616 if (VT == MVT::f32) {
18617 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18618 // the result back to FR32 register. It's only worth matching if the
18619 // result has a single use which is a store or a bitcast to i32. And in
18620 // the case of a store, it's not worth it if the index is a constant 0,
18621 // because a MOVSSmr can be used instead, which is smaller and faster.
18622 if (!Op.hasOneUse())
18623 return SDValue();
18624 SDNode *User = *Op.getNode()->user_begin();
18625 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18626 (User->getOpcode() != ISD::BITCAST ||
18627 User->getValueType(0) != MVT::i32))
18628 return SDValue();
18629 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18630 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18631 return DAG.getBitcast(MVT::f32, Extract);
18632 }
18633
18634 if (VT == MVT::i32 || VT == MVT::i64)
18635 return Op;
18636
18637 return SDValue();
18638}
18639
18640/// Extract one bit from mask vector, like v16i1 or v8i1.
18641/// AVX-512 feature.
18642static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18643 const X86Subtarget &Subtarget) {
18644 SDValue Vec = Op.getOperand(0);
18645 SDLoc dl(Vec);
18646 MVT VecVT = Vec.getSimpleValueType();
18647 SDValue Idx = Op.getOperand(1);
18648 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18649 MVT EltVT = Op.getSimpleValueType();
18650
18651 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18652 "Unexpected vector type in ExtractBitFromMaskVector");
18653
18654 // A variable index can't be handled in mask registers;
18655 // extend the vector to VR512/VR128.
18656 if (!IdxC) {
18657 unsigned NumElts = VecVT.getVectorNumElements();
18658 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18659 // than extending to 128/256-bit.
18660 if (NumElts == 1) {
18661 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18662 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18663 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18664 }
18665 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18666 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18667 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18668 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18669 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18670 }
18671
18672 unsigned IdxVal = IdxC->getZExtValue();
18673 if (IdxVal == 0) // the operation is legal
18674 return Op;
18675
18676 // Extend to natively supported kshift.
18677 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18678
18679 // Use kshiftr instruction to move to the lower element.
18680 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18681 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18682
18683 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18684 DAG.getVectorIdxConstant(0, dl));
18685}
18686
18687// Helper to find all the extracted elements from a vector.
18688static APInt getExtractedDemandedElts(SDNode *N) {
18689 MVT VT = N->getSimpleValueType(0);
18690 unsigned NumElts = VT.getVectorNumElements();
18691 APInt DemandedElts = APInt::getZero(NumElts);
18692 for (SDNode *User : N->users()) {
18693 switch (User->getOpcode()) {
18694 case X86ISD::PEXTRB:
18695 case X86ISD::PEXTRW:
18696 case ISD::EXTRACT_VECTOR_ELT:
18697 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18698 DemandedElts.setAllBits();
18699 return DemandedElts;
18700 }
18701 DemandedElts.setBit(User->getConstantOperandVal(1));
18702 break;
18703 case ISD::BITCAST: {
18704 if (!User->getValueType(0).isSimple() ||
18705 !User->getValueType(0).isVector()) {
18706 DemandedElts.setAllBits();
18707 return DemandedElts;
18708 }
18709 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18710 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18711 break;
18712 }
18713 default:
18714 DemandedElts.setAllBits();
18715 return DemandedElts;
18716 }
18717 }
18718 return DemandedElts;
18719}
18720
18721SDValue
18722X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18723 SelectionDAG &DAG) const {
18724 SDLoc dl(Op);
18725 SDValue Vec = Op.getOperand(0);
18726 MVT VecVT = Vec.getSimpleValueType();
18727 SDValue Idx = Op.getOperand(1);
18728 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18729
18730 if (VecVT.getVectorElementType() == MVT::i1)
18731 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18732
18733 if (!IdxC) {
18734 // It's more profitable to go through memory (1 cycle throughput)
18735 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18736 // IACA tool was used to get performance estimation
18737 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18738 //
18739 // example : extractelement <16 x i8> %a, i32 %i
18740 //
18741 // Block Throughput: 3.00 Cycles
18742 // Throughput Bottleneck: Port5
18743 //
18744 // | Num Of | Ports pressure in cycles | |
18745 // | Uops | 0 - DV | 5 | 6 | 7 | |
18746 // ---------------------------------------------
18747 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18748 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18749 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18750 // Total Num Of Uops: 4
18751 //
18752 //
18753 // Block Throughput: 1.00 Cycles
18754 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18755 //
18756 // | | Ports pressure in cycles | |
18757 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18758 // ---------------------------------------------------------
18759 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18760 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18761 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18762 // Total Num Of Uops: 4
18763
18764 return SDValue();
18765 }
18766
18767 unsigned IdxVal = IdxC->getZExtValue();
18768
18769 // If this is a 256-bit vector result, first extract the 128-bit vector and
18770 // then extract the element from the 128-bit vector.
18771 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18772 // Get the 128-bit vector.
18773 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18774 MVT EltVT = VecVT.getVectorElementType();
18775
18776 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18777 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18778
18779 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18780 // this can be done with a mask.
18781 IdxVal &= ElemsPerChunk - 1;
18782 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18783 DAG.getVectorIdxConstant(IdxVal, dl));
18784 }
18785
18786 assert(VecVT.is128BitVector() && "Unexpected vector length");
18787
18788 MVT VT = Op.getSimpleValueType();
18789
18790 if (VT == MVT::i16) {
18791 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18792 // we're going to zero extend the register or fold the store (SSE41 only).
18793 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18794 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18795 if (Subtarget.hasFP16())
18796 return Op;
18797
18798 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18799 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18800 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18801 }
18802
18803 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18804 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18805 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18806 }
18807
18808 if (Subtarget.hasSSE41())
18809 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18810 return Res;
18811
18812 // Only extract a single element from a v16i8 source - determine the common
18813 // DWORD/WORD that all extractions share, and extract the sub-byte.
18814 // TODO: Add QWORD MOVQ extraction?
18815 if (VT == MVT::i8) {
18816 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18817 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18818
18819 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18820 int DWordIdx = IdxVal / 4;
18821 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18822 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18823 DAG.getBitcast(MVT::v4i32, Vec),
18824 DAG.getVectorIdxConstant(DWordIdx, dl));
18825 int ShiftVal = (IdxVal % 4) * 8;
18826 if (ShiftVal != 0)
18827 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18828 DAG.getConstant(ShiftVal, dl, MVT::i8));
18829 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18830 }
18831
18832 int WordIdx = IdxVal / 2;
18833 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18834 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18835 DAG.getBitcast(MVT::v8i16, Vec),
18836 DAG.getVectorIdxConstant(WordIdx, dl));
18837 int ShiftVal = (IdxVal % 2) * 8;
18838 if (ShiftVal != 0)
18839 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18840 DAG.getConstant(ShiftVal, dl, MVT::i8));
18841 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18842 }
18843 }
18844
18845 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18846 if (IdxVal == 0)
18847 return Op;
18848
18849 // Shuffle the element to the lowest element, then movss or movsh.
18850 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18851 Mask[0] = static_cast<int>(IdxVal);
18852 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18853 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18854 DAG.getVectorIdxConstant(0, dl));
18855 }
18856
18857 if (VT.getSizeInBits() == 64) {
18858 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18859 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18860 // to match extract_elt for f64.
18861 if (IdxVal == 0)
18862 return Op;
18863
18864 // UNPCKHPD the element to the lowest double word, then movsd.
18865 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18866 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18867 int Mask[2] = { 1, -1 };
18868 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18869 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18870 DAG.getVectorIdxConstant(0, dl));
18871 }
18872
18873 return SDValue();
18874}
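// Illustrative sketch of the v16i8 path above (hypothetical input, assuming
// the byte at index 5 is the only extracted element): DWordIdx is 1, so the
// DWORD path is skipped; WordIdx is 2 and the demanded mask is contained in
// bytes {4,5} (word 2), so the lowering becomes
//   (trunc (srl (extract_vector_elt (bitcast v8i16), 2), 8))
// i.e. a single word extract plus shift and truncate, which avoids needing
// a byte-granular PEXTRB (SSE4.1) for the extraction.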
18875
18876/// Insert one bit to mask vector, like v16i1 or v8i1.
18877/// AVX-512 feature.
18878static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18879 const X86Subtarget &Subtarget) {
18880 SDLoc dl(Op);
18881 SDValue Vec = Op.getOperand(0);
18882 SDValue Elt = Op.getOperand(1);
18883 SDValue Idx = Op.getOperand(2);
18884 MVT VecVT = Vec.getSimpleValueType();
18885
18886 if (!isa<ConstantSDNode>(Idx)) {
18887 // Non-constant index. Extend source and destination,
18888 // insert element and then truncate the result.
18889 unsigned NumElts = VecVT.getVectorNumElements();
18890 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18891 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18892 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18893 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18894 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18895 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18896 }
18897
18898 // Copy into a k-register, extract to v1i1 and insert_subvector.
18899 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18900 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18901}
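// Illustrative example (not part of the original source): inserting an i1
// into v8i1 at a non-constant index sign-extends the mask to v8i16
// (128 / 8 == 16 bits per element), performs a normal INSERT_VECTOR_ELT,
// and truncates back to v8i1. With a constant index the element is moved
// into a k-register as v1i1 and merged with INSERT_SUBVECTOR instead.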
18902
18903SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18904 SelectionDAG &DAG) const {
18905 MVT VT = Op.getSimpleValueType();
18906 MVT EltVT = VT.getVectorElementType();
18907 unsigned NumElts = VT.getVectorNumElements();
18908 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18909
18910 if (EltVT == MVT::i1)
18911 return InsertBitToMaskVector(Op, DAG, Subtarget);
18912
18913 SDLoc dl(Op);
18914 SDValue N0 = Op.getOperand(0);
18915 SDValue N1 = Op.getOperand(1);
18916 SDValue N2 = Op.getOperand(2);
18917 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18918
18919 if (EltVT == MVT::bf16) {
18920 MVT IVT = VT.changeVectorElementTypeToInteger();
18921 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18922 DAG.getBitcast(IVT, N0),
18923 DAG.getBitcast(MVT::i16, N1), N2);
18924 return DAG.getBitcast(VT, Res);
18925 }
18926
18927 if (!N2C) {
18928 // Variable insertion indices, usually we're better off spilling to stack,
18929 // but AVX512 can use a variable compare+select by comparing against all
18930 // possible vector indices, and FP insertion has less gpr->simd traffic.
18931 if (!(Subtarget.hasBWI() ||
18932 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18933 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18934 return SDValue();
18935
18936 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18937 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18938 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18939 return SDValue();
18940
18941 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18942 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18943 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18944
18945 SmallVector<SDValue, 16> RawIndices;
18946 for (unsigned I = 0; I != NumElts; ++I)
18947 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18948 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18949
18950 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18951 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18952 ISD::CondCode::SETEQ);
18953 }
18954
18955 if (N2C->getAPIntValue().uge(NumElts))
18956 return SDValue();
18957 uint64_t IdxVal = N2C->getZExtValue();
18958
18959 bool IsZeroElt = X86::isZeroNode(N1);
18960 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18961
18962 if (IsZeroElt || IsAllOnesElt) {
18963 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18964 // We don't deal with i8 0 since it appears to be handled elsewhere.
18965 if (IsAllOnesElt &&
18966 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18967 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18968 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18969 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18970 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18971 CstVectorElts[IdxVal] = OnesCst;
18972 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18973 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18974 }
18975 // See if we can do this more efficiently with a blend shuffle with a
18976 // rematerializable vector.
18977 if (Subtarget.hasSSE41() &&
18978 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18979 SmallVector<int, 8> BlendMask;
18980 for (unsigned i = 0; i != NumElts; ++i)
18981 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18982 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18983 : getOnesVector(VT, DAG, dl);
18984 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18985 }
18986 }
18987
18988 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18989 // into that, and then insert the subvector back into the result.
18990 if (VT.is256BitVector() || VT.is512BitVector()) {
18991 // With a 256-bit vector, we can insert into the zero element efficiently
18992 // using a blend if we have AVX or AVX2 and the right data type.
18993 if (VT.is256BitVector() && IdxVal == 0) {
18994 // TODO: It is worthwhile to cast integer to floating point and back
18995 // and incur a domain crossing penalty if that's what we'll end up
18996 // doing anyway after extracting to a 128-bit vector.
18997 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18998 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18999 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19000 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19001 DAG.getTargetConstant(1, dl, MVT::i8));
19002 }
19003 }
19004
19005 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19006 assert(isPowerOf2_32(NumEltsIn128) &&
19007 "Vectors will always have power-of-two number of elements.");
19008
19009 // If we are not inserting into the low 128-bit vector chunk,
19010 // then prefer the broadcast+blend sequence.
19011 // FIXME: relax the profitability check iff all N1 uses are insertions.
19012 if (IdxVal >= NumEltsIn128 &&
19013 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19014 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19015 X86::mayFoldLoad(N1, Subtarget)))) {
19016 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19017 SmallVector<int, 8> BlendMask;
19018 for (unsigned i = 0; i != NumElts; ++i)
19019 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19020 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19021 }
19022
19023 // Get the desired 128-bit vector chunk.
19024 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19025
19026 // Insert the element into the desired chunk.
19027 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19028 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19029
19030 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19031 DAG.getVectorIdxConstant(IdxIn128, dl));
19032
19033 // Insert the changed part back into the bigger vector
19034 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19035 }
19036 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19037
19038 // This will be just movw/movd/movq/movsh/movss/movsd.
19039 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19040 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19041 EltVT == MVT::f16 || EltVT == MVT::i64) {
19042 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19043 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19044 }
19045
19046 // We can't directly insert an i8 or i16 into a vector, so zero extend
19047 // it to i32 first.
19048 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19049 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19050 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19051 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19052 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19053 return DAG.getBitcast(VT, N1);
19054 }
19055 }
19056
19057 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19058 // argument. SSE41 required for pinsrb.
19059 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19060 unsigned Opc;
19061 if (VT == MVT::v8i16) {
19062 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19063 Opc = X86ISD::PINSRW;
19064 } else {
19065 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19066 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19067 Opc = X86ISD::PINSRB;
19068 }
19069
19070 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19071 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19072 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19073 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19074 }
19075
19076 if (Subtarget.hasSSE41()) {
19077 if (EltVT == MVT::f32) {
19078 // Bits [7:6] of the constant are the source select. This will always be
19079 // zero here. The DAG Combiner may combine an extract_elt index into
19080 // these bits. For example (insert (extract, 3), 2) could be matched by
19081 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19082 // Bits [5:4] of the constant are the destination select. This is the
19083 // value of the incoming immediate.
19084 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19085 // combine either bitwise AND or insert of float 0.0 to set these bits.
19086
19087 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19088 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19089 // If this is an insertion of 32-bits into the low 32-bits of
19090 // a vector, we prefer to generate a blend with immediate rather
19091 // than an insertps. Blends are simpler operations in hardware and so
19092 // will always have equal or better performance than insertps.
19093 // But if optimizing for size and there's a load folding opportunity,
19094 // generate insertps because blendps does not have a 32-bit memory
19095 // operand form.
19096 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19097 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19098 DAG.getTargetConstant(1, dl, MVT::i8));
19099 }
19100 // Create this as a scalar to vector.
19101 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19102 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19103 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19104 }
19105
19106 // PINSR* works with constant index.
19107 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19108 return Op;
19109 }
19110
19111 return SDValue();
19112}
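// Worked example for the INSERTPS path above (illustrative only): inserting
// an f32 into element 2 of a v4f32 emits INSERTPS with immediate
// (IdxVal << 4) == 0x20, i.e. source select [7:6] = 0, destination select
// [5:4] = 2, zero mask [3:0] = 0. Inserting into element 0 instead prefers
// BLENDPS with immediate 1 unless optimizing for size with a foldable load.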
19113
19114static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19115 SelectionDAG &DAG) {
19116 SDLoc dl(Op);
19117 MVT OpVT = Op.getSimpleValueType();
19118
19119 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
19120 // further combines.
19121 if (X86::isZeroNode(Op.getOperand(0)))
19122 return getZeroVector(OpVT, Subtarget, DAG, dl);
19123
19124 // If this is a 256-bit vector result, first insert into a 128-bit
19125 // vector and then insert into the 256-bit vector.
19126 if (!OpVT.is128BitVector()) {
19127 // Insert into a 128-bit vector.
19128 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19129 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19130 OpVT.getVectorNumElements() / SizeFactor);
19131
19132 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19133
19134 // Insert the 128-bit vector.
19135 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19136 }
19137 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19138 "Expected an SSE type!");
19139
19140 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19141 // tblgen.
19142 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19143 return Op;
19144
19145 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19146 return DAG.getBitcast(
19147 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19148}
19149
19150// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19151// simple superregister reference or explicit instructions to insert
19152// the upper bits of a vector.
19153static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19154 SelectionDAG &DAG) {
19155 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19156
19157 return insert1BitVector(Op, DAG, Subtarget);
19158}
19159
19160static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19161 SelectionDAG &DAG) {
19162 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19163 "Only vXi1 extract_subvectors need custom lowering");
19164
19165 SDLoc dl(Op);
19166 SDValue Vec = Op.getOperand(0);
19167 uint64_t IdxVal = Op.getConstantOperandVal(1);
19168
19169 if (IdxVal == 0) // the operation is legal
19170 return Op;
19171
19172 // Extend to natively supported kshift.
19173 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19174
19175 // Shift to the LSB.
19176 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19177 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19178
19179 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19180 DAG.getVectorIdxConstant(0, dl));
19181}
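// Illustrative example (assumed, not from the source): extracting the upper
// v8i1 half of a v16i1 mask (IdxVal == 8) widens the mask to a legal
// k-register width if necessary, shifts it right by 8 with KSHIFTR, and then
// takes the subvector at index 0, which is a legal operation.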
19182
19183// Returns the appropriate wrapper opcode for a global reference.
19184unsigned X86TargetLowering::getGlobalWrapperKind(
19185 const GlobalValue *GV, const unsigned char OpFlags) const {
19186 // References to absolute symbols are never PC-relative.
19187 if (GV && GV->isAbsoluteSymbolRef())
19188 return X86ISD::Wrapper;
19189
19190 // The following OpFlags under RIP-rel PIC use RIP.
19191 if (Subtarget.isPICStyleRIPRel() &&
19192 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19193 OpFlags == X86II::MO_DLLIMPORT))
19194 return X86ISD::WrapperRIP;
19195
19196 // GOTPCREL references must always use RIP.
19197 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19198 return X86ISD::WrapperRIP;
19199
19200 return X86ISD::Wrapper;
19201}
19202
19203// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19204// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19205// one of the above mentioned nodes. It has to be wrapped because otherwise
19206// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19207// be used to form addressing mode. These wrapped nodes will be selected
19208// into MOV32ri.
19209SDValue
19210X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19211 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19212
19213 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19214 // global base reg.
19215 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19216
19217 auto PtrVT = getPointerTy(DAG.getDataLayout());
19218 SDValue Result = DAG.getTargetConstantPool(
19219 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19220 SDLoc DL(CP);
19221 Result =
19222 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19223 // With PIC, the address is actually $g + Offset.
19224 if (OpFlag) {
19225 Result =
19226 DAG.getNode(ISD::ADD, DL, PtrVT,
19227 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19228 }
19229
19230 return Result;
19231}
19232
19233SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19234 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19235
19236 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19237 // global base reg.
19238 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19239
19240 EVT PtrVT = Op.getValueType();
19241 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19242 SDLoc DL(JT);
19243 Result =
19244 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19245
19246 // With PIC, the address is actually $g + Offset.
19247 if (OpFlag)
19248 Result =
19249 DAG.getNode(ISD::ADD, DL, PtrVT,
19250 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19251
19252 return Result;
19253}
19254
19255SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19256 SelectionDAG &DAG) const {
19257 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19258}
19259
19260SDValue
19261X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19262 // Create the TargetBlockAddressAddress node.
19263 unsigned char OpFlags =
19264 Subtarget.classifyBlockAddressReference();
19265 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19266 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19267 SDLoc dl(Op);
19268 EVT PtrVT = Op.getValueType();
19269 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19270 Result =
19271 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19272
19273 // With PIC, the address is actually $g + Offset.
19274 if (isGlobalRelativeToPICBase(OpFlags)) {
19275 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19276 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19277 }
19278
19279 return Result;
19280}
19281
19282/// Creates target global address or external symbol nodes for calls or
19283/// other uses.
19284SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19285 bool ForCall,
19286 bool *IsImpCall) const {
19287 // Unpack the global address or external symbol.
19288 SDLoc dl(Op);
19289 const GlobalValue *GV = nullptr;
19290 int64_t Offset = 0;
19291 const char *ExternalSym = nullptr;
19292 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19293 GV = G->getGlobal();
19294 Offset = G->getOffset();
19295 } else {
19296 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19297 ExternalSym = ES->getSymbol();
19298 }
19299
19300 // Calculate some flags for address lowering.
19301 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19302 unsigned char OpFlags;
19303 if (ForCall)
19304 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19305 else
19306 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19307 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19308 bool NeedsLoad = isGlobalStubReference(OpFlags);
19309
19310 CodeModel::Model M = DAG.getTarget().getCodeModel();
19311 EVT PtrVT = Op.getValueType();
19312 SDValue Result;
19313
19314 if (GV) {
19315 // Create a target global address if this is a global. If possible, fold the
19316 // offset into the global address reference. Otherwise, ADD it on later.
19317 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19318 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19319 // relocation will compute to a negative value, which is invalid.
19320 int64_t GlobalOffset = 0;
19321 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19322 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19323 std::swap(GlobalOffset, Offset);
19324 }
19325 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19326 } else {
19327 // If this is not a global address, this must be an external symbol.
19328 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19329 }
19330
19331 // If this is a direct call, avoid the wrapper if we don't need to do any
19332 // loads or adds. This allows SDAG ISel to match direct calls.
19333 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19334 return Result;
19335
19336 // If Import Call Optimization is enabled and this is an imported function
19337 // then make a note of it and return the global address without wrapping.
19338 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19339 Mod.getModuleFlag("import-call-optimization")) {
19340 assert(ForCall && "Should only enable import call optimization if we are "
19341 "lowering a call");
19342 *IsImpCall = true;
19343 return Result;
19344 }
19345
19346 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19347
19348 // With PIC, the address is actually $g + Offset.
19349 if (HasPICReg) {
19350 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19351 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19352 }
19353
19354 // For globals that require a load from a stub to get the address, emit the
19355 // load.
19356 if (NeedsLoad)
19357 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19358 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19359
19360 // If there was a non-zero offset that we didn't fold, create an explicit
19361 // addition for it.
19362 if (Offset != 0)
19363 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19364 DAG.getSignedConstant(Offset, dl, PtrVT));
19365
19366 return Result;
19367}
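// Illustrative sketch of the resulting DAG for a GOT-based reference in
// 32-bit ELF PIC code (hypothetical global @g, assumed MO_GOT flags):
//   t0 = X86ISD::Wrapper TargetGlobalAddress:@g
//   t1 = add GlobalBaseReg, t0
//   t2 = load t1            ; stub load, since isGlobalStubReference()
// A non-zero offset that could not be folded would then be added to t2 as a
// final ADD.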
19368
19369SDValue
19370X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19371 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19372}
19373
19374static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19375 const EVT PtrVT, unsigned ReturnReg,
19376 unsigned char OperandFlags,
19377 bool LoadGlobalBaseReg = false,
19378 bool LocalDynamic = false) {
19379 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19380 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19381 SDLoc dl(GA);
19382 SDValue TGA;
19383 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19384 SDValue Chain = DAG.getEntryNode();
19385 SDValue Ret;
19386 if (LocalDynamic && UseTLSDESC) {
19387 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19388 // Reuse existing GetTLSADDR node if we can find it.
19389 if (TGA->hasOneUse()) {
19390 // TLSDESC uses TGA.
19391 SDNode *TLSDescOp = *TGA->user_begin();
19392 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19393 "Unexpected TLSDESC DAG");
19394 // CALLSEQ_END uses TGA via a chain and glue.
19395 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19396 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19397 "Unexpected TLSDESC DAG");
19398 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19399 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19400 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19401 "Unexpected TLSDESC DAG");
19402 Ret = SDValue(CopyFromRegOp, 0);
19403 }
19404 } else {
19405 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19406 GA->getOffset(), OperandFlags);
19407 }
19408
19409 if (!Ret) {
19410 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19411 : LocalDynamic ? X86ISD::TLSBASEADDR
19412 : X86ISD::TLSADDR;
19413
19414 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19415 if (LoadGlobalBaseReg) {
19416 SDValue InGlue;
19417 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19418 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19419 InGlue);
19420 InGlue = Chain.getValue(1);
19421 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19422 } else {
19423 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19424 }
19425 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19426
19427 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19428 MFI.setHasCalls(true);
19429
19430 SDValue Glue = Chain.getValue(1);
19431 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19432 }
19433
19434 if (!UseTLSDESC)
19435 return Ret;
19436
19437 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19438 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19439
19440 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19441 SDValue Offset =
19442 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19443 MachinePointerInfo(Ptr));
19444 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19445}
19446
19447// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19448 static SDValue
19449LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19450 const EVT PtrVT) {
19451 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19452 /*LoadGlobalBaseReg=*/true);
19453}
19454
19455// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19456 static SDValue
19457LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19458 const EVT PtrVT) {
19459 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19460}
19461
19462// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19463 static SDValue
19464LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19465 const EVT PtrVT) {
19466 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19467}
19468
19469static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19470 SelectionDAG &DAG, const EVT PtrVT,
19471 bool Is64Bit, bool Is64BitLP64) {
19472 SDLoc dl(GA);
19473
19474 // Get the start address of the TLS block for this module.
19475 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19476 .getInfo<X86MachineFunctionInfo>();
19477 MFI->incNumLocalDynamicTLSAccesses();
19478
19479 SDValue Base;
19480 if (Is64Bit) {
19481 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19482 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19483 /*LoadGlobalBaseReg=*/false,
19484 /*LocalDynamic=*/true);
19485 } else {
19486 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19487 /*LoadGlobalBaseReg=*/true,
19488 /*LocalDynamic=*/true);
19489 }
19490
19491 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19492 // of Base.
19493
19494 // Build x@dtpoff.
19495 unsigned char OperandFlags = X86II::MO_DTPOFF;
19496 unsigned WrapperKind = X86ISD::Wrapper;
19497 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19498 GA->getValueType(0),
19499 GA->getOffset(), OperandFlags);
19500 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19501
19502 // Add x@dtpoff with the base.
19503 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19504}
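// Illustrative summary (not from the upstream source): with local-dynamic
// TLS the final address is
//   add (GetTLSADDR result for the module base), Wrapper(x@DTPOFF)
// so several TLS variables in the same module can reuse one base
// computation; CleanupLocalDynamicTLSPass later removes the redundant base
// calls.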
19505
19506// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19507static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19508 const EVT PtrVT, TLSModel::Model model,
19509 bool is64Bit, bool isPIC) {
19510 SDLoc dl(GA);
19511
19512 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19513 Value *Ptr = Constant::getNullValue(
19514 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19515
19516 SDValue ThreadPointer =
19517 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19518 MachinePointerInfo(Ptr));
19519
19520 unsigned char OperandFlags = 0;
19521 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19522 // initialexec.
19523 unsigned WrapperKind = X86ISD::Wrapper;
19524 if (model == TLSModel::LocalExec) {
19525 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19526 } else if (model == TLSModel::InitialExec) {
19527 if (is64Bit) {
19528 OperandFlags = X86II::MO_GOTTPOFF;
19529 WrapperKind = X86ISD::WrapperRIP;
19530 } else {
19531 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19532 }
19533 } else {
19534 llvm_unreachable("Unexpected model");
19535 }
19536
19537 // emit "addl x@ntpoff,%eax" (local exec)
19538 // or "addl x@indntpoff,%eax" (initial exec)
19539 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19540 SDValue TGA =
19541 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19542 GA->getOffset(), OperandFlags);
19543 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19544
19545 if (model == TLSModel::InitialExec) {
19546 if (isPIC && !is64Bit) {
19547 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19548 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19549 Offset);
19550 }
19551
19552 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19553 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19554 }
19555
19556 // The address of the thread local variable is the add of the thread
19557 // pointer with the offset of the variable.
19558 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19559}
19560
19561SDValue
19562X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19563
19564 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19565
19566 if (DAG.getTarget().useEmulatedTLS())
19567 return LowerToTLSEmulatedModel(GA, DAG);
19568
19569 const GlobalValue *GV = GA->getGlobal();
19570 EVT PtrVT = Op.getValueType();
19571 bool PositionIndependent = isPositionIndependent();
19572
19573 if (Subtarget.isTargetELF()) {
19574 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19575 switch (model) {
19576 case TLSModel::GeneralDynamic:
19577 if (Subtarget.is64Bit()) {
19578 if (Subtarget.isTarget64BitLP64())
19579 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19580 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19581 }
19582 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19583 case TLSModel::LocalDynamic:
19584 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19585 Subtarget.isTarget64BitLP64());
19586 case TLSModel::InitialExec:
19587 case TLSModel::LocalExec:
19588 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19589 PositionIndependent);
19590 }
19591 llvm_unreachable("Unknown TLS model.");
19592 }
19593
19594 if (Subtarget.isTargetDarwin()) {
19595 // Darwin only has one model of TLS. Lower to that.
19596 unsigned char OpFlag = 0;
19597 unsigned WrapperKind = 0;
19598
19599 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19600 // global base reg.
19601 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19602 if (PIC32) {
19603 OpFlag = X86II::MO_TLVP_PIC_BASE;
19604 WrapperKind = X86ISD::Wrapper;
19605 } else {
19606 OpFlag = X86II::MO_TLVP;
19607 WrapperKind = X86ISD::WrapperRIP;
19608 }
19609 SDLoc DL(Op);
19610 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19611 GA->getValueType(0),
19612 GA->getOffset(), OpFlag);
19613 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19614
19615 // With PIC32, the address is actually $g + Offset.
19616 if (PIC32)
19617 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19618 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19619 Offset);
19620
19621 // Lowering the machine isd will make sure everything is in the right
19622 // location.
19623 SDValue Chain = DAG.getEntryNode();
19624 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19625 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19626 SDValue Args[] = { Chain, Offset };
19627 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19628 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19629
19630 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19631 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19632 MFI.setAdjustsStack(true);
19633
19634 // And our return value (tls address) is in the standard call return value
19635 // location.
19636 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19637 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19638 }
19639
19640 if (Subtarget.isOSWindows()) {
19641 // Just use the implicit TLS architecture
19642 // Need to generate something similar to:
19643 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19644 // ; from TEB
19645 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19646 // mov rcx, qword [rdx+rcx*8]
19647 // mov eax, .tls$:tlsvar
19648 // [rax+rcx] contains the address
19649 // Windows 64bit: gs:0x58
19650 // Windows 32bit: fs:__tls_array
19651
19652 SDLoc dl(GA);
19653 SDValue Chain = DAG.getEntryNode();
19654
19655 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19656 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19657 // use its literal value of 0x2C.
19658 Value *Ptr = Constant::getNullValue(
19659 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19660 : PointerType::get(*DAG.getContext(), X86AS::FS));
19661
19662 SDValue TlsArray = Subtarget.is64Bit()
19663 ? DAG.getIntPtrConstant(0x58, dl)
19664 : (Subtarget.isTargetWindowsGNU()
19665 ? DAG.getIntPtrConstant(0x2C, dl)
19666 : DAG.getExternalSymbol("_tls_array", PtrVT));
19667
19668 SDValue ThreadPointer =
19669 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19670
19671 SDValue res;
19672 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19673 res = ThreadPointer;
19674 } else {
19675 // Load the _tls_index variable
19676 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19677 if (Subtarget.is64Bit())
19678 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19679 MachinePointerInfo(), MVT::i32);
19680 else
19681 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19682
19683 const DataLayout &DL = DAG.getDataLayout();
19684 SDValue Scale =
19685 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19686 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19687
19688 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19689 }
19690
19691 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19692
19693 // Get the offset of start of .tls section
19694 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19695 GA->getValueType(0),
19696 X86II::MO_SECREL);
19697 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19698
19699 // The address of the thread local variable is the add of the thread
19700 // pointer with the offset of the variable.
19701 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19702 }
19703
19704 llvm_unreachable("TLS not implemented for this target.");
19705}
19706
19707bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19708 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19709 const TargetMachine &TM = getTargetMachine();
19710 TLSModel::Model Model = TM.getTLSModel(&GV);
19711 switch (Model) {
19712 case TLSModel::LocalExec:
19713 case TLSModel::InitialExec:
19714 // We can include the %fs segment register in addressing modes.
19715 return true;
19716 case TLSModel::GeneralDynamic:
19717 case TLSModel::LocalDynamic:
19718 // These models do not result in %fs relative addresses unless
19719 // TLS descriptors are used.
19720 //
19721 // Even in the case of TLS descriptors we currently have no way to model
19722 // the difference between %fs access and the computations needed for the
19723 // offset and returning `true` for TLS-desc currently duplicates both
19724 // which is detrimental :-/
19725 return false;
19726 }
19727 }
19728 return false;
19729}
19730
19731/// Lower SRA_PARTS and friends, which return two i32 values
19732/// and take a 2 x i32 value to shift plus a shift amount.
19733/// TODO: Can this be moved to general expansion code?
19734static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19735 SDValue Lo, Hi;
19736 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19737 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19738}
19739
19740// Try to use a packed vector operation to handle i64 on 32-bit targets when
19741// AVX512DQ is enabled.
19742static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19743 SelectionDAG &DAG,
19744 const X86Subtarget &Subtarget) {
19745 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19746 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19747 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19748 Op.getOpcode() == ISD::UINT_TO_FP) &&
19749 "Unexpected opcode!");
19750 bool IsStrict = Op->isStrictFPOpcode();
19751 unsigned OpNo = IsStrict ? 1 : 0;
19752 SDValue Src = Op.getOperand(OpNo);
19753 MVT SrcVT = Src.getSimpleValueType();
19754 MVT VT = Op.getSimpleValueType();
19755
19756 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19757 (VT != MVT::f32 && VT != MVT::f64))
19758 return SDValue();
19759
19760 // Pack the i64 into a vector, do the operation and extract.
19761
19762 // Using 256-bit to ensure result is 128-bits for f32 case.
19763 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19764 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19765 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19766
19767 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19768 if (IsStrict) {
19769 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19770 {Op.getOperand(0), InVec});
19771 SDValue Chain = CvtVec.getValue(1);
19772 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19773 DAG.getVectorIdxConstant(0, dl));
19774 return DAG.getMergeValues({Value, Chain}, dl);
19775 }
19776
19777 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19778
19779 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19780 DAG.getVectorIdxConstant(0, dl));
19781}
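// Illustrative example (assumed target: 32-bit with AVX512DQ but no VLX):
// (f64 (sint_to_fp i64 %x)) is lowered as
//   v = scalar_to_vector v8i64, %x
//   c = sint_to_fp v8f64, v        ; selects to VCVTQQ2PD
//   r = extract_vector_elt c, 0
// avoiding a slow scalar sequence for the 64-bit integer source.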
19782
19783// Try to use a packed vector operation to handle i64 on 32-bit targets.
19784static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19785 const X86Subtarget &Subtarget) {
19786 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19787 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19788 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19789 Op.getOpcode() == ISD::UINT_TO_FP) &&
19790 "Unexpected opcode!");
19791 bool IsStrict = Op->isStrictFPOpcode();
19792 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19793 MVT SrcVT = Src.getSimpleValueType();
19794 MVT VT = Op.getSimpleValueType();
19795
19796 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19797 return SDValue();
19798
19799 // Pack the i64 into a vector, do the operation and extract.
19800
19801 assert(Subtarget.hasFP16() && "Expected FP16");
19802
19803 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19804 if (IsStrict) {
19805 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19806 {Op.getOperand(0), InVec});
19807 SDValue Chain = CvtVec.getValue(1);
19808 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19809 DAG.getVectorIdxConstant(0, dl));
19810 return DAG.getMergeValues({Value, Chain}, dl);
19811 }
19812
19813 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19814
19815 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19816 DAG.getVectorIdxConstant(0, dl));
19817}
19818
19819static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19820 const X86Subtarget &Subtarget) {
19821 switch (Opcode) {
19822 case ISD::SINT_TO_FP:
19823 // TODO: Handle wider types with AVX/AVX512.
19824 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19825 return false;
19826 // CVTDQ2PS or (V)CVTDQ2PD
19827 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19828
19829 case ISD::UINT_TO_FP:
19830 // TODO: Handle wider types and i64 elements.
19831 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19832 return false;
19833 // VCVTUDQ2PS or VCVTUDQ2PD
19834 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19835
19836 default:
19837 return false;
19838 }
19839}
19840
19841/// Given a scalar cast operation that is extracted from a vector, try to
19842/// vectorize the cast op followed by extraction. This will avoid an expensive
19843/// round-trip between XMM and GPR.
19844static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19845 SelectionDAG &DAG,
19846 const X86Subtarget &Subtarget) {
19847 // TODO: This could be enhanced to handle smaller integer types by peeking
19848 // through an extend.
19849 SDValue Extract = Cast.getOperand(0);
19850 MVT DestVT = Cast.getSimpleValueType();
19851 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19852 !isa<ConstantSDNode>(Extract.getOperand(1)))
19853 return SDValue();
19854
19855 // See if we have a 128-bit vector cast op for this type of cast.
19856 SDValue VecOp = Extract.getOperand(0);
19857 MVT FromVT = VecOp.getSimpleValueType();
19858 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19859 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19860 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19861 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19862 return SDValue();
19863
19864 // If we are extracting from a non-zero element, first shuffle the source
19865 // vector to allow extracting from element zero.
19866 if (!isNullConstant(Extract.getOperand(1))) {
19867 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19868 Mask[0] = Extract.getConstantOperandVal(1);
19869 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19870 }
19871 // If the source vector is wider than 128-bits, extract the low part. Do not
19872 // create an unnecessarily wide vector cast op.
19873 if (FromVT != Vec128VT)
19874 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19875
19876 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19877 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19878 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19879 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19880 DAG.getVectorIdxConstant(0, DL));
19881}
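// Illustrative example (hypothetical IR, SSE2 assumed):
//   %e = extractelement <4 x i32> %v, i32 2
//   %f = sitofp i32 %e to float
// becomes a shuffle of element 2 into lane 0, a vector CVTDQ2PS, and an
// extract of lane 0, keeping the value in XMM registers instead of bouncing
// through a GPR.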
19882
19883/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19884/// try to vectorize the cast ops. This will avoid an expensive round-trip
19885/// between XMM and GPR.
19886static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19887 SelectionDAG &DAG,
19888 const X86Subtarget &Subtarget) {
19889 // TODO: Allow FP_TO_UINT.
19890 SDValue CastToInt = CastToFP.getOperand(0);
19891 MVT VT = CastToFP.getSimpleValueType();
19892 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19893 return SDValue();
19894
19895 MVT IntVT = CastToInt.getSimpleValueType();
19896 SDValue X = CastToInt.getOperand(0);
19897 MVT SrcVT = X.getSimpleValueType();
19898 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19899 return SDValue();
19900
19901 // See if we have 128-bit vector cast instructions for this type of cast.
19902 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19903 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19904 IntVT != MVT::i32)
19905 return SDValue();
19906
19907 unsigned SrcSize = SrcVT.getSizeInBits();
19908 unsigned IntSize = IntVT.getSizeInBits();
19909 unsigned VTSize = VT.getSizeInBits();
19910 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19911 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19912 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19913
19914 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19915 unsigned ToIntOpcode =
19916 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19917 unsigned ToFPOpcode =
19918 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19919
19920 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19921 //
19922 // We are not defining the high elements (for example, zero them) because
19923 // that could nullify any performance advantage that we hoped to gain from
19924 // this vector op hack. We do not expect any adverse effects (like denorm
19925 // penalties) with cast ops.
19926 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19927 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19928 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19929 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19930 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19931}
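// Illustrative example (hypothetical scalar code, SSE2 assumed): for
// double->int->double, i.e. (f64 (sint_to_fp (fp_to_sint f64 %x))), the
// sizes differ (64 -> 32 -> 64), so the target opcodes are used:
//   v = scalar_to_vector v2f64, %x
//   i = X86ISD::CVTTP2SI v4i32, v                          ; cvttpd2dq
//   r = extract_vector_elt (X86ISD::CVTSI2P v2f64, i), 0   ; cvtdq2pd
// For float->int->float all three sizes are 32 bits and the generic
// FP_TO_SINT / SINT_TO_FP vector opcodes suffice.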
19932
19933static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19934 SelectionDAG &DAG,
19935 const X86Subtarget &Subtarget) {
19936 bool IsStrict = Op->isStrictFPOpcode();
19937 MVT VT = Op->getSimpleValueType(0);
19938 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19939
19940 if (Subtarget.hasDQI()) {
19941 assert(!Subtarget.hasVLX() && "Unexpected features");
19942
19943 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19944 Src.getSimpleValueType() == MVT::v4i64) &&
19945 "Unsupported custom type");
19946
19947 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19948 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19949 "Unexpected VT!");
19950 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19951
19952 // Need to concat with zero vector for strict fp to avoid spurious
19953 // exceptions.
19954 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19955 : DAG.getUNDEF(MVT::v8i64);
19956 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19957 DAG.getVectorIdxConstant(0, DL));
19958 SDValue Res, Chain;
19959 if (IsStrict) {
19960 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19961 {Op->getOperand(0), Src});
19962 Chain = Res.getValue(1);
19963 } else {
19964 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19965 }
19966
19967 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19968 DAG.getVectorIdxConstant(0, DL));
19969
19970 if (IsStrict)
19971 return DAG.getMergeValues({Res, Chain}, DL);
19972 return Res;
19973 }
19974
19975 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19976 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19977 if (VT != MVT::v4f32 || IsSigned)
19978 return SDValue();
19979
19980 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19981 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19982 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19983 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19984 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19985 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19986 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19987 SmallVector<SDValue, 4> SignCvts(4);
19988 SmallVector<SDValue, 4> Chains(4);
19989 for (int i = 0; i != 4; ++i) {
19990 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19991 DAG.getVectorIdxConstant(i, DL));
19992 if (IsStrict) {
19993 SignCvts[i] =
19994 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19995 {Op.getOperand(0), Elt});
19996 Chains[i] = SignCvts[i].getValue(1);
19997 } else {
19998 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19999 }
20000 }
20001 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20002
20003 SDValue Slow, Chain;
20004 if (IsStrict) {
20005 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20006 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20007 {Chain, SignCvt, SignCvt});
20008 Chain = Slow.getValue(1);
20009 } else {
20010 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20011 }
20012
20013 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20014 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20015
20016 if (IsStrict)
20017 return DAG.getMergeValues({Cvt, Chain}, DL);
20018
20019 return Cvt;
20020}
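// Worked example for the unsigned v4i64 -> v4f32 path above (illustrative):
// for an element with the MSB set, e.g. x = 0x8000000000000001, the value
// (x >> 1) | (x & 1) = 0x4000000000000001 fits in a signed i64; it is
// converted element-by-element with SINT_TO_FP and then doubled with an
// FADD, and the "round to odd" OR of the low bit keeps the final rounding
// correct. Elements without the MSB set use the direct signed conversion.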
20021
20022static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20023 SelectionDAG &DAG) {
20024 bool IsStrict = Op->isStrictFPOpcode();
20025 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20026 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20027 MVT VT = Op.getSimpleValueType();
20028 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20029
20030 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20031 if (IsStrict)
20032 return DAG.getNode(
20033 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20034 {Chain,
20035 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20036 Rnd});
20037 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20038 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20039}
20040
20041static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20042 const X86Subtarget &Subtarget) {
20043 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20044 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20045 return true;
20046 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20047 return true;
20048 }
20049 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20050 return true;
20051 if (Subtarget.useAVX512Regs()) {
20052 if (VT == MVT::v16i32)
20053 return true;
20054 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20055 return true;
20056 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20057 return true;
20058 }
20059 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20060 (VT == MVT::v2i64 || VT == MVT::v4i64))
20061 return true;
20062 return false;
20063}
20064
20065SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20066 SelectionDAG &DAG) const {
20067 bool IsStrict = Op->isStrictFPOpcode();
20068 unsigned OpNo = IsStrict ? 1 : 0;
20069 SDValue Src = Op.getOperand(OpNo);
20070 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20071 MVT SrcVT = Src.getSimpleValueType();
20072 MVT VT = Op.getSimpleValueType();
20073 SDLoc dl(Op);
20074
20075 if (isSoftF16(VT, Subtarget))
20076 return promoteXINT_TO_FP(Op, dl, DAG);
20077 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20078 return Op;
20079
20080 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20081 return LowerWin64_INT128_TO_FP(Op, DAG);
20082
20083 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20084 return Extract;
20085
20086 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20087 return R;
20088
20089 if (SrcVT.isVector()) {
20090 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20091 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20092 // source for strict FP.
20093 if (IsStrict)
20094 return DAG.getNode(
20095 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20096 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20097 DAG.getUNDEF(SrcVT))});
20098 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20099 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20100 DAG.getUNDEF(SrcVT)));
20101 }
20102 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20103 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20104
20105 return SDValue();
20106 }
20107
20108 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20109 "Unknown SINT_TO_FP to lower!");
20110
20111 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20112
20113 // These are really Legal; return the operand so the caller accepts it as
20114 // Legal.
20115 if (SrcVT == MVT::i32 && UseSSEReg)
20116 return Op;
20117 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20118 return Op;
20119
20120 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20121 return V;
20122 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20123 return V;
20124
20125 // SSE doesn't have an i16 conversion so we need to promote.
20126 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20127 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20128 if (IsStrict)
20129 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20130 {Chain, Ext});
20131
20132 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20133 }
20134
20135 if (VT == MVT::f128 || !Subtarget.hasX87())
20136 return SDValue();
20137
20138 SDValue ValueToStore = Src;
20139 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20140 // Bitcasting to f64 here allows us to do a single 64-bit store from
20141 // an SSE register, avoiding the store forwarding penalty that would come
20142 // with two 32-bit stores.
20143 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20144
20145 unsigned Size = SrcVT.getStoreSize();
20146 Align Alignment(Size);
20147 MachineFunction &MF = DAG.getMachineFunction();
20148 auto PtrVT = getPointerTy(MF.getDataLayout());
20149 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20150 MachinePointerInfo MPI =
20151 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20152 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20153 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20154 std::pair<SDValue, SDValue> Tmp =
20155 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20156
20157 if (IsStrict)
20158 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20159
20160 return Tmp.first;
20161}
20162
20163std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20164 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20165 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20166 // Build the FILD
20167 SDVTList Tys;
20168 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20169 if (useSSE)
20170 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20171 else
20172 Tys = DAG.getVTList(DstVT, MVT::Other);
20173
20174 SDValue FILDOps[] = {Chain, Pointer};
20175 SDValue Result =
20176 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20177 Alignment, MachineMemOperand::MOLoad);
20178 Chain = Result.getValue(1);
20179
20180 if (useSSE) {
20181 MachineFunction &MF = DAG.getMachineFunction();
20182 unsigned SSFISize = DstVT.getStoreSize();
20183 int SSFI =
20184 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20185 auto PtrVT = getPointerTy(MF.getDataLayout());
20186 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20187 Tys = DAG.getVTList(MVT::Other);
20188 SDValue FSTOps[] = {Chain, Result, StackSlot};
20189 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20190 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20191 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20192
20193 Chain =
20194 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20195 Result = DAG.getLoad(
20196 DstVT, DL, Chain, StackSlot,
20197 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20198 Chain = Result.getValue(1);
20199 }
20200
20201 return { Result, Chain };
20202}
20203
20204/// Horizontal vector math instructions may be slower than normal math with
20205/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20206/// implementation, and likely shuffle complexity of the alternate sequence.
20207static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20208 const X86Subtarget &Subtarget) {
20209 bool IsOptimizingSize = DAG.shouldOptForSize();
20210 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20211 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20212}
20213
20214/// 64-bit unsigned integer to double expansion.
20215static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20216 SelectionDAG &DAG,
20217 const X86Subtarget &Subtarget) {
20218 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20219 // when converting 0 when rounding toward negative infinity. Caller will
20220 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20221 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20222 // This algorithm is not obvious. Here it is what we're trying to output:
20223 /*
20224 movq %rax, %xmm0
20225 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20226 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20227 #ifdef __SSE3__
20228 haddpd %xmm0, %xmm0
20229 #else
20230 pshufd $0x4e, %xmm0, %xmm1
20231 addpd %xmm1, %xmm0
20232 #endif
20233 */
20234
20235 LLVMContext *Context = DAG.getContext();
20236
20237 // Build some magic constants.
20238 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20239 Constant *C0 = ConstantDataVector::get(*Context, ArrayRef(CV0));
20240 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20241 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20242
20243 SmallVector<Constant*,2> CV1;
20244 CV1.push_back(
20245 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20246 APInt(64, 0x4330000000000000ULL))));
20247 CV1.push_back(
20248 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20249 APInt(64, 0x4530000000000000ULL))));
20250 Constant *C1 = ConstantVector::get(CV1);
20251 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20252
20253 // Load the 64-bit value into an XMM register.
20254 SDValue XR1 =
20255 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20256 SDValue CLod0 = DAG.getLoad(
20257 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20259 SDValue Unpck1 =
20260 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20261
20262 SDValue CLod1 = DAG.getLoad(
20263 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20264 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20265 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20266 // TODO: Are there any fast-math-flags to propagate here?
20267 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20268 SDValue Result;
20269
20270 if (Subtarget.hasSSE3() &&
20271 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20272 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20273 } else {
20274 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20275 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20276 }
20277 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20278 DAG.getVectorIdxConstant(0, dl));
20279 return Result;
20280}
20281
20282/// 32-bit unsigned integer to float expansion.
20283 static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20284 SelectionDAG &DAG,
20285 const X86Subtarget &Subtarget) {
20286 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20287 // FP constant to bias correct the final result.
20288 SDValue Bias = DAG.getConstantFP(
20289 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20290
20291 // Load the 32-bit value into an XMM register.
20292 SDValue Load =
20293 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20294
20295 // Zero out the upper parts of the register.
20296 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20297
20298 // Or the load with the bias.
20299 SDValue Or = DAG.getNode(
20300 ISD::OR, dl, MVT::v2i64,
20301 DAG.getBitcast(MVT::v2i64, Load),
20302 DAG.getBitcast(MVT::v2i64,
20303 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20304 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20305 DAG.getBitcast(MVT::v2f64, Or),
20306 DAG.getVectorIdxConstant(0, dl));
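// Editorial illustration (not from the source): the scalar equivalent of the
// OR/SUB sequence here, written with a hypothetical bit_cast helper, is
//   (double)u == bit_cast<double>(0x4330000000000000ULL | (uint64_t)u) - 0x1.0p52
// since ORing the 32-bit value into the mantissa of 2^52 yields exactly 2^52 + u.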
20307
20308 if (Op.getNode()->isStrictFPOpcode()) {
20309 // Subtract the bias.
20310 // TODO: Are there any fast-math-flags to propagate here?
20311 SDValue Chain = Op.getOperand(0);
20312 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20313 {Chain, Or, Bias});
20314
20315 if (Op.getValueType() == Sub.getValueType())
20316 return Sub;
20317
20318 // Handle final rounding.
20319 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20320 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20321
20322 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20323 }
20324
20325 // Subtract the bias.
20326 // TODO: Are there any fast-math-flags to propagate here?
20327 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20328
20329 // Handle final rounding.
20330 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20331}
20332
20333 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20334 SelectionDAG &DAG,
20335 const X86Subtarget &Subtarget) {
20336 if (Op.getSimpleValueType() != MVT::v2f64)
20337 return SDValue();
20338
20339 bool IsStrict = Op->isStrictFPOpcode();
20340
20341 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20342 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20343
20344 if (Subtarget.hasAVX512()) {
20345 if (!Subtarget.hasVLX()) {
20346 // Let generic type legalization widen this.
20347 if (!IsStrict)
20348 return SDValue();
20349 // Otherwise pad the integer input with 0s and widen the operation.
20350 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20351 DAG.getConstant(0, DL, MVT::v2i32));
20352 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20353 {Op.getOperand(0), N0});
20354 SDValue Chain = Res.getValue(1);
20355 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20356 DAG.getVectorIdxConstant(0, DL));
20357 return DAG.getMergeValues({Res, Chain}, DL);
20358 }
20359
20360 // Legalize to v4i32 type.
20361 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20362 DAG.getUNDEF(MVT::v2i32));
20363 if (IsStrict)
20364 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20365 {Op.getOperand(0), N0});
20366 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20367 }
20368
20369 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20370 // This gives us the floating point equivalent of 2^52 + the i32 integer
20371 // since double has 52 bits of mantissa. Then subtract 2^52 in floating
20372 // point, leaving just our i32 integers in double format.
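// Editorial worked example (not from the source): for N0 = <7, 9>, ZExtIn is
// <7, 9> as v2i64; ORing with the bit pattern of 2^52 produces the doubles
// <2^52 + 7, 2^52 + 9>, and subtracting VBias = 2^52 leaves exactly <7.0, 9.0>.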
20373 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20374 SDValue VBias = DAG.getConstantFP(
20375 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20376 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20377 DAG.getBitcast(MVT::v2i64, VBias));
20378 Or = DAG.getBitcast(MVT::v2f64, Or);
20379
20380 if (IsStrict)
20381 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20382 {Op.getOperand(0), Or, VBias});
20383 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20384}
20385
20386 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20387 SelectionDAG &DAG,
20388 const X86Subtarget &Subtarget) {
20389 bool IsStrict = Op->isStrictFPOpcode();
20390 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20391 MVT VecIntVT = V.getSimpleValueType();
20392 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20393 "Unsupported custom type");
20394
20395 if (Subtarget.hasAVX512()) {
20396 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20397 assert(!Subtarget.hasVLX() && "Unexpected features");
20398 MVT VT = Op->getSimpleValueType(0);
20399
20400 // v8i32->v8f64 is legal with AVX512 so just return it.
20401 if (VT == MVT::v8f64)
20402 return Op;
20403
20404 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20405 VT == MVT::v8f16) &&
20406 "Unexpected VT!");
20407 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20408 MVT WideIntVT = MVT::v16i32;
20409 if (VT == MVT::v4f64) {
20410 WideVT = MVT::v8f64;
20411 WideIntVT = MVT::v8i32;
20412 }
20413
20414 // Need to concat with zero vector for strict fp to avoid spurious
20415 // exceptions.
20416 SDValue Tmp =
20417 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20418 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20419 DAG.getVectorIdxConstant(0, DL));
20420 SDValue Res, Chain;
20421 if (IsStrict) {
20422 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20423 {Op->getOperand(0), V});
20424 Chain = Res.getValue(1);
20425 } else {
20426 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20427 }
20428
20429 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20430 DAG.getVectorIdxConstant(0, DL));
20431
20432 if (IsStrict)
20433 return DAG.getMergeValues({Res, Chain}, DL);
20434 return Res;
20435 }
20436
20437 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20438 Op->getSimpleValueType(0) == MVT::v4f64) {
20439 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20440 Constant *Bias = ConstantFP::get(
20441 *DAG.getContext(),
20442 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20443 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20444 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20445 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20446 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20447 SDValue VBias = DAG.getMemIntrinsicNode(
20448 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20449 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20450 MachineMemOperand::MOLoad);
20451
20452 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20453 DAG.getBitcast(MVT::v4i64, VBias));
20454 Or = DAG.getBitcast(MVT::v4f64, Or);
20455
20456 if (IsStrict)
20457 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20458 {Op.getOperand(0), Or, VBias});
20459 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20460 }
20461
20462 // The algorithm is the following:
20463 // #ifdef __SSE4_1__
20464 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20465 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20466 // (uint4) 0x53000000, 0xaa);
20467 // #else
20468 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20469 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20470 // #endif
20471 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20472 // return (float4) lo + fhi;
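// Editorial note (not from the source) on why the constants work: with
// lo = v & 0xffff and hi = v >> 16, planting lo in the mantissa of 0x4b000000
// (2^23) gives the float 2^23 + lo, and planting hi in 0x53000000 (2^39) gives
// 2^39 + hi * 2^16. Then fhi = (2^39 + hi*2^16) - (2^39 + 2^23) = hi*2^16 - 2^23,
// so lo_float + fhi = (2^23 + lo) + (hi*2^16 - 2^23) = hi*2^16 + lo = v.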
20473
20474 bool Is128 = VecIntVT == MVT::v4i32;
20475 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20476 // If we convert to something other than the supported type, e.g., to v4f64,
20477 // abort early.
20478 if (VecFloatVT != Op->getSimpleValueType(0))
20479 return SDValue();
20480
20481 // In the #ifdef/#else code, we have in common:
20482 // - The vector of constants:
20483 // -- 0x4b000000
20484 // -- 0x53000000
20485 // - A shift:
20486 // -- v >> 16
20487
20488 // Create the splat vector for 0x4b000000.
20489 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20490 // Create the splat vector for 0x53000000.
20491 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20492
20493 // Create the right shift.
20494 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20495 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20496
20497 SDValue Low, High;
20498 if (Subtarget.hasSSE41()) {
20499 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20500 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20501 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20502 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20503 // Low will be bitcasted right away, so do not bother bitcasting back to its
20504 // original type.
20505 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20506 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20507 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20508 // (uint4) 0x53000000, 0xaa);
20509 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20510 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20511 // High will be bitcasted right away, so do not bother bitcasting back to
20512 // its original type.
20513 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20514 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20515 } else {
20516 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20517 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20518 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20519 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20520
20521 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20522 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20523 }
20524
20525 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20526 SDValue VecCstFSub = DAG.getConstantFP(
20527 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
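// Editorial note (not from the source): 0x53000080 encodes 2^39 * (1 + 2^-16),
// i.e. exactly 0x1.0p39f + 0x1.0p23f, matching the comment above.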
20528
20529 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20530 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20531 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20532 // enabled. See PR24512.
20533 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20534 // TODO: Are there any fast-math-flags to propagate here?
20535 // (float4) lo;
20536 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20537 // return (float4) lo + fhi;
20538 if (IsStrict) {
20539 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20540 {Op.getOperand(0), HighBitcast, VecCstFSub});
20541 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20542 {FHigh.getValue(1), LowBitcast, FHigh});
20543 }
20544
20545 SDValue FHigh =
20546 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20547 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20548}
20549
20550 static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20551 const X86Subtarget &Subtarget) {
20552 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20553 SDValue N0 = Op.getOperand(OpNo);
20554 MVT SrcVT = N0.getSimpleValueType();
20555
20556 switch (SrcVT.SimpleTy) {
20557 default:
20558 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20559 case MVT::v2i32:
20560 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20561 case MVT::v4i32:
20562 case MVT::v8i32:
20563 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20564 case MVT::v2i64:
20565 case MVT::v4i64:
20566 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20567 }
20568}
20569
20570SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20571 SelectionDAG &DAG) const {
20572 bool IsStrict = Op->isStrictFPOpcode();
20573 unsigned OpNo = IsStrict ? 1 : 0;
20574 SDValue Src = Op.getOperand(OpNo);
20575 SDLoc dl(Op);
20576 auto PtrVT = getPointerTy(DAG.getDataLayout());
20577 MVT SrcVT = Src.getSimpleValueType();
20578 MVT DstVT = Op->getSimpleValueType(0);
20579 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20580
20581 // Bail out when we don't have native conversion instructions.
20582 if (DstVT == MVT::f128)
20583 return SDValue();
20584
20585 if (isSoftF16(DstVT, Subtarget))
20586 return promoteXINT_TO_FP(Op, dl, DAG);
20587 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20588 return Op;
20589
20590 if (DstVT.isVector())
20591 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20592
20593 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20594 return LowerWin64_INT128_TO_FP(Op, DAG);
20595
20596 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20597 return Extract;
20598
20599 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20600 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20601 // Conversions from unsigned i32 to f32/f64 are legal,
20602 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20603 return Op;
20604 }
20605
20606 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20607 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20608 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20609 if (IsStrict)
20610 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20611 {Chain, Src});
20612 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20613 }
20614
20615 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20616 return V;
20617 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20618 return V;
20619
20620 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20621 // infinity. It produces -0.0, so disable under strictfp.
20622 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20623 !IsStrict)
20624 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20625 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20626 // negative infinity, so disable it under strictfp and use FILD instead.
20627 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20628 !IsStrict)
20629 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20630 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20631 (DstVT == MVT::f32 || DstVT == MVT::f64))
20632 return SDValue();
20633
20634 // Make a 64-bit buffer, and use it to build an FILD.
20635 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20636 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20637 Align SlotAlign(8);
20638 MachinePointerInfo MPI =
20639 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20640 if (SrcVT == MVT::i32) {
20641 SDValue OffsetSlot =
20642 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20643 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20644 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20645 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20646 std::pair<SDValue, SDValue> Tmp =
20647 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20648 if (IsStrict)
20649 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20650
20651 return Tmp.first;
20652 }
20653
20654 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20655 SDValue ValueToStore = Src;
20656 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20657 // Bitcasting to f64 here allows us to do a single 64-bit store from
20658 // an SSE register, avoiding the store forwarding penalty that would come
20659 // with two 32-bit stores.
20660 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20661 }
20662 SDValue Store =
20663 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20664 // For i64 source, we need to add the appropriate power of 2 if the input
20665 // was negative. We must be careful to do the computation in x87 extended
20666 // precision, not in SSE.
20667 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20668 SDValue Ops[] = {Store, StackSlot};
20669 SDValue Fild =
20670 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20671 SlotAlign, MachineMemOperand::MOLoad);
20672 Chain = Fild.getValue(1);
20673
20674 // Check whether the sign bit is set.
20675 SDValue SignSet = DAG.getSetCC(
20676 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20677 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20678
20679 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20680 APInt FF(64, 0x5F80000000000000ULL);
20681 SDValue FudgePtr =
20682 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20683 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20684
20685 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20686 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20687 SDValue Four = DAG.getIntPtrConstant(4, dl);
20688 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20689 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20690
20691 // Load the value out, extending it from f32 to f80.
20692 SDValue Fudge = DAG.getExtLoad(
20693 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20694 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20695 CPAlignment);
20696 Chain = Fudge.getValue(1);
20697 // Extend everything to 80 bits to force it to be done on x87.
20698 // TODO: Are there any fast-math-flags to propagate here?
20699 if (IsStrict) {
20700 unsigned Opc = ISD::STRICT_FADD;
20701 // Windows needs the precision control changed to 80bits around this add.
20702 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20703 Opc = X86ISD::STRICT_FP80_ADD;
20704
20705 SDValue Add =
20706 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20707 // STRICT_FP_ROUND can't handle equal types.
20708 if (DstVT == MVT::f80)
20709 return Add;
20710 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20711 {Add.getValue(1), Add,
20712 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20713 }
20714 unsigned Opc = ISD::FADD;
20715 // Windows needs the precision control changed to 80bits around this add.
20716 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20717 Opc = X86ISD::FP80_ADD;
20718
20719 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20720 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20721 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20722}
20723
20724// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20725// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20726// just return an SDValue().
20727// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20728// to i16, i32 or i64, and we lower it to a legal sequence and return the
20729// result.
20730SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20731 bool IsSigned,
20732 SDValue &Chain) const {
20733 bool IsStrict = Op->isStrictFPOpcode();
20734 SDLoc DL(Op);
20735
20736 EVT DstTy = Op.getValueType();
20737 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20738 EVT TheVT = Value.getValueType();
20739 auto PtrVT = getPointerTy(DAG.getDataLayout());
20740
20741 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20742 // f16 must be promoted before using the lowering in this routine.
20743 // fp128 does not use this lowering.
20744 return SDValue();
20745 }
20746
20747 // If using FIST to compute an unsigned i64, we'll need some fixup
20748 // to handle values above the maximum signed i64. A FIST is always
20749 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20750 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20751
20752 // FIXME: This does not generate an invalid exception if the input does not
20753 // fit in i32. PR44019
20754 if (!IsSigned && DstTy != MVT::i64) {
20755 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20756 // The low 32 bits of the fist result will have the correct uint32 result.
20757 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20758 DstTy = MVT::i64;
20759 }
20760
20761 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20762 DstTy.getSimpleVT() >= MVT::i16 &&
20763 "Unknown FP_TO_INT to lower!");
20764
20765 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20766 // stack slot.
20767 MachineFunction &MF = DAG.getMachineFunction();
20768 unsigned MemSize = DstTy.getStoreSize();
20769 int SSFI =
20770 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20771 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20772
20773 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20774
20775 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20776
20777 if (UnsignedFixup) {
20778 //
20779 // Conversion to unsigned i64 is implemented with a select,
20780 // depending on whether the source value fits in the range
20781 // of a signed i64. Let Thresh be the FP equivalent of
20782 // 0x8000000000000000ULL.
20783 //
20784 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0; (integer sign-bit fixup)
20785 // FltOfs = (Value >= Thresh) ? Thresh : 0.0; (FP offset subtracted first)
20786 // FistSrc = (Value - FltOfs);
20787 // Fist-to-mem64 FistSrc
20788 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20789 // to XOR'ing the high 32 bits with Adjust.
20790 //
20791 // Being a power of 2, Thresh is exactly representable in all FP formats.
20792 // For X87 we'd like to use the smallest FP type for this constant, but
20793 // for DAG type consistency we have to match the FP operand type.
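// Editorial worked example (not from the source): converting 2^63 + 1 with an
// f80 source, where that value is exact. It compares >= Thresh, so
// FltOfs = 2^63 and FistSrc = 1; the FIST stores 1, and XOR'ing with
// Adjust = 0x8000000000000000 restores 0x8000000000000001.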
20794
20795 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20796 APFloat::opStatus Status = APFloat::opOK;
20797 bool LosesInfo = false;
20798 if (TheVT == MVT::f64)
20799 // The rounding mode is irrelevant as the conversion should be exact.
20800 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20801 &LosesInfo);
20802 else if (TheVT == MVT::f80)
20803 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20804 APFloat::rmNearestTiesToEven, &LosesInfo);
20805
20806 assert(Status == APFloat::opOK && !LosesInfo &&
20807 "FP conversion should have been exact");
20808
20809 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20810
20811 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20812 *DAG.getContext(), TheVT);
20813 SDValue Cmp;
20814 if (IsStrict) {
20815 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20816 /*IsSignaling*/ true);
20817 Chain = Cmp.getValue(1);
20818 } else {
20819 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20820 }
20821
20822 // Our preferred lowering of
20823 //
20824 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20825 //
20826 // is
20827 //
20828 // (Value >= Thresh) << 63
20829 //
20830 // but since we can get here after LegalOperations, DAGCombine might do the
20831 // wrong thing if we create a select. So, directly create the preferred
20832 // version.
20833 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20834 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20835 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20836
20837 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20838 DAG.getConstantFP(0.0, DL, TheVT));
20839
20840 if (IsStrict) {
20841 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20842 { Chain, Value, FltOfs });
20843 Chain = Value.getValue(1);
20844 } else
20845 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20846 }
20847
20848 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20849
20850 // FIXME This causes a redundant load/store if the SSE-class value is already
20851 // in memory, such as if it is on the callstack.
20852 if (isScalarFPTypeInSSEReg(TheVT)) {
20853 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20854 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20855 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20856 SDValue Ops[] = { Chain, StackSlot };
20857
20858 unsigned FLDSize = TheVT.getStoreSize();
20859 assert(FLDSize <= MemSize && "Stack slot not big enough");
20860 MachineMemOperand *MMO = MF.getMachineMemOperand(
20861 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20862 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20863 Chain = Value.getValue(1);
20864 }
20865
20866 // Build the FP_TO_INT*_IN_MEM
20867 MachineMemOperand *MMO = MF.getMachineMemOperand(
20868 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20869 SDValue Ops[] = { Chain, Value, StackSlot };
20870 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20871 DAG.getVTList(MVT::Other),
20872 Ops, DstTy, MMO);
20873
20874 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20875 Chain = Res.getValue(1);
20876
20877 // If we need an unsigned fixup, XOR the result with adjust.
20878 if (UnsignedFixup)
20879 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20880
20881 return Res;
20882}
20883
20884 static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20885 const X86Subtarget &Subtarget) {
20886 MVT VT = Op.getSimpleValueType();
20887 SDValue In = Op.getOperand(0);
20888 MVT InVT = In.getSimpleValueType();
20889 unsigned Opc = Op.getOpcode();
20890
20891 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20892 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20893 "Unexpected extension opcode");
20894 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20895 "Expected same number of elements");
20896 assert((VT.getVectorElementType() == MVT::i16 ||
20897 VT.getVectorElementType() == MVT::i32 ||
20898 VT.getVectorElementType() == MVT::i64) &&
20899 "Unexpected element type");
20900 assert((InVT.getVectorElementType() == MVT::i8 ||
20901 InVT.getVectorElementType() == MVT::i16 ||
20902 InVT.getVectorElementType() == MVT::i32) &&
20903 "Unexpected element type");
20904
20905 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20906
20907 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20908 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20909 return splitVectorIntUnary(Op, DAG, dl);
20910 }
20911
20912 if (Subtarget.hasInt256())
20913 return Op;
20914
20915 // Optimize vectors in AVX mode:
20916 //
20917 // v8i16 -> v8i32
20918 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20919 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20920 // Concat upper and lower parts.
20921 //
20922 // v4i32 -> v4i64
20923 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20924 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20925 // Concat upper and lower parts.
20926 //
20927 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20928 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20929
20930 // Short-circuit if we can determine that each 128-bit half is the same value.
20931 // Otherwise, this is difficult to match and optimize.
20932 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20933 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20934 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20935
20936 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20937 SDValue Undef = DAG.getUNDEF(InVT);
20938 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20939 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20940 OpHi = DAG.getBitcast(HalfVT, OpHi);
20941
20942 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20943}
20944
20945// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20946static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20947 const SDLoc &dl, SelectionDAG &DAG) {
20948 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20949 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20950 DAG.getVectorIdxConstant(0, dl));
20951 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20952 DAG.getVectorIdxConstant(8, dl));
20953 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20954 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20955 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20956 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20957}
20958
20959 static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20960 const X86Subtarget &Subtarget,
20961 SelectionDAG &DAG) {
20962 MVT VT = Op->getSimpleValueType(0);
20963 SDValue In = Op->getOperand(0);
20964 MVT InVT = In.getSimpleValueType();
20965 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20966 unsigned NumElts = VT.getVectorNumElements();
20967
20968 // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
20969 // avoids a constant pool load.
20970 if (VT.getVectorElementType() != MVT::i8) {
20971 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20972 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20973 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20974 }
20975
20976 // Extend VT if BWI is not supported.
20977 MVT ExtVT = VT;
20978 if (!Subtarget.hasBWI()) {
20979 // If v16i32 is to be avoided, we'll need to split and concatenate.
20980 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20981 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20982
20983 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20984 }
20985
20986 // Widen to 512-bits if VLX is not supported.
20987 MVT WideVT = ExtVT;
20988 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20989 NumElts *= 512 / ExtVT.getSizeInBits();
20990 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20991 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
20992 DAG.getVectorIdxConstant(0, DL));
20993 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
20994 }
20995
20996 SDValue One = DAG.getConstant(1, DL, WideVT);
20997 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20998
20999 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21000
21001 // Truncate if we had to extend above.
21002 if (VT != ExtVT) {
21003 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21004 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21005 }
21006
21007 // Extract back to 128/256-bit if we widened.
21008 if (WideVT != VT)
21009 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21010 DAG.getVectorIdxConstant(0, DL));
21011
21012 return SelectedVal;
21013}
21014
21015 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21016 SelectionDAG &DAG) {
21017 SDValue In = Op.getOperand(0);
21018 MVT SVT = In.getSimpleValueType();
21019 SDLoc DL(Op);
21020
21021 if (SVT.getVectorElementType() == MVT::i1)
21022 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21023
21024 assert(Subtarget.hasAVX() && "Expected AVX support");
21025 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21026}
21027
21028/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21029/// It makes use of the fact that vectors with enough leading sign/zero bits
21030/// prevent the PACKSS/PACKUS from saturating the results.
21031/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21032/// within each 128-bit lane.
21033static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21034 const SDLoc &DL, SelectionDAG &DAG,
21035 const X86Subtarget &Subtarget) {
21036 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21037 "Unexpected PACK opcode");
21038 assert(DstVT.isVector() && "VT not a vector?");
21039
21040 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21041 if (!Subtarget.hasSSE2())
21042 return SDValue();
21043
21044 EVT SrcVT = In.getValueType();
21045
21046 // No truncation required, we might get here due to recursive calls.
21047 if (SrcVT == DstVT)
21048 return In;
21049
21050 unsigned NumElems = SrcVT.getVectorNumElements();
21051 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21052 return SDValue();
21053
21054 unsigned DstSizeInBits = DstVT.getSizeInBits();
21055 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21056 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21057 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21058
21059 LLVMContext &Ctx = *DAG.getContext();
21060 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21061 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21062
21063 // Pack to the largest type possible:
21064 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21065 EVT InVT = MVT::i16, OutVT = MVT::i8;
21066 if (SrcVT.getScalarSizeInBits() > 16 &&
21067 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21068 InVT = MVT::i32;
21069 OutVT = MVT::i16;
21070 }
21071
21072 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21073 // On pre-AVX512, pack the src in both halves to help value tracking.
21074 if (SrcSizeInBits <= 128) {
21075 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21076 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21077 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21078 SDValue LHS = DAG.getBitcast(InVT, In);
21079 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21080 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21081 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21082 Res = DAG.getBitcast(PackedVT, Res);
21083 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21084 }
21085
21086 // Split lower/upper subvectors.
21087 SDValue Lo, Hi;
21088 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21089
21090 // If Hi is undef, then don't bother packing it and widen the result instead.
21091 if (Hi.isUndef()) {
21092 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21093 if (SDValue Res =
21094 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21095 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21096 }
21097
21098 unsigned SubSizeInBits = SrcSizeInBits / 2;
21099 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21100 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21101
21102 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21103 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21104 Lo = DAG.getBitcast(InVT, Lo);
21105 Hi = DAG.getBitcast(InVT, Hi);
21106 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21107 return DAG.getBitcast(DstVT, Res);
21108 }
21109
21110 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21111 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21112 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21113 Lo = DAG.getBitcast(InVT, Lo);
21114 Hi = DAG.getBitcast(InVT, Hi);
21115 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21116
21117 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21118 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21119 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21120 SmallVector<int, 64> Mask;
21121 int Scale = 64 / OutVT.getScalarSizeInBits();
21122 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21123 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21124
21125 if (DstVT.is256BitVector())
21126 return DAG.getBitcast(DstVT, Res);
21127
21128 // If 512bit -> 128bit truncate another stage.
21129 Res = DAG.getBitcast(PackedVT, Res);
21130 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21131 }
21132
21133 // Recursively pack lower/upper subvectors, concat result and pack again.
21134 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21135
21136 if (PackedVT.is128BitVector()) {
21137 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21138 // type legalization.
21139 SDValue Res =
21140 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21141 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21142 }
21143
21144 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21145 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21146 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21147 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21148 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21149}
21150
21151/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21152/// e.g. trunc <8 x i32> X to <8 x i16> -->
21153/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21154/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21155 static SDValue truncateVectorWithPACKUS(MVT DstVT, SDValue In, const SDLoc &DL,
21156 const X86Subtarget &Subtarget,
21157 SelectionDAG &DAG) {
21158 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21159 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21160}
21161
21162/// Truncate using inreg sign extension and X86ISD::PACKSS.
21163 static SDValue truncateVectorWithPACKSS(MVT DstVT, SDValue In, const SDLoc &DL,
21164 const X86Subtarget &Subtarget,
21165 SelectionDAG &DAG) {
21166 EVT SrcVT = In.getValueType();
21167 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21168 DAG.getValueType(DstVT));
21169 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21170}
21171
21172/// Helper to determine if \p In truncated to \p DstVT has the necessary
21173/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21174/// possibly by converting a SRL node to SRA for sign extension.
21175static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21176 SDValue In, const SDLoc &DL,
21177 SelectionDAG &DAG,
21178 const X86Subtarget &Subtarget,
21179 const SDNodeFlags Flags = SDNodeFlags()) {
21180 // Requires SSE2.
21181 if (!Subtarget.hasSSE2())
21182 return SDValue();
21183
21184 EVT SrcVT = In.getValueType();
21185 EVT DstSVT = DstVT.getVectorElementType();
21186 EVT SrcSVT = SrcVT.getVectorElementType();
21187 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21188 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21189
21190 // Check we have a truncation suited for PACKSS/PACKUS.
21191 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21192 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21193 return SDValue();
21194
21195 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21196 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21197
21198 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21199 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21200 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21201 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21202 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21203 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21204 return SDValue();
21205
21206 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21207 // split this for packing.
21208 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21209 !isFreeToSplitVector(In, DAG) &&
21210 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21211 return SDValue();
21212
21213 // Don't truncate AVX512 targets as multiple PACK nodes stages.
21214 if (Subtarget.hasAVX512() && NumStages > 1)
21215 return SDValue();
21216
21217 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21218 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21219
21220 // Truncate with PACKUS if we are truncating a vector with leading zero
21221 // bits that extend all the way to the packed/truncated value.
21222 // e.g. Masks, zext_in_reg, etc.
21223 // Pre-SSE41 we can only use PACKUSWB.
21224 KnownBits Known = DAG.computeKnownBits(In);
21225 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21226 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21227 PackOpcode = X86ISD::PACKUS;
21228 return In;
21229 }
21230
21231 // Truncate with PACKSS if we are truncating a vector with sign-bits
21232 // that extend all the way to the packed/truncated value.
21233 // e.g. Comparison result, sext_in_reg, etc.
21234 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21235
21236 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21237 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21238 // see through BITCASTs later on and combines/simplifications can't then use
21239 // it.
21240 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21241 !Subtarget.hasAVX512())
21242 return SDValue();
21243
21244 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21245 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21246 MinSignBits < NumSignBits) {
21247 PackOpcode = X86ISD::PACKSS;
21248 return In;
21249 }
21250
21251 // If we have a srl that only generates signbits that we will discard in
21252 // the truncation then we can use PACKSS by converting the srl to a sra.
21253 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
21254 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21255 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21256 if (*ShAmt == MinSignBits) {
21257 PackOpcode = X86ISD::PACKSS;
21258 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21259 }
21260 }
21261
21262 return SDValue();
21263}
21264
21265/// This function lowers a vector truncation of 'extended sign-bits' or
21266/// 'extended zero-bits' values.
21267/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
21268 static SDValue LowerTruncateVecPackWithSignBits(
21269 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21270 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21271 MVT SrcVT = In.getSimpleValueType();
21272 MVT DstSVT = DstVT.getVectorElementType();
21273 MVT SrcSVT = SrcVT.getVectorElementType();
21274 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21275 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21276 return SDValue();
21277
21278 // If the upper half of the source is undef, then attempt to split and
21279 // only truncate the lower half.
21280 if (DstVT.getSizeInBits() >= 128) {
21281 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21282 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21283 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21284 Subtarget, DAG))
21285 return widenSubVector(Res, false, Subtarget, DAG, DL,
21286 DstVT.getSizeInBits());
21287 }
21288 }
21289
21290 unsigned PackOpcode;
21291 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21292 Subtarget, Flags))
21293 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21294
21295 return SDValue();
21296}
21297
21298/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21299/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21300 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21301 const X86Subtarget &Subtarget,
21302 SelectionDAG &DAG) {
21303 MVT SrcVT = In.getSimpleValueType();
21304 MVT DstSVT = DstVT.getVectorElementType();
21305 MVT SrcSVT = SrcVT.getVectorElementType();
21306 unsigned NumElems = DstVT.getVectorNumElements();
21307 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21308 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21309 NumElems >= 8))
21310 return SDValue();
21311
21312 // SSSE3's pshufb results in fewer instructions in the cases below.
21313 if (Subtarget.hasSSSE3() && NumElems == 8) {
21314 if (SrcSVT == MVT::i16)
21315 return SDValue();
21316 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21317 return SDValue();
21318 }
21319
21320 // If the upper half of the source is undef, then attempt to split and
21321 // only truncate the lower half.
21322 if (DstVT.getSizeInBits() >= 128) {
21323 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21324 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21325 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21326 return widenSubVector(Res, false, Subtarget, DAG, DL,
21327 DstVT.getSizeInBits());
21328 }
21329 }
21330
21331 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21332 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21333 // truncate 2 x v4i32 to v8i16.
21334 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21335 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21336
21337 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21338 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21339
21340 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21341 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21342 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21343 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21344 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21345 }
21346
21347 return SDValue();
21348}
21349
21350 static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21351 SelectionDAG &DAG,
21352 const X86Subtarget &Subtarget) {
21353 MVT VT = Op.getSimpleValueType();
21354 SDValue In = Op.getOperand(0);
21355 MVT InVT = In.getSimpleValueType();
21356 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21357
21358 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21359 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21360 if (InVT.getScalarSizeInBits() <= 16) {
21361 if (Subtarget.hasBWI()) {
21362 // legal, will go to VPMOVB2M, VPMOVW2M
21363 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21364 // We need to shift to get the lsb into sign position.
21365 // Shift packed bytes not supported natively, bitcast to word
21366 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21367 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21368 DAG.getBitcast(ExtVT, In),
21369 DAG.getConstant(ShiftInx, DL, ExtVT));
21370 In = DAG.getBitcast(InVT, In);
21371 }
21372 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21373 In, ISD::SETGT);
21374 }
21375 // Use TESTD/Q, extended vector to packed dword/qword.
21376 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21377 "Unexpected vector type.");
21378 unsigned NumElts = InVT.getVectorNumElements();
21379 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21380 // We need to change to a wider element type that we have support for.
21381 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21382 // For 16 element vectors we extend to v16i32 unless we are explicitly
21383 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21384 // we need to split into two 8 element vectors which we can extend to v8i32,
21385 // truncate and concat the results. There's an additional complication if
21386 // the original type is v16i8. In that case we can't split the v16i8
21387 // directly, so we need to shuffle high elements to low and use
21388 // sign_extend_vector_inreg.
21389 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21390 SDValue Lo, Hi;
21391 if (InVT == MVT::v16i8) {
21392 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21393 Hi = DAG.getVectorShuffle(
21394 InVT, DL, In, In,
21395 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21396 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21397 } else {
21398 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21399 Lo = extract128BitVector(In, 0, DAG, DL);
21400 Hi = extract128BitVector(In, 8, DAG, DL);
21401 }
21402 // We're split now, just emit two truncates and a concat. The two
21403 // truncates will trigger legalization to come back to this function.
21404 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21405 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21406 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21407 }
21408 // We either have 8 elements or we're allowed to use 512-bit vectors.
21409 // If we have VLX, we want to use the narrowest vector that can get the
21410 // job done so we use vXi32.
21411 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21412 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21413 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21414 InVT = ExtVT;
21415 ShiftInx = InVT.getScalarSizeInBits() - 1;
21416 }
21417
21418 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21419 // We need to shift to get the lsb into sign position.
21420 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21421 DAG.getConstant(ShiftInx, DL, InVT));
21422 }
21423 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21424 if (Subtarget.hasDQI())
21425 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21426 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21427}
21428
21429SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21430 SDLoc DL(Op);
21431 MVT VT = Op.getSimpleValueType();
21432 SDValue In = Op.getOperand(0);
21433 MVT InVT = In.getSimpleValueType();
21435 "Invalid TRUNCATE operation");
21436
21437 // If we're called by the type legalizer, handle a few cases.
21438 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21439 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21440 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21441 VT.is128BitVector() && Subtarget.hasAVX512()) {
21442 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21443 "Unexpected subtarget!");
21444 // The default behavior is to truncate one step, concatenate, and then
21445 // truncate the remainder. We'd rather produce two 64-bit results and
21446 // concatenate those.
21447 SDValue Lo, Hi;
21448 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21449
21450 EVT LoVT, HiVT;
21451 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21452
21453 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21454 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21455 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21456 }
21457
21458 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21459 if (!Subtarget.hasAVX512() ||
21460 (InVT.is512BitVector() && VT.is256BitVector()))
21461 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21462 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21463 return SignPack;
21464
21465 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21466 if (!Subtarget.hasAVX512())
21467 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21468
21469 // Otherwise let default legalization handle it.
21470 return SDValue();
21471 }
21472
21473 if (VT.getVectorElementType() == MVT::i1)
21474 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21475
21476 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21477 // concat from subvectors to use VPTRUNC etc.
21478 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21479 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21480 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21481 return SignPack;
21482
21483 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21484 if (Subtarget.hasAVX512()) {
21485 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21486 assert(VT == MVT::v32i8 && "Unexpected VT!");
21487 return splitVectorIntUnary(Op, DAG, DL);
21488 }
21489
21490 // word to byte only under BWI. Otherwise we have to promote to v16i32
21491 // and then truncate that. But we should only do that if we haven't been
21492 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21493 // handled by isel patterns.
21494 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21495 Subtarget.canExtendTo512DQ())
21496 return Op;
21497 }
21498
21499 // Handle truncation of V256 to V128 using shuffles.
21500 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21501
21502 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21503 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21504 if (Subtarget.hasInt256()) {
21505 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21506 In = DAG.getBitcast(MVT::v8i32, In);
21507 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21508 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21509 DAG.getVectorIdxConstant(0, DL));
21510 }
21511
21512 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21513 DAG.getVectorIdxConstant(0, DL));
21514 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21515 DAG.getVectorIdxConstant(2, DL));
21516 static const int ShufMask[] = {0, 2, 4, 6};
21517 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21518 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21519 }
21520
21521 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21522 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21523 if (Subtarget.hasInt256()) {
21524 // The PSHUFB mask:
21525 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21526 -1, -1, -1, -1, -1, -1, -1, -1,
21527 16, 17, 20, 21, 24, 25, 28, 29,
21528 -1, -1, -1, -1, -1, -1, -1, -1 };
21529 In = DAG.getBitcast(MVT::v32i8, In);
21530 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21531 In = DAG.getBitcast(MVT::v4i64, In);
21532
21533 static const int ShufMask2[] = {0, 2, -1, -1};
21534 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21535 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21536 DAG.getVectorIdxConstant(0, DL));
21537 return DAG.getBitcast(MVT::v8i16, In);
21538 }
21539
21540 return Subtarget.hasSSE41()
21541 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21542 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21543 }
21544
21545 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21546 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21547
21548 llvm_unreachable("All 256->128 cases should have been handled above!");
21549}
21550
21551// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21552// behaves on out of range inputs to generate optimized conversions.
21553 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21554 SelectionDAG &DAG,
21555 const X86Subtarget &Subtarget) {
21556 MVT SrcVT = Src.getSimpleValueType();
21557 unsigned DstBits = VT.getScalarSizeInBits();
21558 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21559
21560 // Calculate the converted result for values in the range 0 to
21561 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21562 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21563 SDValue Big =
21564 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21565 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21566 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21567
21568 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21569 // and only if the value was out of range. So we can use that
21570 // as our indicator that we should use "Big" instead of "Small".
21571 //
21572 // Use "Small" if "IsOverflown" has all bits cleared
21573 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
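// Editorial worked example (not from the source): for an input of 3e9f
// (above 2^31 - 1), Small saturates to 0x80000000 while
// Big = cvttps2dq(3e9f - 2^31) = 852516352, so
// Small | (Big & IsOverflown) = 0x80000000 | 0x32D05E00 = 3000000000.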
21574
21575 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21576 // use the slightly slower blendv select instead.
21577 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21578 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21579 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21580 }
21581
21582 SDValue IsOverflown =
21583 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21584 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21585 return DAG.getNode(ISD::OR, dl, VT, Small,
21586 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21587}
21588
21589SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21590 bool IsStrict = Op->isStrictFPOpcode();
21591 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21592 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21593 bool HasVLX = Subtarget.hasVLX();
21594 MVT VT = Op->getSimpleValueType(0);
21595 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21596 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21597 MVT SrcVT = Src.getSimpleValueType();
21598 SDLoc dl(Op);
21599
21600 SDValue Res;
21601 if (isSoftF16(SrcVT, Subtarget)) {
21602 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21603 if (IsStrict)
21604 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21605 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21606 {NVT, MVT::Other}, {Chain, Src})});
21607 return DAG.getNode(Op.getOpcode(), dl, VT,
21608 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21609 } else if (isTypeLegal(SrcVT) &&
21610 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21611 return Op;
21612 }
21613
21614 if (VT.isVector()) {
21615 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21616 MVT ResVT = MVT::v4i32;
21617 MVT TruncVT = MVT::v4i1;
21618 unsigned Opc;
21619 if (IsStrict)
21620 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21621 else
21622 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21623
21624 if (!IsSigned && !HasVLX) {
21625 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21626 // Widen to 512-bits.
21627 ResVT = MVT::v8i32;
21628 TruncVT = MVT::v8i1;
21629 Opc = Op.getOpcode();
21630 // Need to concat with zero vector for strict fp to avoid spurious
21631 // exceptions.
21632 // TODO: Should we just do this for non-strict as well?
21633 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21634 : DAG.getUNDEF(MVT::v8f64);
21635 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21636 DAG.getVectorIdxConstant(0, dl));
21637 }
21638 if (IsStrict) {
21639 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21640 Chain = Res.getValue(1);
21641 } else {
21642 Res = DAG.getNode(Opc, dl, ResVT, Src);
21643 }
21644
21645 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21646 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21647 DAG.getVectorIdxConstant(0, dl));
21648 if (IsStrict)
21649 return DAG.getMergeValues({Res, Chain}, dl);
21650 return Res;
21651 }
21652
21653 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21654 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21655 VT == MVT::v32i16)
21656 return Op;
21657
21658 MVT ResVT = VT;
21659 MVT EleVT = VT.getVectorElementType();
21660 if (EleVT != MVT::i64)
21661 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21662
21663 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21664 SDValue Tmp =
21665 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21666 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21667 Ops[0] = Src;
21668 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21669 }
21670
21671 if (!HasVLX) {
21672 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21673 // Widen to 512-bits.
21674 unsigned IntSize = EleVT.getSizeInBits();
21675 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21676 ResVT = MVT::getVectorVT(EleVT, Num);
21677 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21678 Subtarget, DAG, dl);
21679 }
21680
21681 if (IsStrict) {
21682 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21683 : X86ISD::STRICT_CVTTP2UI,
21684 dl, {ResVT, MVT::Other}, {Chain, Src});
21685 Chain = Res.getValue(1);
21686 } else {
21687 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21688 ResVT, Src);
21689 }
21690
21691 // TODO: Need to add exception check code for strict FP.
21692 if (EleVT.getSizeInBits() < 16) {
21693 if (HasVLX)
21694 ResVT = MVT::getVectorVT(EleVT, 8);
21695 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21696 }
21697
21698 if (ResVT != VT)
21699 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21700 DAG.getVectorIdxConstant(0, dl));
21701
21702 if (IsStrict)
21703 return DAG.getMergeValues({Res, Chain}, dl);
21704 return Res;
21705 }
21706
21707 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21708 if (VT.getVectorElementType() == MVT::i16) {
21709 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21710 SrcVT.getVectorElementType() == MVT::f64) &&
21711 "Expected f32/f64 vector!");
21712 MVT NVT = VT.changeVectorElementType(MVT::i32);
21713 if (IsStrict) {
21714 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21715 : ISD::STRICT_FP_TO_UINT,
21716 dl, {NVT, MVT::Other}, {Chain, Src});
21717 Chain = Res.getValue(1);
21718 } else {
21719 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21720 NVT, Src);
21721 }
21722
21723 // TODO: Need to add exception check code for strict FP.
21724 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21725
21726 if (IsStrict)
21727 return DAG.getMergeValues({Res, Chain}, dl);
21728 return Res;
21729 }
21730
21731 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21732 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21733 assert(!IsSigned && "Expected unsigned conversion!");
21734 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21735 return Op;
21736 }
21737
21738 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21739 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21740 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21741 Subtarget.useAVX512Regs()) {
21742 assert(!IsSigned && "Expected unsigned conversion!");
21743 assert(!Subtarget.hasVLX() && "Unexpected features!");
21744 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21745 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21746 // Need to concat with zero vector for strict fp to avoid spurious
21747 // exceptions.
21748 // TODO: Should we just do this for non-strict as well?
21749 SDValue Tmp =
21750 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21751 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21752 DAG.getVectorIdxConstant(0, dl));
21753
21754 if (IsStrict) {
21755 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21756 {Chain, Src});
21757 Chain = Res.getValue(1);
21758 } else {
21759 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21760 }
21761
21762 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21763 DAG.getVectorIdxConstant(0, dl));
21764
21765 if (IsStrict)
21766 return DAG.getMergeValues({Res, Chain}, dl);
21767 return Res;
21768 }
21769
21770 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21771 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21772 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21773 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21774 assert(!Subtarget.hasVLX() && "Unexpected features!");
21775 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21776 // Need to concat with zero vector for strict fp to avoid spurious
21777 // exceptions.
21778 // TODO: Should we just do this for non-strict as well?
21779 SDValue Tmp =
21780 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21781 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21782 DAG.getVectorIdxConstant(0, dl));
21783
21784 if (IsStrict) {
21785 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21786 {Chain, Src});
21787 Chain = Res.getValue(1);
21788 } else {
21789 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21790 }
21791
21792 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21793 DAG.getVectorIdxConstant(0, dl));
21794
21795 if (IsStrict)
21796 return DAG.getMergeValues({Res, Chain}, dl);
21797 return Res;
21798 }
21799
21800 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21801 if (!Subtarget.hasVLX()) {
21802 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
21803 // legalizer and then widened again by vector op legalization.
21804 if (!IsStrict)
21805 return SDValue();
21806
21807 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21808 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21809 {Src, Zero, Zero, Zero});
21810 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21811 {Chain, Tmp});
21812 SDValue Chain = Tmp.getValue(1);
21813 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21814 DAG.getVectorIdxConstant(0, dl));
21815 return DAG.getMergeValues({Tmp, Chain}, dl);
21816 }
21817
21818 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21819 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21820 DAG.getUNDEF(MVT::v2f32));
21821 if (IsStrict) {
21822 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21823 : X86ISD::STRICT_CVTTP2UI;
21824 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21825 }
21826 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21827 return DAG.getNode(Opc, dl, VT, Tmp);
21828 }
21829
21830 // Generate optimized instructions for pre-AVX512 unsigned conversions from
21831 // vXf32/vXf64 to vXi32.
21832 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21833 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21834 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21835 assert(!IsSigned && "Expected unsigned conversion!");
21836 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21837 }
21838
21839 return SDValue();
21840 }
21841
21842 assert(!VT.isVector());
21843
21844 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21845
21846 if (!IsSigned && UseSSEReg) {
21847 // Conversions from f32/f64 with AVX512 should be legal.
21848 if (Subtarget.hasAVX512())
21849 return Op;
21850
21851 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21852 // behaves on out of range inputs to generate optimized conversions.
21853 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21854 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21855 unsigned DstBits = VT.getScalarSizeInBits();
21856 APInt UIntLimit = APInt::getSignMask(DstBits);
21857 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21858 DAG.getConstant(UIntLimit, dl, VT));
21859 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21860
21861 // Calculate the converted result for values in the range:
21862 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21863 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21864 SDValue Small =
21865 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21866 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21867 SDValue Big = DAG.getNode(
21868 X86ISD::CVTTS2SI, dl, VT,
21869 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21870 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21871
21872 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21873 // and only if the value was out of range. So we can use that
21874 // as our indicator that we should use "Big" instead of "Small".
21875 //
21876 // Use "Small" if "IsOverflown" has all bits cleared
21877 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
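    // As an example (value chosen only for illustration), converting 1e19
    // from f64 to unsigned i64 on a 64-bit target:
    //   cvttsd2si(1e19)                         -> 0x8000000000000000
    //   cvttsd2si(1e19 - 9223372036854775808.0) -> 776627963145224192
    // OR-ing "Small" back over "Big" then reconstructs 10000000000000000000.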
21878 SDValue IsOverflown = DAG.getNode(
21879 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21880 return DAG.getNode(ISD::OR, dl, VT, Small,
21881 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21882 }
21883
21884 // Use default expansion for i64.
21885 if (VT == MVT::i64)
21886 return SDValue();
21887
21888 assert(VT == MVT::i32 && "Unexpected VT!");
21889
21890 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21891 // FIXME: This does not generate an invalid exception if the input does not
21892 // fit in i32. PR44019
21893 if (Subtarget.is64Bit()) {
21894 if (IsStrict) {
21895 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21896 {Chain, Src});
21897 Chain = Res.getValue(1);
21898 } else
21899 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21900
21901 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21902 if (IsStrict)
21903 return DAG.getMergeValues({Res, Chain}, dl);
21904 return Res;
21905 }
21906
21907 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21908 // use fisttp which will be handled later.
21909 if (!Subtarget.hasSSE3())
21910 return SDValue();
21911 }
21912
21913 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21914 // FIXME: This does not generate an invalid exception if the input does not
21915 // fit in i16. PR44019
21916 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21917 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21918 if (IsStrict) {
21919 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21920 {Chain, Src});
21921 Chain = Res.getValue(1);
21922 } else
21923 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21924
21925 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21926 if (IsStrict)
21927 return DAG.getMergeValues({Res, Chain}, dl);
21928 return Res;
21929 }
21930
21931 // If this is a FP_TO_SINT using SSEReg we're done.
21932 if (UseSSEReg && IsSigned)
21933 return Op;
21934
21935 // fp128 needs to use a libcall.
21936 if (SrcVT == MVT::f128) {
21937 RTLIB::Libcall LC;
21938 if (IsSigned)
21939 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21940 else
21941 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21942
21943 MakeLibCallOptions CallOptions;
21944 std::pair<SDValue, SDValue> Tmp =
21945 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21946
21947 if (IsStrict)
21948 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21949
21950 return Tmp.first;
21951 }
21952
21953 // Fall back to X87.
21954 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21955 if (IsStrict)
21956 return DAG.getMergeValues({V, Chain}, dl);
21957 return V;
21958 }
21959
21960 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21961}
21962
21963SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21964 SelectionDAG &DAG) const {
21965 SDValue Src = Op.getOperand(0);
21966 EVT DstVT = Op.getSimpleValueType();
21967 MVT SrcVT = Src.getSimpleValueType();
21968
21969 if (SrcVT.isVector())
21970 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21971
21972 if (SrcVT == MVT::f16)
21973 return SDValue();
21974
21975 // If the source is in an SSE register, the node is Legal.
21976 if (isScalarFPTypeInSSEReg(SrcVT))
21977 return Op;
21978
21979 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21980}
21981
21982SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21983 SelectionDAG &DAG) const {
21984 EVT DstVT = N->getValueType(0);
21985 SDValue Src = N->getOperand(0);
21986 EVT SrcVT = Src.getValueType();
21987
21988 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21989 // f16 must be promoted before using the lowering in this routine.
21990 // fp128 does not use this lowering.
21991 return SDValue();
21992 }
21993
21994 SDLoc DL(N);
21995 SDValue Chain = DAG.getEntryNode();
21996
21997 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21998
21999 // If we're converting from SSE, the stack slot needs to hold both types.
22000 // Otherwise it only needs to hold the DstVT.
22001 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22002 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22003 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22004 MachinePointerInfo MPI =
22005 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22006
22007 if (UseSSE) {
22008 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22009 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22010 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22011 SDValue Ops[] = { Chain, StackPtr };
22012
22013 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22014 /*Align*/ std::nullopt,
22015 MachineMemOperand::MOLoad);
22016 Chain = Src.getValue(1);
22017 }
22018
22019 SDValue StoreOps[] = { Chain, Src, StackPtr };
22020 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22021 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22022 MachineMemOperand::MOStore);
22023
22024 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22025}
22026
22027SDValue
22028X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22029 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22030 // but making use of X86 specifics to produce better instruction sequences.
22031 SDNode *Node = Op.getNode();
22032 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22033 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22034 SDLoc dl(SDValue(Node, 0));
22035 SDValue Src = Node->getOperand(0);
22036
22037 // There are three types involved here: SrcVT is the source floating point
22038 // type, DstVT is the type of the result, and TmpVT is the result of the
22039 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22040 // DstVT).
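  // For example, a scalar llvm.fptosi.sat.i8.f32 would have SrcVT = f32 and
  // DstVT = i8, with TmpVT promoted to i32 below.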
22041 EVT SrcVT = Src.getValueType();
22042 EVT DstVT = Node->getValueType(0);
22043 EVT TmpVT = DstVT;
22044
22045 // This code is only for floats and doubles. Fall back to generic code for
22046 // anything else.
22047 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22048 return SDValue();
22049
22050 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22051 unsigned SatWidth = SatVT.getScalarSizeInBits();
22052 unsigned DstWidth = DstVT.getScalarSizeInBits();
22053 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22054 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22055 "Expected saturation width smaller than result width");
22056
22057 // Promote result of FP_TO_*INT to at least 32 bits.
22058 if (TmpWidth < 32) {
22059 TmpVT = MVT::i32;
22060 TmpWidth = 32;
22061 }
22062
22063 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22064 // us to use a native signed conversion instead.
22065 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22066 TmpVT = MVT::i64;
22067 TmpWidth = 64;
22068 }
22069
22070 // If the saturation width is smaller than the size of the temporary result,
22071 // we can always use signed conversion, which is native.
22072 if (SatWidth < TmpWidth)
22073 FpToIntOpcode = ISD::FP_TO_SINT;
22074
22075 // Determine minimum and maximum integer values and their corresponding
22076 // floating-point values.
22077 APInt MinInt, MaxInt;
22078 if (IsSigned) {
22079 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22080 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22081 } else {
22082 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22083 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22084 }
22085
22086 const fltSemantics &Sem = SrcVT.getFltSemantics();
22087 APFloat MinFloat(Sem);
22088 APFloat MaxFloat(Sem);
22089
22090 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22091 MinInt, IsSigned, APFloat::rmTowardZero);
22092 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22093 MaxInt, IsSigned, APFloat::rmTowardZero);
22094 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22095 && !(MaxStatus & APFloat::opStatus::opInexact);
22096
22097 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22098 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22099
22100 // If the integer bounds are exactly representable as floats, emit a
22101 // min+max+fptoi sequence. Otherwise use comparisons and selects.
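  // (As an illustration: for an f64 source both i32 bounds are exact, so an
  // i32 saturation takes the min+max path below; for an f32 source INT32_MAX
  // is not exactly representable, so that case uses the compare+select path.)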
22102 if (AreExactFloatBounds) {
22103 if (DstVT != TmpVT) {
22104 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22105 SDValue MinClamped = DAG.getNode(
22106 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22107 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22108 SDValue BothClamped = DAG.getNode(
22109 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22110 // Convert clamped value to integer.
22111 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22112
22113 // NaN will become INDVAL, with the top bit set and the rest zero.
22114 // Truncation will discard the top bit, resulting in zero.
22115 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22116 }
22117
22118 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22119 SDValue MinClamped = DAG.getNode(
22120 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22121 // Clamp by MaxFloat from above. NaN cannot occur.
22122 SDValue BothClamped = DAG.getNode(
22123 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22124 // Convert clamped value to integer.
22125 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22126
22127 if (!IsSigned) {
22128 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22129 // which is zero.
22130 return FpToInt;
22131 }
22132
22133 // Otherwise, select zero if Src is NaN.
22134 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22135 return DAG.getSelectCC(
22136 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22137 }
22138
22139 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22140 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22141
22142 // Result of direct conversion, which may be selected away.
22143 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22144
22145 if (DstVT != TmpVT) {
22146 // NaN will become INDVAL, with the top bit set and the rest zero.
22147 // Truncation will discard the top bit, resulting in zero.
22148 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22149 }
22150
22151 SDValue Select = FpToInt;
22152 // For signed conversions where we saturate to the same size as the
22153 // result type of the fptoi instructions, INDVAL coincides with integer
22154 // minimum, so we don't need to explicitly check it.
22155 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22156 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22157 // MinInt if Src is NaN.
22158 Select = DAG.getSelectCC(
22159 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22160 }
22161
22162 // If Src OGT MaxFloat, select MaxInt.
22163 Select = DAG.getSelectCC(
22164 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22165
22166 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22167 // is already zero. The promoted case was already handled above.
22168 if (!IsSigned || DstVT != TmpVT) {
22169 return Select;
22170 }
22171
22172 // Otherwise, select 0 if Src is NaN.
22173 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22174 return DAG.getSelectCC(
22175 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22176}
22177
22178SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22179 bool IsStrict = Op->isStrictFPOpcode();
22180
22181 SDLoc DL(Op);
22182 MVT VT = Op.getSimpleValueType();
22183 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22184 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22185 MVT SVT = In.getSimpleValueType();
22186
22187 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22188 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22189 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22190 !Subtarget.getTargetTriple().isOSDarwin()))
22191 return SDValue();
22192
22193 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22194 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22195 return Op;
22196
22197 if (SVT == MVT::f16) {
22198 if (Subtarget.hasFP16())
22199 return Op;
22200
22201 if (VT != MVT::f32) {
22202 if (IsStrict)
22203 return DAG.getNode(
22204 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22205 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22206 {MVT::f32, MVT::Other}, {Chain, In})});
22207
22208 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22209 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22210 }
22211
22212 if (!Subtarget.hasF16C()) {
22213 if (!Subtarget.getTargetTriple().isOSDarwin())
22214 return SDValue();
22215
22216 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22217
22218 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22219 TargetLowering::CallLoweringInfo CLI(DAG);
22220 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22221
22222 In = DAG.getBitcast(MVT::i16, In);
22223 TargetLowering::ArgListTy Args;
22224 TargetLowering::ArgListEntry Entry(
22225 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22226 Entry.IsSExt = false;
22227 Entry.IsZExt = true;
22228 Args.push_back(Entry);
22229
22230 SDValue Callee = DAG.getExternalSymbol(
22231 getLibcallName(RTLIB::FPEXT_F16_F32),
22232 getPointerTy(DAG.getDataLayout()));
22233 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22234 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22235 std::move(Args));
22236
22237 SDValue Res;
22238 std::tie(Res,Chain) = LowerCallTo(CLI);
22239 if (IsStrict)
22240 Res = DAG.getMergeValues({Res, Chain}, DL);
22241
22242 return Res;
22243 }
22244
22245 In = DAG.getBitcast(MVT::i16, In);
22246 SDValue Res;
22247 if (IsStrict) {
22248 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22249 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22250 DAG.getVectorIdxConstant(0, DL));
22251 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22252 {Chain, In});
22253 Chain = Res.getValue(1);
22254 } else {
22255 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22256 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22257 DAG.getUNDEF(MVT::v4i32), In,
22258 DAG.getVectorIdxConstant(0, DL));
22259 In = DAG.getBitcast(MVT::v8i16, In);
22260 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22261 DAG.getTargetConstant(4, DL, MVT::i32));
22262 }
22263 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22264 DAG.getVectorIdxConstant(0, DL));
22265 if (IsStrict)
22266 return DAG.getMergeValues({Res, Chain}, DL);
22267 return Res;
22268 }
22269
22270 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22271 return Op;
22272
22273 if (SVT.getVectorElementType() == MVT::f16) {
22274 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22275 return Op;
22276 assert(Subtarget.hasF16C() && "Unexpected features!");
22277 if (SVT == MVT::v2f16)
22278 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22279 DAG.getUNDEF(MVT::v2f16));
22280 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22281 DAG.getUNDEF(MVT::v4f16));
22282 if (IsStrict)
22283 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22284 {Op->getOperand(0), Res});
22285 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22286 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22287 return Op;
22288 }
22289
22290 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22291
22292 SDValue Res =
22293 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22294 if (IsStrict)
22295 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22296 {Op->getOperand(0), Res});
22297 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22298}
22299
22300SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22301 bool IsStrict = Op->isStrictFPOpcode();
22302
22303 SDLoc DL(Op);
22304 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22305 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22306 MVT VT = Op.getSimpleValueType();
22307 MVT SVT = In.getSimpleValueType();
22308
22309 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22310 return SDValue();
22311
22312 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22313 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22314 if (!Subtarget.getTargetTriple().isOSDarwin())
22315 return SDValue();
22316
22317 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22318 TargetLowering::CallLoweringInfo CLI(DAG);
22319 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22320
22321 TargetLowering::ArgListTy Args;
22322 TargetLowering::ArgListEntry Entry(
22323 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22324 Entry.IsSExt = false;
22325 Entry.IsZExt = true;
22326 Args.push_back(Entry);
22327
22328 SDValue Callee = DAG.getExternalSymbol(
22329 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22330 : RTLIB::FPROUND_F32_F16),
22331 getPointerTy(DAG.getDataLayout()));
22332 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22333 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22334 std::move(Args));
22335
22336 SDValue Res;
22337 std::tie(Res, Chain) = LowerCallTo(CLI);
22338
22339 Res = DAG.getBitcast(MVT::f16, Res);
22340
22341 if (IsStrict)
22342 Res = DAG.getMergeValues({Res, Chain}, DL);
22343
22344 return Res;
22345 }
22346
22347 if (VT.getScalarType() == MVT::bf16) {
22348 if (SVT.getScalarType() == MVT::f32 &&
22349 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22350 Subtarget.hasAVXNECONVERT()))
22351 return Op;
22352 return SDValue();
22353 }
22354
22355 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22356 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22357 return SDValue();
22358
22359 if (VT.isVector())
22360 return Op;
22361
22362 SDValue Res;
22363 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22364 MVT::i32);
22365 if (IsStrict) {
22366 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22367 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22368 DAG.getVectorIdxConstant(0, DL));
22369 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22370 {Chain, Res, Rnd});
22371 Chain = Res.getValue(1);
22372 } else {
22373 // FIXME: Should we use zeros for upper elements for non-strict?
22374 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22375 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22376 }
22377
22378 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22379 DAG.getVectorIdxConstant(0, DL));
22380 Res = DAG.getBitcast(MVT::f16, Res);
22381
22382 if (IsStrict)
22383 return DAG.getMergeValues({Res, Chain}, DL);
22384
22385 return Res;
22386 }
22387
22388 return Op;
22389}
22390
22391static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22392 bool IsStrict = Op->isStrictFPOpcode();
22393 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22394 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22395 "Unexpected VT!");
22396
22397 SDLoc dl(Op);
22398 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22399 DAG.getConstant(0, dl, MVT::v8i16), Src,
22400 DAG.getVectorIdxConstant(0, dl));
22401
22402 SDValue Chain;
22403 if (IsStrict) {
22404 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22405 {Op.getOperand(0), Res});
22406 Chain = Res.getValue(1);
22407 } else {
22408 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22409 }
22410
22411 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22412 DAG.getVectorIdxConstant(0, dl));
22413
22414 if (IsStrict)
22415 return DAG.getMergeValues({Res, Chain}, dl);
22416
22417 return Res;
22418}
22419
22420static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22421 bool IsStrict = Op->isStrictFPOpcode();
22422 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22423 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22424 "Unexpected VT!");
22425
22426 SDLoc dl(Op);
22427 SDValue Res, Chain;
22428 if (IsStrict) {
22429 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22430 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22431 DAG.getVectorIdxConstant(0, dl));
22432 Res = DAG.getNode(
22433 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22434 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22435 Chain = Res.getValue(1);
22436 } else {
22437 // FIXME: Should we use zeros for upper elements for non-strict?
22438 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22439 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22440 DAG.getTargetConstant(4, dl, MVT::i32));
22441 }
22442
22443 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22444 DAG.getVectorIdxConstant(0, dl));
22445
22446 if (IsStrict)
22447 return DAG.getMergeValues({Res, Chain}, dl);
22448
22449 return Res;
22450}
22451
22452SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22453 SelectionDAG &DAG) const {
22454 SDLoc DL(Op);
22455
22456 MVT SVT = Op.getOperand(0).getSimpleValueType();
22457 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22458 Subtarget.hasAVXNECONVERT())) {
22459 SDValue Res;
22460 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22461 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22462 Res = DAG.getBitcast(MVT::v8i16, Res);
22463 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22464 DAG.getVectorIdxConstant(0, DL));
22465 }
22466
22467 MakeLibCallOptions CallOptions;
22468 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22469 SDValue Res =
22470 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22471 return DAG.getBitcast(MVT::i16, Res);
22472}
22473
22474/// Depending on uarch and/or optimizing for size, we might prefer to use a
22475/// vector operation in place of the typical scalar operation.
22476static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22477 SelectionDAG &DAG,
22478 const X86Subtarget &Subtarget) {
22479 // If both operands have other uses, this is probably not profitable.
22480 SDValue LHS = Op.getOperand(0);
22481 SDValue RHS = Op.getOperand(1);
22482 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22483 return Op;
22484
22485 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22486 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22487 if (IsFP && !Subtarget.hasSSE3())
22488 return Op;
22489 if (!IsFP && !Subtarget.hasSSSE3())
22490 return Op;
22491
22492 // Extract from a common vector.
22493 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22494 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22495 LHS.getOperand(0) != RHS.getOperand(0) ||
22496 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22497 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22498 !shouldUseHorizontalOp(true, DAG, Subtarget))
22499 return Op;
22500
22501 // Allow commuted 'hadd' ops.
22502 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22503 unsigned HOpcode;
22504 switch (Op.getOpcode()) {
22505 // clang-format off
22506 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22507 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22508 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22509 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22510 default:
22511 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22512 // clang-format on
22513 }
22514 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22515 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22516 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22517 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22518 std::swap(LExtIndex, RExtIndex);
22519
22520 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22521 return Op;
22522
22523 SDValue X = LHS.getOperand(0);
22524 EVT VecVT = X.getValueType();
22525 unsigned BitWidth = VecVT.getSizeInBits();
22526 unsigned NumLanes = BitWidth / 128;
22527 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22528 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22529 "Not expecting illegal vector widths here");
22530
22531 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22532 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22533 if (BitWidth == 256 || BitWidth == 512) {
22534 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22535 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22536 LExtIndex %= NumEltsPerLane;
22537 }
22538
22539 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22540 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22541 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22542 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22543 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22544 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22545 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22546}
22547
22548/// Depending on uarch and/or optimizing for size, we might prefer to use a
22549/// vector operation in place of the typical scalar operation.
22550SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22551 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22552 "Only expecting float/double");
22553 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22554}
22555
22556/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22557/// This mode isn't supported in hardware on X86. But as long as we aren't
22558/// compiling with trapping math, we can emulate this with
22559/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
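/// For example (values illustrative only): FROUND(2.3f) truncates a sum just
/// below 2.8 and yields 2.0f, while for FROUND(2.5f) the addition rounds up to
/// 3.0f, so truncation yields 3.0f, matching round-half-away-from-zero.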
22560static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22561 SDValue N0 = Op.getOperand(0);
22562 SDLoc dl(Op);
22563 MVT VT = Op.getSimpleValueType();
22564
22565 // N0 += copysign(nextafter(0.5, 0.0), N0)
22566 const fltSemantics &Sem = VT.getFltSemantics();
22567 bool Ignored;
22568 APFloat Point5Pred = APFloat(0.5f);
22569 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22570 Point5Pred.next(/*nextDown*/true);
22571
22572 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22573 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22574 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22575
22576 // Truncate the result to remove fraction.
22577 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22578}
22579
22580/// The only differences between FABS and FNEG are the mask and the logic op.
22581/// FNEG also has a folding opportunity for FNEG(FABS(x)).
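/// Roughly, for an f32 operand this lowers to a bitwise op with a splatted
/// constant: FABS(x) -> FAND(x, 0x7FFFFFFF), FNEG(x) -> FXOR(x, 0x80000000),
/// and FNEG(FABS(x)) -> FOR(x, 0x80000000).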
22582static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22583 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22584 "Wrong opcode for lowering FABS or FNEG.");
22585
22586 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22587
22588 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22589 // into an FNABS. We'll lower the FABS after that if it is still in use.
22590 if (IsFABS)
22591 for (SDNode *User : Op->users())
22592 if (User->getOpcode() == ISD::FNEG)
22593 return Op;
22594
22595 SDLoc dl(Op);
22596 MVT VT = Op.getSimpleValueType();
22597
22598 bool IsF128 = (VT == MVT::f128);
22599 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22600 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22601 "Unexpected type in LowerFABSorFNEG");
22602
22603 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22604 // decide if we should generate a 16-byte constant mask when we only need 4 or
22605 // 8 bytes for the scalar case.
22606
22607 // There are no scalar bitwise logical SSE/AVX instructions, so we
22608 // generate a 16-byte vector constant and logic op even for the scalar case.
22609 // Using a 16-byte mask allows folding the load of the mask with
22610 // the logic op, so it can save (~4 bytes) on code size.
22611 bool IsFakeVector = !VT.isVector() && !IsF128;
22612 MVT LogicVT = VT;
22613 if (IsFakeVector)
22614 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22615 : (VT == MVT::f32) ? MVT::v4f32
22616 : MVT::v8f16;
22617
22618 unsigned EltBits = VT.getScalarSizeInBits();
22619 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22620 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22621 APInt::getSignMask(EltBits);
22622 const fltSemantics &Sem = VT.getFltSemantics();
22623 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22624
22625 SDValue Op0 = Op.getOperand(0);
22626 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22627 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22628 IsFNABS ? X86ISD::FOR :
22629 X86ISD::FXOR;
22630 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22631
22632 if (VT.isVector() || IsF128)
22633 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22634
22635 // For the scalar case extend to a 128-bit vector, perform the logic op,
22636 // and extract the scalar result back out.
22637 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22638 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22639 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22640 DAG.getVectorIdxConstant(0, dl));
22641}
22642
22643static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22644 SDValue Mag = Op.getOperand(0);
22645 SDValue Sign = Op.getOperand(1);
22646 SDLoc dl(Op);
22647
22648 // If the sign operand is smaller, extend it first.
22649 MVT VT = Op.getSimpleValueType();
22650 if (Sign.getSimpleValueType().bitsLT(VT))
22651 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22652
22653 // And if it is bigger, shrink it first.
22654 if (Sign.getSimpleValueType().bitsGT(VT))
22655 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22656 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22657
22658 // At this point the operands and the result should have the same
22659 // type, and that won't be f80 since that is not custom lowered.
22660 bool IsF128 = (VT == MVT::f128);
22661 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22662 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22663 "Unexpected type in LowerFCOPYSIGN");
22664
22665 const fltSemantics &Sem = VT.getFltSemantics();
22666
22667 // Perform all scalar logic operations as 16-byte vectors because there are no
22668 // scalar FP logic instructions in SSE.
22669 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22670 // unnecessary splats, but we might miss load folding opportunities. Should
22671 // this decision be based on OptimizeForSize?
22672 bool IsFakeVector = !VT.isVector() && !IsF128;
22673 MVT LogicVT = VT;
22674 if (IsFakeVector)
22675 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22676 : (VT == MVT::f32) ? MVT::v4f32
22677 : MVT::v8f16;
22678
22679 // The mask constants are automatically splatted for vector types.
22680 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22681 SDValue SignMask = DAG.getConstantFP(
22682 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22683 SDValue MagMask = DAG.getConstantFP(
22684 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
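  // In effect this computes, for a scalar f32 (shown here only as an example):
  //   copysign(Mag, Sign) = (Mag & 0x7FFFFFFF) | (Sign & 0x80000000)
  // with both constants splatted across the 128-bit logic type.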
22685
22686 // First, clear all bits but the sign bit from the second operand (sign).
22687 if (IsFakeVector)
22688 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22689 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22690
22691 // Next, clear the sign bit from the first operand (magnitude).
22692 // TODO: If we had general constant folding for FP logic ops, this check
22693 // wouldn't be necessary.
22694 SDValue MagBits;
22695 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22696 APFloat APF = Op0CN->getValueAPF();
22697 APF.clearSign();
22698 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22699 } else {
22700 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22701 if (IsFakeVector)
22702 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22703 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22704 }
22705
22706 // OR the magnitude value with the sign bit.
22707 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22708 return !IsFakeVector ? Or
22709 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22710 DAG.getVectorIdxConstant(0, dl));
22711}
22712
22713static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22714 SDValue N0 = Op.getOperand(0);
22715 SDLoc dl(Op);
22716 MVT VT = Op.getSimpleValueType();
22717
22718 MVT OpVT = N0.getSimpleValueType();
22719 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22720 "Unexpected type for FGETSIGN");
22721
22722 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22723 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22724 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22725 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22726 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22727 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22728 return Res;
22729}
22730
22731/// Helper for attempting to create a X86ISD::BT node.
22732static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22733 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22734 // instruction. Since the shift amount is in-range-or-undefined, we know
22735 // that doing a bittest on the i32 value is ok. We extend to i32 because
22736 // the encoding for the i16 version is larger than the i32 version.
22737 // Also promote i16 to i32 for performance / code size reasons.
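  // For instance, a bit test such as "(x >> imm) & 1" on an i16 value can be
  // performed as BT on the i32 any-extension of x, since only the addressed
  // bit is read.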
22738 if (Src.getValueType().getScalarSizeInBits() < 32)
22739 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22740
22741 // No legal type found, give up.
22742 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22743 return SDValue();
22744
22745 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22746 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22747 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22748 // known to be zero.
22749 if (Src.getValueType() == MVT::i64 &&
22750 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22751 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22752
22753 // If the operand types disagree, extend the shift amount to match. Since
22754 // BT ignores high bits (like shifts) we can use anyextend.
22755 if (Src.getValueType() != BitNo.getValueType()) {
22756 // Peek through a mask/modulo operation.
22757 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22758 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22759 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22760 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22761 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22762 BitNo.getOperand(0)),
22763 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22764 BitNo.getOperand(1)));
22765 else
22766 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22767 }
22768
22769 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22770}
22771
22772/// Helper for creating a X86ISD::SETCC node.
22773static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22774 SelectionDAG &DAG) {
22775 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22776 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22777}
22778
22779/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22780/// recognizable memcmp expansion.
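/// For example, a 32-byte memcmp()==0 expansion typically has the shape
///   or (xor (load a0), (load b0)), (xor (load a1), (load b1))
/// compared against zero, which is what this matcher looks for.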
22781static bool isOrXorXorTree(SDValue X, bool Root = true) {
22782 if (X.getOpcode() == ISD::OR)
22783 return isOrXorXorTree(X.getOperand(0), false) &&
22784 isOrXorXorTree(X.getOperand(1), false);
22785 if (Root)
22786 return false;
22787 return X.getOpcode() == ISD::XOR;
22788}
22789
22790/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22791/// expansion.
22792template <typename F>
22793static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22794 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22795 SDValue Op0 = X.getOperand(0);
22796 SDValue Op1 = X.getOperand(1);
22797 if (X.getOpcode() == ISD::OR) {
22798 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22799 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22800 if (VecVT != CmpVT)
22801 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22802 if (HasPT)
22803 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22804 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22805 }
22806 if (X.getOpcode() == ISD::XOR) {
22807 SDValue A = SToV(Op0);
22808 SDValue B = SToV(Op1);
22809 if (VecVT != CmpVT)
22810 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22811 if (HasPT)
22812 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22813 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22814 }
22815 llvm_unreachable("Impossible");
22816}
22817
22818/// Try to map a 128-bit or larger integer comparison to vector instructions
22819/// before type legalization splits it up into chunks.
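/// For example, on SSE2 an i128 equality compare can become
///   setcc (movmsk (pcmpeqb (bitcast X), (bitcast Y))), 0xFFFF, eq
/// instead of two scalar 64-bit compares.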
22820static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22821 ISD::CondCode CC,
22822 const SDLoc &DL,
22823 SelectionDAG &DAG,
22824 const X86Subtarget &Subtarget) {
22825 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22826
22827 // We're looking for an oversized integer equality comparison.
22828 EVT OpVT = X.getValueType();
22829 unsigned OpSize = OpVT.getSizeInBits();
22830 if (!OpVT.isScalarInteger() || OpSize < 128)
22831 return SDValue();
22832
22833 // Ignore a comparison with zero because that gets special treatment in
22834 // EmitTest(). But make an exception for the special case of a pair of
22835 // logically-combined vector-sized operands compared to zero. This pattern may
22836 // be generated by the memcmp expansion pass with oversized integer compares
22837 // (see PR33325).
22838 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22839 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22840 return SDValue();
22841
22842 // Don't perform this combine if constructing the vector will be expensive.
22843 auto IsVectorBitCastCheap = [](SDValue X) {
22844 X = peekThroughBitcasts(X);
22845 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22846 X.getOpcode() == ISD::LOAD;
22847 };
22848 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22849 !IsOrXorXorTreeCCZero)
22850 return SDValue();
22851
22852 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22853 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22854 // Otherwise use PCMPEQ (plus AND) and mask testing.
22855 bool NoImplicitFloatOps =
22856 DAG.getMachineFunction().getFunction().hasFnAttribute(
22857 Attribute::NoImplicitFloat);
22858 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22859 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22860 (OpSize == 256 && Subtarget.hasAVX()) ||
22861 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22862 bool HasPT = Subtarget.hasSSE41();
22863
22864 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22865 // vector registers are essentially free. (Technically, widening registers
22866 // prevents load folding, but the tradeoff is worth it.)
22867 bool PreferKOT = Subtarget.preferMaskRegisters();
22868 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22869
22870 EVT VecVT = MVT::v16i8;
22871 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22872 if (OpSize == 256) {
22873 VecVT = MVT::v32i8;
22874 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22875 }
22876 EVT CastVT = VecVT;
22877 bool NeedsAVX512FCast = false;
22878 if (OpSize == 512 || NeedZExt) {
22879 if (Subtarget.hasBWI()) {
22880 VecVT = MVT::v64i8;
22881 CmpVT = MVT::v64i1;
22882 if (OpSize == 512)
22883 CastVT = VecVT;
22884 } else {
22885 VecVT = MVT::v16i32;
22886 CmpVT = MVT::v16i1;
22887 CastVT = OpSize == 512 ? VecVT
22888 : OpSize == 256 ? MVT::v8i32
22889 : MVT::v4i32;
22890 NeedsAVX512FCast = true;
22891 }
22892 }
22893
22894 auto ScalarToVector = [&](SDValue X) -> SDValue {
22895 bool TmpZext = false;
22896 EVT TmpCastVT = CastVT;
22897 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22898 SDValue OrigX = X.getOperand(0);
22899 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22900 if (OrigSize < OpSize) {
22901 if (OrigSize == 128) {
22902 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22903 X = OrigX;
22904 TmpZext = true;
22905 } else if (OrigSize == 256) {
22906 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22907 X = OrigX;
22908 TmpZext = true;
22909 }
22910 }
22911 }
22912 X = DAG.getBitcast(TmpCastVT, X);
22913 if (!NeedZExt && !TmpZext)
22914 return X;
22915 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22916 DAG.getConstant(0, DL, VecVT), X,
22917 DAG.getVectorIdxConstant(0, DL));
22918 };
22919
22920 SDValue Cmp;
22921 if (IsOrXorXorTreeCCZero) {
22922 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22923 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22924 // Use 2 vector equality compares and 'and' the results before doing a
22925 // MOVMSK.
22926 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22927 } else {
22928 SDValue VecX = ScalarToVector(X);
22929 SDValue VecY = ScalarToVector(Y);
22930 if (VecVT != CmpVT) {
22931 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22932 } else if (HasPT) {
22933 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22934 } else {
22935 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22936 }
22937 }
22938 // AVX512 should emit a setcc that will lower to kortest.
22939 if (VecVT != CmpVT) {
22940 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22941 : CmpVT == MVT::v32i1 ? MVT::i32
22942 : MVT::i16;
22943 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22944 DAG.getConstant(0, DL, KRegVT), CC);
22945 }
22946 if (HasPT) {
22947 SDValue BCCmp =
22948 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22949 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22950 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22951 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22952 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22953 }
22954 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22955 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22956 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22957 assert(Cmp.getValueType() == MVT::v16i8 &&
22958 "Non 128-bit vector on pre-SSE41 target");
22959 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22960 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22961 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22962 }
22963
22964 return SDValue();
22965}
22966
22967/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22968/// style scalarized (associative) reduction patterns. Partial reductions
22969/// are supported when the pointer SrcMask is non-null.
22970/// TODO - move this to SelectionDAG?
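/// For example, a scalarized OR reduction such as
///   or (extractelt V, 0), (or (extractelt V, 1), (extractelt V, 2))
/// is matched here, with V collected into SrcOps and, for partial reductions,
/// the used lanes recorded in SrcMask.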
22971static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22972 SmallVectorImpl<SDValue> &SrcOps,
22973 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22974 SmallVector<SDValue, 8> Opnds;
22975 DenseMap<SDValue, APInt> SrcOpMap;
22976 EVT VT = MVT::Other;
22977
22978 // Recognize a special case where a vector is cast into a wide integer to
22979 // test all 0s.
22980 assert(Op.getOpcode() == unsigned(BinOp) &&
22981 "Unexpected bit reduction opcode");
22982 Opnds.push_back(Op.getOperand(0));
22983 Opnds.push_back(Op.getOperand(1));
22984
22985 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22986 SDValue I = Opnds[Slot];
22987 // BFS traverse all BinOp operands.
22988 if (I->getOpcode() == unsigned(BinOp)) {
22989 Opnds.push_back(I->getOperand(0));
22990 Opnds.push_back(I->getOperand(1));
22991 // Re-evaluate the number of nodes to be traversed.
22992 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22993 continue;
22994 }
22995
22996 // Quit if this is not an EXTRACT_VECTOR_ELT.
22997 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22998 return false;
22999
23000 // Quit if the index is not a constant.
23001 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23002 if (!Idx)
23003 return false;
23004
23005 SDValue Src = I->getOperand(0);
23006 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23007 if (M == SrcOpMap.end()) {
23008 VT = Src.getValueType();
23009 // Quit if not the same type.
23010 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23011 return false;
23012 unsigned NumElts = VT.getVectorNumElements();
23013 APInt EltCount = APInt::getZero(NumElts);
23014 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23015 SrcOps.push_back(Src);
23016 }
23017
23018 // Quit if element already used.
23019 unsigned CIdx = Idx->getZExtValue();
23020 if (M->second[CIdx])
23021 return false;
23022 M->second.setBit(CIdx);
23023 }
23024
23025 if (SrcMask) {
23026 // Collect the source partial masks.
23027 for (SDValue &SrcOp : SrcOps)
23028 SrcMask->push_back(SrcOpMap[SrcOp]);
23029 } else {
23030 // Quit if not all elements are used.
23031 for (const auto &I : SrcOpMap)
23032 if (!I.second.isAllOnes())
23033 return false;
23034 }
23035
23036 return true;
23037}
23038
23039// Helper function for comparing all bits of two vectors.
23040static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23041 ISD::CondCode CC, const APInt &OriginalMask,
23042 const X86Subtarget &Subtarget,
23043 SelectionDAG &DAG, X86::CondCode &X86CC) {
23044 EVT VT = LHS.getValueType();
23045 unsigned ScalarSize = VT.getScalarSizeInBits();
23046 if (OriginalMask.getBitWidth() != ScalarSize) {
23047 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23048 return SDValue();
23049 }
23050
23051 // Quit if not convertible to a legal scalar or 128/256-bit vector.
23052 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23053 return SDValue();
23054
23055 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23056 if (VT.isFloatingPoint())
23057 return SDValue();
23058
23059 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23060 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23061
23062 APInt Mask = OriginalMask;
23063
23064 auto MaskBits = [&](SDValue Src) {
23065 if (Mask.isAllOnes())
23066 return Src;
23067 EVT SrcVT = Src.getValueType();
23068 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23069 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23070 };
23071
23072 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23073 if (VT.getSizeInBits() < 128) {
23074 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23075 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23076 if (IntVT != MVT::i64)
23077 return SDValue();
23078 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23079 MVT::i32, MVT::i32);
23080 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23081 MVT::i32, MVT::i32);
23082 SDValue Lo =
23083 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23084 SDValue Hi =
23085 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23086 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23087 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23088 DAG.getConstant(0, DL, MVT::i32));
23089 }
23090 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23091 DAG.getBitcast(IntVT, MaskBits(LHS)),
23092 DAG.getBitcast(IntVT, MaskBits(RHS)));
23093 }
23094
23095 // Without PTEST, a masked v2i64 or-reduction is not faster than
23096 // scalarization.
23097 bool UseKORTEST = Subtarget.useAVX512Regs();
23098 bool UsePTEST = Subtarget.hasSSE41();
23099 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23100 return SDValue();
23101
23102 // Split down to 128/256/512-bit vector.
23103 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23104
23105 // If the input vector has vector elements wider than the target test size,
23106 // then cast to <X x i64> so it will safely split.
23107 if (ScalarSize > TestSize) {
23108 if (!Mask.isAllOnes())
23109 return SDValue();
23110 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23111 LHS = DAG.getBitcast(VT, LHS);
23112 RHS = DAG.getBitcast(VT, RHS);
23113 Mask = APInt::getAllOnes(64);
23114 }
23115
23116 if (VT.getSizeInBits() > TestSize) {
23117 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23118 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23119 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23120 while (VT.getSizeInBits() > TestSize) {
23121 auto Split = DAG.SplitVector(LHS, DL);
23122 VT = Split.first.getValueType();
23123 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23124 }
23125 RHS = DAG.getAllOnesConstant(DL, VT);
23126 } else if (!UsePTEST && !KnownRHS.isZero()) {
23127 // MOVMSK Special Case:
23128 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23129 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23130 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23131 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23132 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23133 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23134 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23135 V = DAG.getSExtOrTrunc(V, DL, VT);
23136 while (VT.getSizeInBits() > TestSize) {
23137 auto Split = DAG.SplitVector(V, DL);
23138 VT = Split.first.getValueType();
23139 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23140 }
23141 V = DAG.getNOT(DL, V, VT);
23142 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23143 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23144 DAG.getConstant(0, DL, MVT::i32));
23145 } else {
23146 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23147 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23148 while (VT.getSizeInBits() > TestSize) {
23149 auto Split = DAG.SplitVector(V, DL);
23150 VT = Split.first.getValueType();
23151 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23152 }
23153 LHS = V;
23154 RHS = DAG.getConstant(0, DL, VT);
23155 }
23156 }
23157
23158 if (UseKORTEST && VT.is512BitVector()) {
23159 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23160 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23161 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23162 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23163 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23164 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23165 }
23166
23167 if (UsePTEST) {
23168 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23169 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23170 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23171 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23172 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23173 }
23174
23175 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23176 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23177 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23178 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23179 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23180 V = DAG.getNOT(DL, V, MaskVT);
23181 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23182 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23183 DAG.getConstant(0, DL, MVT::i32));
23184}
23185
23186 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
23187// to CMP(MOVMSK(PCMPEQB(X,Y))).
23188static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23189 ISD::CondCode CC, const SDLoc &DL,
23190 const X86Subtarget &Subtarget,
23191 SelectionDAG &DAG,
23192 X86::CondCode &X86CC) {
23193 SDValue Op = OrigLHS;
23194
23195 bool CmpNull;
23196 APInt Mask;
23197 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23198 CmpNull = isNullConstant(OrigRHS);
23199 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23200 return SDValue();
23201
23202 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23203 return SDValue();
23204
23205 // Check whether we're masking/truncating an OR-reduction result, in which
23206 // case track the masked bits.
23207 // TODO: Add CmpAllOnes support.
23208 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23209 if (CmpNull) {
23210 switch (Op.getOpcode()) {
23211 case ISD::TRUNCATE: {
23212 SDValue Src = Op.getOperand(0);
23213 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23214 Op.getScalarValueSizeInBits());
23215 Op = Src;
23216 break;
23217 }
23218 case ISD::AND: {
23219 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23220 Mask = Cst->getAPIntValue();
23221 Op = Op.getOperand(0);
23222 }
23223 break;
23224 }
23225 }
23226 }
23227 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23228 CC = ISD::SETEQ;
23229 CmpNull = true;
23230 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23231 } else {
23232 return SDValue();
23233 }
23234
23235 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23236
23237 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23238 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23239 SmallVector<SDValue, 8> VecIns;
23240 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23241 EVT VT = VecIns[0].getValueType();
23242 assert(llvm::all_of(VecIns,
23243 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23244 "Reduction source vector mismatch");
23245
23246 // Quit if not splittable to scalar/128/256/512-bit vector.
23247 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23248 return SDValue();
23249
23250 // If more than one full vector is evaluated, AND/OR them first before
23251 // PTEST.
23252 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23253 Slot += 2, e += 1) {
23254 // Each iteration will AND/OR 2 nodes and append the result until there is
23255 // only 1 node left, i.e. the final value of all vectors.
23256 SDValue LHS = VecIns[Slot];
23257 SDValue RHS = VecIns[Slot + 1];
23258 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23259 }
23260
23261 return LowerVectorAllEqual(DL, VecIns.back(),
23262 CmpNull ? DAG.getConstant(0, DL, VT)
23263 : DAG.getAllOnesConstant(DL, VT),
23264 CC, Mask, Subtarget, DAG, X86CC);
23265 }
23266
23267 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23268 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23269 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23270 ISD::NodeType BinOp;
23271 if (SDValue Match =
23272 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23273 EVT MatchVT = Match.getValueType();
23274 return LowerVectorAllEqual(DL, Match,
23275 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23276 : DAG.getAllOnesConstant(DL, MatchVT),
23277 CC, Mask, Subtarget, DAG, X86CC);
23278 }
23279 }
23280
23281 if (Mask.isAllOnes()) {
23282 assert(!Op.getValueType().isVector() &&
23283 "Illegal vector type for reduction pattern");
23284 SDValue Src = peekThroughBitcasts(Op);
23285 if (Src.getValueType().isFixedLengthVector() &&
23286 Src.getValueType().getScalarType() == MVT::i1) {
23287 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23288 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23289 if (Src.getOpcode() == ISD::SETCC) {
23290 SDValue LHS = Src.getOperand(0);
23291 SDValue RHS = Src.getOperand(1);
23292 EVT LHSVT = LHS.getValueType();
23293 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23294 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23295 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
23296 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23297 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23298 X86CC);
23299 }
23300 }
23301 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23302 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23303 // Peek through truncation, mask the LSB and compare against zero/LSB.
23304 if (Src.getOpcode() == ISD::TRUNCATE) {
23305 SDValue Inner = Src.getOperand(0);
23306 EVT InnerVT = Inner.getValueType();
23307 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
23308 unsigned BW = InnerVT.getScalarSizeInBits();
23309 APInt SrcMask = APInt(BW, 1);
23310 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23311 return LowerVectorAllEqual(DL, Inner,
23312 DAG.getConstant(Cmp, DL, InnerVT), CC,
23313 SrcMask, Subtarget, DAG, X86CC);
23314 }
23315 }
23316 }
23317 }
23318
23319 return SDValue();
23320}
23321
23322/// Return true if \c Op has a use that doesn't just read flags.
23323static bool hasNonFlagsUse(SDValue Op) {
23324 for (SDUse &Use : Op->uses()) {
23325 SDNode *User = Use.getUser();
23326 unsigned UOpNo = Use.getOperandNo();
23327 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23328 // Look past truncate.
23329 UOpNo = User->use_begin()->getOperandNo();
23330 User = User->use_begin()->getUser();
23331 }
23332
23333 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23334 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23335 return true;
23336 }
23337 return false;
23338}
23339
23340// Transform to an x86-specific ALU node with flags if there is a chance of
23341// using an RMW op or only the flags are used. Otherwise, leave
23342// the node alone and emit a 'cmp' or 'test' instruction.
23343static bool isProfitableToUseFlagOp(SDValue Op) {
23344 for (SDNode *U : Op->users())
23345 if (U->getOpcode() != ISD::CopyToReg &&
23346 U->getOpcode() != ISD::SETCC &&
23347 U->getOpcode() != ISD::STORE)
23348 return false;
23349
23350 return true;
23351}
23352
23353/// Emit nodes that will be selected as "test Op0,Op0", or something
23354/// equivalent.
23355static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23356 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23357 // CF and OF aren't always set the way we want. Determine which
23358 // of these we need.
23359 bool NeedCF = false;
23360 bool NeedOF = false;
23361 switch (X86CC) {
23362 default: break;
23363 case X86::COND_A: case X86::COND_AE:
23364 case X86::COND_B: case X86::COND_BE:
23365 NeedCF = true;
23366 break;
23367 case X86::COND_G: case X86::COND_GE:
23368 case X86::COND_L: case X86::COND_LE:
23369 case X86::COND_O: case X86::COND_NO: {
23370 // Check if we really need to set the
23371 // Overflow flag. If NoSignedWrap is present
23372 // that is not actually needed.
23373 switch (Op->getOpcode()) {
23374 case ISD::ADD:
23375 case ISD::SUB:
23376 case ISD::MUL:
23377 case ISD::SHL:
23378 if (Op.getNode()->getFlags().hasNoSignedWrap())
23379 break;
23380 [[fallthrough]];
23381 default:
23382 NeedOF = true;
23383 break;
23384 }
23385 break;
23386 }
23387 }
23388 // See if we can use the EFLAGS value from the operand instead of
23389 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23390 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23391 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23392 // Emit a CMP with 0, which is the TEST pattern.
23393 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23394 DAG.getConstant(0, dl, Op.getValueType()));
23395 }
23396 unsigned Opcode = 0;
23397 unsigned NumOperands = 0;
23398
23399 SDValue ArithOp = Op;
23400
23401 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
23402 // which may be the result of a CAST. The variable 'Op' is the original,
23403 // non-casted value, and is the one we check when looking for possible users.
23404 switch (ArithOp.getOpcode()) {
23405 case ISD::AND:
23406 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23407 // because a TEST instruction will be better.
23408 if (!hasNonFlagsUse(Op))
23409 break;
23410
23411 [[fallthrough]];
23412 case ISD::ADD:
23413 case ISD::SUB:
23414 case ISD::OR:
23415 case ISD::XOR:
23416 if (!isProfitableToUseFlagOp(Op))
23417 break;
23418
23419 // Otherwise use a regular EFLAGS-setting instruction.
23420 switch (ArithOp.getOpcode()) {
23421 // clang-format off
23422 default: llvm_unreachable("unexpected operator!");
23423 case ISD::ADD: Opcode = X86ISD::ADD; break;
23424 case ISD::SUB: Opcode = X86ISD::SUB; break;
23425 case ISD::XOR: Opcode = X86ISD::XOR; break;
23426 case ISD::AND: Opcode = X86ISD::AND; break;
23427 case ISD::OR: Opcode = X86ISD::OR; break;
23428 // clang-format on
23429 }
23430
23431 NumOperands = 2;
23432 break;
23433 case X86ISD::ADD:
23434 case X86ISD::SUB:
23435 case X86ISD::OR:
23436 case X86ISD::XOR:
23437 case X86ISD::AND:
23438 return SDValue(Op.getNode(), 1);
23439 case ISD::SSUBO:
23440 case ISD::USUBO: {
23441 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23442 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23443 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23444 Op->getOperand(1)).getValue(1);
23445 }
23446 default:
23447 break;
23448 }
23449
23450 if (Opcode == 0) {
23451 // Emit a CMP with 0, which is the TEST pattern.
23452 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23453 DAG.getConstant(0, dl, Op.getValueType()));
23454 }
23455 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23456 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23457
23458 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23459 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23460 return SDValue(New.getNode(), 1);
23461}
23462
23463/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23464/// equivalent.
23465static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23466 const SDLoc &dl, SelectionDAG &DAG,
23467 const X86Subtarget &Subtarget) {
23468 if (isNullConstant(Op1))
23469 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23470
23471 EVT CmpVT = Op0.getValueType();
23472
23473 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23474 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23475
23476 // Only promote the compare up to I32 if it is a 16 bit operation
23477 // with an immediate. 16 bit immediates are to be avoided unless the target
23478 // isn't slowed down by length changing prefixes, we're optimizing for
23479 // codesize or the comparison is with a folded load.
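 // For example, 'cmp $0x1234, %ax' puts a 0x66 operand-size prefix in front of an
 // instruction with a 16-bit immediate (a length-changing prefix), which stalls the
 // decoders on many Intel cores; widening to 'cmp $0x1234, %eax' avoids the LCP stall.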
23480 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23481 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23482 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23483 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23484 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23485 // Don't do this if the immediate can fit in 8-bits.
23486 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23487 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23488 unsigned ExtendOp =
23489 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23490 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23491 // For equality comparisons try to use SIGN_EXTEND if the input was
23492 // truncate from something with enough sign bits.
23493 if (Op0.getOpcode() == ISD::TRUNCATE) {
23494 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23495 ExtendOp = ISD::SIGN_EXTEND;
23496 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23497 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23498 ExtendOp = ISD::SIGN_EXTEND;
23499 }
23500 }
23501
23502 CmpVT = MVT::i32;
23503 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23504 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23505 }
23506 }
23507
23508 // Try to shrink i64 compares if the input has enough zero bits.
23509 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23510 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23511 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23512 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23513 CmpVT = MVT::i32;
23514 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23515 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23516 }
23517
23518 // Try to shrink all i64 compares if the inputs are representable as signed
23519 // i32.
23520 if (CmpVT == MVT::i64 &&
23521 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23522 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23523 CmpVT = MVT::i32;
23524 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23525 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23526 }
23527
23528 // 0-x == y --> x+y == 0
23529 // 0-x != y --> x+y != 0
23530 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23531 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23532 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23533 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23534 return Add.getValue(1);
23535 }
23536
23537 // x == 0-y --> x+y == 0
23538 // x != 0-y --> x+y != 0
23539 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23540 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23541 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23542 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23543 return Add.getValue(1);
23544 }
23545
23546 // If we already have an XOR of the ops, use that to check for equality.
23547 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23548 unsigned X86Opc = X86ISD::SUB;
23549 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23550 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23551 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23552 X86Opc = X86ISD::XOR;
23553
23554 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23555 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23556 return CmpOp.getValue(1);
23557}
23558
23559bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
23560 EVT VT) const {
23561 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
23562}
23563
23564bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23565 SDNode *N, SDValue, SDValue IntPow2) const {
23566 if (N->getOpcode() == ISD::FDIV)
23567 return true;
23568
23569 EVT FPVT = N->getValueType(0);
23570 EVT IntVT = IntPow2.getValueType();
23571
23572 // This indicates a non-free bitcast.
23573 // TODO: This is probably overly conservative as we will need to scale the
23574 // integer vector anyways for the int->fp cast.
23575 if (FPVT.isVector() &&
23576 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23577 return false;
23578
23579 return true;
23580}
23581
23582/// Check if replacement of SQRT with RSQRT should be disabled.
23583bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23584 EVT VT = Op.getValueType();
23585
23586 // We don't need to replace SQRT with RSQRT for half type.
23587 if (VT.getScalarType() == MVT::f16)
23588 return true;
23589
23590 // We never want to use both SQRT and RSQRT instructions for the same input.
23591 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23592 return false;
23593
23594 if (VT.isVector())
23595 return Subtarget.hasFastVectorFSQRT();
23596 return Subtarget.hasFastScalarFSQRT();
23597}
23598
23599/// The minimum architected relative accuracy is 2^-12. We need one
23600/// Newton-Raphson step to have a good float result (24 bits of precision).
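/// One Newton-Raphson step refines an rsqrt estimate E of 1/sqrt(X) as
/// E' = E * (1.5 - 0.5 * X * E * E), roughly doubling the bits of accuracy.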
23601SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23602 SelectionDAG &DAG, int Enabled,
23603 int &RefinementSteps,
23604 bool &UseOneConstNR,
23605 bool Reciprocal) const {
23606 SDLoc DL(Op);
23607 EVT VT = Op.getValueType();
23608
23609 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23610 // It is likely not profitable to do this for f64 because a double-precision
23611 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23612 // instructions: convert to single, rsqrtss, convert back to double, refine
23613 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23614 // along with FMA, this could be a throughput win.
23615 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23616 // after legalize types.
23617 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23618 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23619 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23620 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23621 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23622 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23623 RefinementSteps = 1;
23624
23625 UseOneConstNR = false;
23626 // There is no FSQRT for 512-bits, but there is RSQRT14.
23627 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23628 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23629 if (RefinementSteps == 0 && !Reciprocal)
23630 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23631 return Estimate;
23632 }
23633
23634 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23635 Subtarget.hasFP16()) {
23636 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23637 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23638 RefinementSteps = 0;
23639
23640 if (VT == MVT::f16) {
23641 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23642 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23643 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23644 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23645 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23646 }
23647
23648 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23649 }
23650 return SDValue();
23651}
23652
23653/// The minimum architected relative accuracy is 2^-12. We need one
23654/// Newton-Raphson step to have a good float result (24 bits of precision).
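/// One Newton-Raphson step refines a reciprocal estimate E of 1/X as
/// E' = E * (2.0 - X * E), roughly doubling the bits of accuracy.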
23655SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23656 int Enabled,
23657 int &RefinementSteps) const {
23658 SDLoc DL(Op);
23659 EVT VT = Op.getValueType();
23660
23661 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23662 // It is likely not profitable to do this for f64 because a double-precision
23663 // reciprocal estimate with refinement on x86 prior to FMA requires
23664 // 15 instructions: convert to single, rcpss, convert back to double, refine
23665 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23666 // along with FMA, this could be a throughput win.
23667
23668 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23669 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23670 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23671 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23672 // Enable estimate codegen with 1 refinement step for vector division.
23673 // Scalar division estimates are disabled because they break too much
23674 // real-world code. These defaults are intended to match GCC behavior.
23675 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23676 return SDValue();
23677
23678 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23679 RefinementSteps = 1;
23680
23681 // There is no FSQRT for 512-bits, but there is RCP14.
23682 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23683 return DAG.getNode(Opcode, DL, VT, Op);
23684 }
23685
23686 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23687 Subtarget.hasFP16()) {
23688 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23689 RefinementSteps = 0;
23690
23691 if (VT == MVT::f16) {
23692 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23693 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23694 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23695 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23696 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23697 }
23698
23699 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23700 }
23701 return SDValue();
23702}
23703
23704/// If we have at least two divisions that use the same divisor, convert to
23705/// multiplication by a reciprocal. This may need to be adjusted for a given
23706/// CPU if a division's cost is not at least twice the cost of a multiplication.
23707/// This is because we still need one division to calculate the reciprocal and
23708/// then we need two multiplies by that reciprocal as replacements for the
23709/// original divisions.
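/// For example, a/d + b/d becomes r = 1.0/d; a*r + b*r: one divide and two
/// multiplies replace the two original divides.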
23710unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23711 return 2;
23712}
23713
23714SDValue
23715X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23716 SelectionDAG &DAG,
23717 SmallVectorImpl<SDNode *> &Created) const {
23718 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23719 if (isIntDivCheap(N->getValueType(0), Attr))
23720 return SDValue(N,0); // Lower SDIV as SDIV
23721
23722 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23723 "Unexpected divisor!");
23724
23725 // Only perform this transform if CMOV is supported otherwise the select
23726 // below will become a branch.
23727 if (!Subtarget.canUseCMOV())
23728 return SDValue();
23729
23730 // fold (sdiv X, pow2)
23731 EVT VT = N->getValueType(0);
23732 // FIXME: Support i8.
23733 if (VT != MVT::i16 && VT != MVT::i32 &&
23734 !(Subtarget.is64Bit() && VT == MVT::i64))
23735 return SDValue();
23736
23737 // If the divisor is 2 or -2, the default expansion is better.
23738 if (Divisor == 2 ||
23739 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23740 return SDValue();
23741
23742 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23743}
23744
23745/// Result of 'and' is compared against zero. Change to a BT node if possible.
23746/// Returns the BT node and the condition code needed to use it.
23747static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23748 SelectionDAG &DAG, X86::CondCode &X86CC) {
23749 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23750 SDValue Op0 = And.getOperand(0);
23751 SDValue Op1 = And.getOperand(1);
23752 if (Op0.getOpcode() == ISD::TRUNCATE)
23753 Op0 = Op0.getOperand(0);
23754 if (Op1.getOpcode() == ISD::TRUNCATE)
23755 Op1 = Op1.getOperand(0);
23756
23757 SDValue Src, BitNo;
23758 if (Op1.getOpcode() == ISD::SHL)
23759 std::swap(Op0, Op1);
23760 if (Op0.getOpcode() == ISD::SHL) {
23761 if (isOneConstant(Op0.getOperand(0))) {
23762 // If we looked past a truncate, check that it's only truncating away
23763 // known zeros.
23764 unsigned BitWidth = Op0.getValueSizeInBits();
23765 unsigned AndBitWidth = And.getValueSizeInBits();
23766 if (BitWidth > AndBitWidth) {
23767 KnownBits Known = DAG.computeKnownBits(Op0);
23768 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23769 return SDValue();
23770 }
23771 Src = Op1;
23772 BitNo = Op0.getOperand(1);
23773 }
23774 } else if (Op1.getOpcode() == ISD::Constant) {
23775 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23776 uint64_t AndRHSVal = AndRHS->getZExtValue();
23777 SDValue AndLHS = Op0;
23778
23779 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23780 Src = AndLHS.getOperand(0);
23781 BitNo = AndLHS.getOperand(1);
23782 } else {
23783 // Use BT if the immediate can't be encoded in a TEST instruction or we
23784 // are optimizing for size and the immediate won't fit in a byte.
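 // For example, (X & (1ULL << 40)) != 0 has no TEST encoding (TEST only takes a
 // sign-extended 32-bit immediate), but BT can test bit 40 directly.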
23785 bool OptForSize = DAG.shouldOptForSize();
23786 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23787 isPowerOf2_64(AndRHSVal)) {
23788 Src = AndLHS;
23789 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23790 Src.getValueType());
23791 }
23792 }
23793 }
23794
23795 // No patterns found, give up.
23796 if (!Src.getNode())
23797 return SDValue();
23798
23799 // Remove any bit flip.
23800 if (isBitwiseNot(Src)) {
23801 Src = Src.getOperand(0);
23802 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23803 }
23804
23805 // Attempt to create the X86ISD::BT node.
23806 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23807 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23808 return BT;
23809 }
23810
23811 return SDValue();
23812}
23813
23814// Check if pre-AVX condcode can be performed by a single FCMP op.
23815static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23816 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23817}
23818
23819/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23820/// CMPs.
23821static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23822 SDValue &Op1, bool &IsAlwaysSignaling) {
23823 unsigned SSECC;
23824 bool Swap = false;
23825
23826 // SSE Condition code mapping:
23827 // 0 - EQ
23828 // 1 - LT
23829 // 2 - LE
23830 // 3 - UNORD
23831 // 4 - NEQ
23832 // 5 - NLT
23833 // 6 - NLE
23834 // 7 - ORD
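 // Predicates 8 (EQ_UQ) and 12 (NEQ_OQ), used below for SETUEQ/SETONE, are
 // AVX-only encodings; without AVX, callers avoid them (see cheapX86FSETCC_SSE)
 // and emit two compares plus a logic op instead.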
23835 switch (SetCCOpcode) {
23836 // clang-format off
23837 default: llvm_unreachable("Unexpected SETCC condition");
23838 case ISD::SETOEQ:
23839 case ISD::SETEQ: SSECC = 0; break;
23840 case ISD::SETOGT:
23841 case ISD::SETGT: Swap = true; [[fallthrough]];
23842 case ISD::SETLT:
23843 case ISD::SETOLT: SSECC = 1; break;
23844 case ISD::SETOGE:
23845 case ISD::SETGE: Swap = true; [[fallthrough]];
23846 case ISD::SETLE:
23847 case ISD::SETOLE: SSECC = 2; break;
23848 case ISD::SETUO: SSECC = 3; break;
23849 case ISD::SETUNE:
23850 case ISD::SETNE: SSECC = 4; break;
23851 case ISD::SETULE: Swap = true; [[fallthrough]];
23852 case ISD::SETUGE: SSECC = 5; break;
23853 case ISD::SETULT: Swap = true; [[fallthrough]];
23854 case ISD::SETUGT: SSECC = 6; break;
23855 case ISD::SETO: SSECC = 7; break;
23856 case ISD::SETUEQ: SSECC = 8; break;
23857 case ISD::SETONE: SSECC = 12; break;
23858 // clang-format on
23859 }
23860 if (Swap)
23861 std::swap(Op0, Op1);
23862
23863 switch (SetCCOpcode) {
23864 default:
23865 IsAlwaysSignaling = true;
23866 break;
23867 case ISD::SETEQ:
23868 case ISD::SETOEQ:
23869 case ISD::SETUEQ:
23870 case ISD::SETNE:
23871 case ISD::SETONE:
23872 case ISD::SETUNE:
23873 case ISD::SETO:
23874 case ISD::SETUO:
23875 IsAlwaysSignaling = false;
23876 break;
23877 }
23878
23879 return SSECC;
23880}
23881
23882/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23883/// concatenate the result back.
23884static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23885 SelectionDAG &DAG, const SDLoc &dl) {
23886 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23887 "Unsupported VTs!");
23888 SDValue CC = DAG.getCondCode(Cond);
23889
23890 // Extract the LHS Lo/Hi vectors
23891 SDValue LHS1, LHS2;
23892 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23893
23894 // Extract the RHS Lo/Hi vectors
23895 SDValue RHS1, RHS2;
23896 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23897
23898 // Issue the operation on the smaller types and concatenate the result back
23899 EVT LoVT, HiVT;
23900 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23901 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23902 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23903 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23904}
23905
23906static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23907 SelectionDAG &DAG) {
23908 SDValue Op0 = Op.getOperand(0);
23909 SDValue Op1 = Op.getOperand(1);
23910 SDValue CC = Op.getOperand(2);
23911 MVT VT = Op.getSimpleValueType();
23912 assert(VT.getVectorElementType() == MVT::i1 &&
23913 "Cannot set masked compare for this operation");
23914
23915 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23916
23917 // Prefer SETGT over SETLT.
23918 if (SetCCOpcode == ISD::SETLT) {
23919 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23920 std::swap(Op0, Op1);
23921 }
23922
23923 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23924}
23925
23926/// Given a buildvector constant, return a new vector constant with each element
23927/// incremented or decremented. If incrementing or decrementing would result in
23928/// unsigned overflow or underflow, or this is not a simple vector constant,
23929/// return an empty value.
23930static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23931 bool NSW) {
23932 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23933 if (!BV || !V.getValueType().isSimple())
23934 return SDValue();
23935
23936 MVT VT = V.getSimpleValueType();
23937 MVT EltVT = VT.getVectorElementType();
23938 unsigned NumElts = VT.getVectorNumElements();
23939 SmallVector<SDValue, 8> NewVecC;
23940 SDLoc DL(V);
23941 for (unsigned i = 0; i < NumElts; ++i) {
23942 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23943 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23944 return SDValue();
23945
23946 // Avoid overflow/underflow.
23947 const APInt &EltC = Elt->getAPIntValue();
23948 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23949 return SDValue();
23950 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23951 (!IsInc && EltC.isMinSignedValue())))
23952 return SDValue();
23953
23954 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23955 }
23956
23957 return DAG.getBuildVector(VT, DL, NewVecC);
23958}
23959
23960/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23961/// Op0 u<= Op1:
23962/// t = psubus Op0, Op1
23963/// pcmpeq t, <0..0>
23964static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23965 ISD::CondCode Cond, const SDLoc &dl,
23966 const X86Subtarget &Subtarget,
23967 SelectionDAG &DAG) {
23968 if (!Subtarget.hasSSE2())
23969 return SDValue();
23970
23971 MVT VET = VT.getVectorElementType();
23972 if (VET != MVT::i8 && VET != MVT::i16)
23973 return SDValue();
23974
23975 switch (Cond) {
23976 default:
23977 return SDValue();
23978 case ISD::SETULT: {
23979 // If the comparison is against a constant we can turn this into a
23980 // setule. With psubus, setule does not require a swap. This is
23981 // beneficial because the constant in the register is no longer
23982 // clobbered as the destination, so it can be hoisted out of a loop.
23983 // Only do this pre-AVX since vpcmp* is no longer destructive.
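 // For example, (x u< 5) becomes (x u<= 4), which lowers below to
 // pcmpeq(psubus(x, splat(4)), 0).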
23984 if (Subtarget.hasAVX())
23985 return SDValue();
23986 SDValue ULEOp1 =
23987 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23988 if (!ULEOp1)
23989 return SDValue();
23990 Op1 = ULEOp1;
23991 break;
23992 }
23993 case ISD::SETUGT: {
23994 // If the comparison is against a constant, we can turn this into a setuge.
23995 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23996 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23997 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23998 SDValue UGEOp1 =
23999 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24000 if (!UGEOp1)
24001 return SDValue();
24002 Op1 = Op0;
24003 Op0 = UGEOp1;
24004 break;
24005 }
24006 // Psubus is better than flip-sign because it requires no inversion.
24007 case ISD::SETUGE:
24008 std::swap(Op0, Op1);
24009 break;
24010 case ISD::SETULE:
24011 break;
24012 }
24013
24014 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24015 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24016 DAG.getConstant(0, dl, VT));
24017}
24018
24019static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24020 SelectionDAG &DAG) {
24021 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24022 Op.getOpcode() == ISD::STRICT_FSETCCS;
24023 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24024 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24025 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24026 MVT VT = Op->getSimpleValueType(0);
24027 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24028 MVT OpVT = Op0.getSimpleValueType();
24029 SDLoc dl(Op);
24030
24031 if (OpVT.isFloatingPoint()) {
24032 MVT EltVT = OpVT.getVectorElementType();
24033 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24034 EltVT == MVT::f64);
24035
24036 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24037 if (isSoftF16(EltVT, Subtarget)) {
24038 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24039 return SDValue();
24040
24041 // Break 256-bit FP vector compare into smaller ones.
24042 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24043 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24044
24045 // Break 512-bit FP vector compare into smaller ones.
24046 if (OpVT.is512BitVector())
24047 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24048
24049 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24050 if (IsStrict) {
24051 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24052 {Chain, Op0});
24053 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24054 {Chain, Op1});
24055 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24056 {Chain, Op0, Op1, CC});
24057 }
24058 MVT DVT = VT.getVectorElementType() == MVT::i16
24059 ? VT.changeVectorElementType(MVT::i32)
24060 : VT;
24061 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24062 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24063 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24064 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24065 }
24066
24067 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24068
24069 // If we have a strict compare with a vXi1 result and the input is 128/256
24070 // bits we can't use a masked compare unless we have VLX. If we use a wider
24071 // compare like we do for non-strict, we might trigger spurious exceptions
24072 // from the upper elements. Instead emit an AVX compare and convert to mask.
24073 unsigned Opc;
24074 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24075 (!IsStrict || Subtarget.hasVLX() ||
24076 Op0.getSimpleValueType().is512BitVector())) {
24077#ifndef NDEBUG
24078 unsigned Num = VT.getVectorNumElements();
24079 assert(Num <= 16 ||
24080 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24081#endif
24082 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24083 } else {
24084 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24085 // The SSE/AVX packed FP comparison nodes are defined with a
24086 // floating-point vector result that matches the operand type. This allows
24087 // them to work with an SSE1 target (integer vector types are not legal).
24088 VT = Op0.getSimpleValueType();
24089 }
24090
24091 SDValue Cmp;
24092 bool IsAlwaysSignaling;
24093 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24094 if (!Subtarget.hasAVX()) {
24095 // TODO: We could use following steps to handle a quiet compare with
24096 // signaling encodings.
24097 // 1. Get ordered masks from a quiet ISD::SETO
24098 // 2. Use the masks to mask potential unordered elements in operand A, B
24099 // 3. Get the compare results of masked A, B
24100 // 4. Calculating final result using the mask and result from 3
24101 // But currently, we just fall back to scalar operations.
24102 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24103 return SDValue();
24104
24105 // Insert an extra signaling instruction to raise exception.
24106 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24107 SDValue SignalCmp = DAG.getNode(
24108 Opc, dl, {VT, MVT::Other},
24109 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24110 // FIXME: It seems we need to update the flags of all new strict nodes.
24111 // Otherwise, mayRaiseFPException in MI will return false due to
24112 // NoFPExcept = false by default. However, I didn't find it in other
24113 // patches.
24114 SignalCmp->setFlags(Op->getFlags());
24115 Chain = SignalCmp.getValue(1);
24116 }
24117
24118 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24119 // emit two comparisons and a logic op to tie them together.
24120 if (!cheapX86FSETCC_SSE(Cond)) {
24121 // LLVM predicate is SETUEQ or SETONE.
24122 unsigned CC0, CC1;
24123 unsigned CombineOpc;
24124 if (Cond == ISD::SETUEQ) {
24125 CC0 = 3; // UNORD
24126 CC1 = 0; // EQ
24127 CombineOpc = X86ISD::FOR;
24128 } else {
24129 assert(Cond == ISD::SETONE);
24130 CC0 = 7; // ORD
24131 CC1 = 4; // NEQ
24132 CombineOpc = X86ISD::FAND;
24133 }
24134
24135 SDValue Cmp0, Cmp1;
24136 if (IsStrict) {
24137 Cmp0 = DAG.getNode(
24138 Opc, dl, {VT, MVT::Other},
24139 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24140 Cmp1 = DAG.getNode(
24141 Opc, dl, {VT, MVT::Other},
24142 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24143 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24144 Cmp1.getValue(1));
24145 } else {
24146 Cmp0 = DAG.getNode(
24147 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24148 Cmp1 = DAG.getNode(
24149 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24150 }
24151 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24152 } else {
24153 if (IsStrict) {
24154 Cmp = DAG.getNode(
24155 Opc, dl, {VT, MVT::Other},
24156 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24157 Chain = Cmp.getValue(1);
24158 } else
24159 Cmp = DAG.getNode(
24160 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24161 }
24162 } else {
24163 // Handle all other FP comparisons here.
24164 if (IsStrict) {
24165 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24166 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24167 Cmp = DAG.getNode(
24168 Opc, dl, {VT, MVT::Other},
24169 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24170 Chain = Cmp.getValue(1);
24171 } else
24172 Cmp = DAG.getNode(
24173 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24174 }
24175
24176 if (VT.getFixedSizeInBits() >
24177 Op.getSimpleValueType().getFixedSizeInBits()) {
24178 // We emitted a compare with an XMM/YMM result. Finish converting to a
24179 // mask register using a vptestm.
24180 EVT CastVT = VT.changeVectorElementTypeToInteger();
24181 Cmp = DAG.getBitcast(CastVT, Cmp);
24182 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24183 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24184 } else {
24185 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24186 // the result type of SETCC. The bitcast is expected to be optimized
24187 // away during combining/isel.
24188 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24189 }
24190
24191 if (IsStrict)
24192 return DAG.getMergeValues({Cmp, Chain}, dl);
24193
24194 return Cmp;
24195 }
24196
24197 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24198
24199 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24200 assert(VTOp0 == Op1.getSimpleValueType() &&
24201 "Expected operands with same type!");
24202 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24203 "Invalid number of packed elements for source and destination!");
24204
24205 // The non-AVX512 code below works under the assumption that source and
24206 // destination types are the same.
24207 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24208 "Value types for source and destination must be the same!");
24209
24210 // The result is boolean, but operands are int/float
24211 if (VT.getVectorElementType() == MVT::i1) {
24212 // In AVX-512 architecture setcc returns mask with i1 elements,
24213 // but there is no compare instruction for i8 and i16 elements in KNL.
24214 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24215 "Unexpected operand type");
24216 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24217 }
24218
24219 // Lower using XOP integer comparisons.
24220 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24221 // Translate compare code to XOP PCOM compare mode.
24222 unsigned CmpMode = 0;
24223 switch (Cond) {
24224 // clang-format off
24225 default: llvm_unreachable("Unexpected SETCC condition");
24226 case ISD::SETULT:
24227 case ISD::SETLT: CmpMode = 0x00; break;
24228 case ISD::SETULE:
24229 case ISD::SETLE: CmpMode = 0x01; break;
24230 case ISD::SETUGT:
24231 case ISD::SETGT: CmpMode = 0x02; break;
24232 case ISD::SETUGE:
24233 case ISD::SETGE: CmpMode = 0x03; break;
24234 case ISD::SETEQ: CmpMode = 0x04; break;
24235 case ISD::SETNE: CmpMode = 0x05; break;
24236 // clang-format on
24237 }
24238
24239 // Are we comparing unsigned or signed integers?
24240 unsigned Opc =
24241 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24242
24243 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24244 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24245 }
24246
24247 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24248 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24249 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24250 SDValue BC0 = peekThroughBitcasts(Op0);
24251 if (BC0.getOpcode() == ISD::AND &&
24252 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24253 /*AllowUndefs=*/false)) {
24254 Cond = ISD::SETEQ;
24255 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24256 }
24257 }
24258
24259 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
24260 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24261 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24262 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24263 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24264 unsigned BitWidth = VT.getScalarSizeInBits();
24265 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24266
24267 SDValue Result = Op0.getOperand(0);
24268 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24269 DAG.getConstant(ShiftAmt, dl, VT));
24270 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24271 DAG.getConstant(BitWidth - 1, dl, VT));
24272 return Result;
24273 }
24274 }
24275
24276 // Break 256-bit integer vector compare into smaller ones.
24277 if (VT.is256BitVector() && !Subtarget.hasInt256())
24278 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24279
24280 // Break 512-bit integer vector compare into smaller ones.
24281 // TODO: Try harder to use VPCMPx + VPMOV2x?
24282 if (VT.is512BitVector())
24283 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24284
24285 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24286 // not-of-PCMPEQ:
24287 // X != INT_MIN --> X >s INT_MIN
24288 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24289 // +X != 0 --> +X >s 0
24290 APInt ConstValue;
24291 if (Cond == ISD::SETNE &&
24292 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24293 if (ConstValue.isMinSignedValue())
24294 Cond = ISD::SETGT;
24295 else if (ConstValue.isMaxSignedValue())
24296 Cond = ISD::SETLT;
24297 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24298 Cond = ISD::SETGT;
24299 }
24300
24301 // If both operands are known non-negative, then an unsigned compare is the
24302 // same as a signed compare and there's no need to flip signbits.
24303 // TODO: We could check for more general simplifications here since we're
24304 // computing known bits.
24305 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24306 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24307
24308 // Special case: Use min/max operations for unsigned compares.
24309 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24310 if (ISD::isUnsignedIntSetCC(Cond) &&
24311 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24312 TLI.isOperationLegal(ISD::UMIN, VT)) {
24313 // If we have a constant operand, increment/decrement it and change the
24314 // condition to avoid an invert.
24315 if (Cond == ISD::SETUGT) {
24316 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24317 if (SDValue UGTOp1 =
24318 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24319 Op1 = UGTOp1;
24320 Cond = ISD::SETUGE;
24321 }
24322 }
24323 if (Cond == ISD::SETULT) {
24324 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24325 if (SDValue ULTOp1 =
24326 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24327 Op1 = ULTOp1;
24328 Cond = ISD::SETULE;
24329 }
24330 }
24331 bool Invert = false;
24332 unsigned Opc;
24333 switch (Cond) {
24334 // clang-format off
24335 default: llvm_unreachable("Unexpected condition code");
24336 case ISD::SETUGT: Invert = true; [[fallthrough]];
24337 case ISD::SETULE: Opc = ISD::UMIN; break;
24338 case ISD::SETULT: Invert = true; [[fallthrough]];
24339 case ISD::SETUGE: Opc = ISD::UMAX; break;
24340 // clang-format on
24341 }
24342
24343 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24344 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24345
24346 // If the logical-not of the result is required, perform that now.
24347 if (Invert)
24348 Result = DAG.getNOT(dl, Result, VT);
24349
24350 return Result;
24351 }
24352
24353 // Try to use SUBUS and PCMPEQ.
24354 if (FlipSigns)
24355 if (SDValue V =
24356 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24357 return V;
24358
24359 // We are handling one of the integer comparisons here. Since SSE only has
24360 // GT and EQ comparisons for integer, swapping operands and multiple
24361 // operations may be required for some comparisons.
24362 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24363 : X86ISD::PCMPGT;
24364 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24365 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24366 bool Invert = Cond == ISD::SETNE ||
24367 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24368
24369 if (Swap)
24370 std::swap(Op0, Op1);
24371
24372 // Check that the operation in question is available (most are plain SSE2,
24373 // but PCMPGTQ and PCMPEQQ have different requirements).
24374 if (VT == MVT::v2i64) {
24375 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24376 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24377
24378 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24379 // the odd elements over the even elements.
24380 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24381 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24382 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24383
24384 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24385 static const int MaskHi[] = { 1, 1, 3, 3 };
24386 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24387
24388 return DAG.getBitcast(VT, Result);
24389 }
24390
24391 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24392 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24393 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24394
24395 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24396 static const int MaskHi[] = { 1, 1, 3, 3 };
24397 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24398
24399 return DAG.getBitcast(VT, Result);
24400 }
24401
24402 // If the i64 elements are sign-extended enough to be representable as i32
24403 // then we can compare the lower i32 bits and splat.
24404 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24405 DAG.ComputeNumSignBits(Op1) > 32) {
24406 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24407 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24408
24409 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24410 static const int MaskLo[] = {0, 0, 2, 2};
24411 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24412
24413 return DAG.getBitcast(VT, Result);
24414 }
24415
24416 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24417 // bits of the inputs before performing those operations. The lower
24418 // compare is always unsigned.
24419 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24420 : 0x0000000080000000ULL,
24421 dl, MVT::v2i64);
24422
24423 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24424 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24425
24426 // Cast everything to the right type.
24427 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24428 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24429
24430 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24431 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24432 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24433
24434 // Create masks for only the low parts/high parts of the 64 bit integers.
24435 static const int MaskHi[] = { 1, 1, 3, 3 };
24436 static const int MaskLo[] = { 0, 0, 2, 2 };
24437 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24438 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24439 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24440
24441 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24442 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24443
24444 if (Invert)
24445 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24446
24447 return DAG.getBitcast(VT, Result);
24448 }
24449
24450 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24451 // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
24452 // pcmpeqd + pshufd + pand.
24453 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24454
24455 // First cast everything to the right type.
24456 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24457 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24458
24459 // Do the compare.
24460 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24461
24462 // Make sure the lower and upper halves are both all-ones.
24463 static const int Mask[] = { 1, 0, 3, 2 };
24464 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24465 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24466
24467 if (Invert)
24468 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24469
24470 return DAG.getBitcast(VT, Result);
24471 }
24472 }
24473
24474 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24475 // bits of the inputs before performing those operations.
24476 if (FlipSigns) {
24477 MVT EltVT = VT.getVectorElementType();
24478 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24479 VT);
24480 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24481 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24482 }
24483
24484 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24485
24486 // If the logical-not of the result is required, perform that now.
24487 if (Invert)
24488 Result = DAG.getNOT(dl, Result, VT);
24489
24490 return Result;
24491}
24492
24493// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24494static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24495 const SDLoc &dl, SelectionDAG &DAG,
24496 const X86Subtarget &Subtarget,
24497 SDValue &X86CC) {
24498 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24499
24500 // Must be a bitcast from vXi1.
24501 if (Op0.getOpcode() != ISD::BITCAST)
24502 return SDValue();
24503
24504 Op0 = Op0.getOperand(0);
24505 MVT VT = Op0.getSimpleValueType();
24506 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24507 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24508 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24509 return SDValue();
24510
24511 X86::CondCode X86Cond;
24512 if (isNullConstant(Op1)) {
24513 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24514 } else if (isAllOnesConstant(Op1)) {
24515 // C flag is set for all ones.
24516 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24517 } else
24518 return SDValue();
24519
24520 // If the input is an AND, we can combine its operands into the KTEST.
24521 bool KTestable = false;
24522 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24523 KTestable = true;
24524 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24525 KTestable = true;
24526 if (!isNullConstant(Op1))
24527 KTestable = false;
24528 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24529 SDValue LHS = Op0.getOperand(0);
24530 SDValue RHS = Op0.getOperand(1);
24531 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24532 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24533 }
24534
24535 // If the input is an OR, we can combine its operands into the KORTEST.
24536 SDValue LHS = Op0;
24537 SDValue RHS = Op0;
24538 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24539 LHS = Op0.getOperand(0);
24540 RHS = Op0.getOperand(1);
24541 }
24542
24543 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24544 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24545}
24546
24547/// Emit flags for the given setcc condition and operands. Also returns the
24548/// corresponding X86 condition code constant in X86CC.
24549SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24550 ISD::CondCode CC, const SDLoc &dl,
24551 SelectionDAG &DAG,
24552 SDValue &X86CC) const {
24553 // Equality Combines.
24554 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24555 X86::CondCode X86CondCode;
24556
24557 // Optimize to BT if possible.
24558 // Lower (X & (1 << N)) == 0 to BT(X, N).
24559 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24560 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24561 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24562 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24563 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24564 return BT;
24565 }
24566 }
24567
24568 // Try to use PTEST/PMOVMSKB for a tree of AND/ORs compared for equality with -1/0.
24569 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24570 X86CondCode)) {
24571 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24572 return CmpZ;
24573 }
24574
24575 // Try to lower using KORTEST or KTEST.
24576 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24577 return Test;
24578
24579 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24580 // of these.
24581 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24582 // If the input is a setcc, then reuse the input setcc or use a new one
24583 // with the inverted condition.
24584 if (Op0.getOpcode() == X86ISD::SETCC) {
24585 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24586
24587 X86CC = Op0.getOperand(0);
24588 if (Invert) {
24589 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24590 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24591 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24592 }
24593
24594 return Op0.getOperand(1);
24595 }
24596 }
24597
24598 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24599 // overflow.
24600 if (isMinSignedConstant(Op1)) {
24601 EVT VT = Op0.getValueType();
24602 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24603 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24604 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24605 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24606 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24607 DAG.getConstant(0, dl, VT), Op0);
24608 return SDValue(Neg.getNode(), 1);
24609 }
24610 }
24611
24612 // Try to use the carry flag from the add in place of a separate CMP for:
24613 // (seteq (add X, -1), -1). Similar for setne.
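 // ADD X, -1 sets CF iff X != 0 (there is an unsigned carry-out), and the result
 // equals -1 iff X == 0, so SETEQ maps to COND_AE (CF clear) and SETNE to
 // COND_B (CF set).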
24614 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24615 Op0.getOperand(1) == Op1) {
24616 if (isProfitableToUseFlagOp(Op0)) {
24617 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24618
24619 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24620 Op0.getOperand(1));
24621 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24622 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24623 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24624 return SDValue(New.getNode(), 1);
24625 }
24626 }
24627 }
24628
24629 X86::CondCode CondCode =
24630 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24631 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24632
24633 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24634 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24635 return EFLAGS;
24636}
24637
24638SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24639
24640 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24641 Op.getOpcode() == ISD::STRICT_FSETCCS;
24642 MVT VT = Op->getSimpleValueType(0);
24643
24644 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24645
24646 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24647 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24648 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24649 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24650 SDLoc dl(Op);
24651 ISD::CondCode CC =
24652 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24653
24654 if (isSoftF16(Op0.getValueType(), Subtarget))
24655 return SDValue();
24656
24657 // Handle f128 first, since one possible outcome is a normal integer
24658 // comparison which gets handled by emitFlagsForSetcc.
24659 if (Op0.getValueType() == MVT::f128) {
24660 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24661 Op.getOpcode() == ISD::STRICT_FSETCCS);
24662
24663 // If softenSetCCOperands returned a scalar, use it.
24664 if (!Op1.getNode()) {
24665 assert(Op0.getValueType() == Op.getValueType() &&
24666 "Unexpected setcc expansion!");
24667 if (IsStrict)
24668 return DAG.getMergeValues({Op0, Chain}, dl);
24669 return Op0;
24670 }
24671 }
24672
24673 if (Op0.getSimpleValueType().isInteger()) {
24674 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24675 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF),
24676 // this may translate to fewer uops depending on uarch implementation. The
24677 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24678 // canonicalize to that CondCode.
24679 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24680 // encoding size - so it must either already be an i8 or i32 immediate, or it
24681 // shrinks down to that. We don't do this for any i64's to avoid additional
24682 // constant materializations.
24683 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
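// For example, with an i32 operand, (setgt X, 9) becomes (setge X, 10) and
// (setugt X, 9) becomes (setuge X, 10).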
24684 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24685 const APInt &Op1Val = Op1C->getAPIntValue();
24686 if (!Op1Val.isZero()) {
24687 // Ensure the constant+1 doesn't overflow.
24688 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24689 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24690 APInt Op1ValPlusOne = Op1Val + 1;
24691 if (Op1ValPlusOne.isSignedIntN(32) &&
24692 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24693 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24694 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24695 : ISD::CondCode::SETUGE;
24696 }
24697 }
24698 }
24699 }
24700
24701 SDValue X86CC;
24702 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24703 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24704 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24705 }
24706
24707 if (Subtarget.hasAVX10_2()) {
24708 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24709 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24710 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24711 if (Op0.getSimpleValueType() != MVT::f80) {
24712 SDValue Res = getSETCC(
24713 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24714 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24715 }
24716 }
24717 }
24718 // Handle floating point.
24719 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24720 if (CondCode == X86::COND_INVALID)
24721 return SDValue();
24722
24723 SDValue EFLAGS;
24724 if (IsStrict) {
24725 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24726 EFLAGS =
24727 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24728 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24729 Chain = EFLAGS.getValue(1);
24730 } else {
24731 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24732 }
24733
24734 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24735 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24736 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24737}
24738
24739SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24740 SDValue LHS = Op.getOperand(0);
24741 SDValue RHS = Op.getOperand(1);
24742 SDValue Carry = Op.getOperand(2);
24743 SDValue Cond = Op.getOperand(3);
24744 SDLoc DL(Op);
24745
24746 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24747 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24748
24749 // Recreate the carry if needed.
24750 EVT CarryVT = Carry.getValueType();
24751 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24752 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24753
24754 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24755 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24756 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24757}
24758
24759// This function returns three things: the arithmetic computation itself
24760// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24761// flag and the condition code define the case in which the arithmetic
24762// computation overflows.
24763static std::pair<SDValue, SDValue>
24764getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24765 assert(Op.getResNo() == 0 && "Unexpected result number!");
24766 SDValue Value, Overflow;
24767 SDValue LHS = Op.getOperand(0);
24768 SDValue RHS = Op.getOperand(1);
24769 unsigned BaseOp = 0;
24770 SDLoc DL(Op);
24771 switch (Op.getOpcode()) {
24772 default: llvm_unreachable("Unknown ovf instruction!");
24773 case ISD::SADDO:
24774 BaseOp = X86ISD::ADD;
24775 Cond = X86::COND_O;
24776 break;
24777 case ISD::UADDO:
24778 BaseOp = X86ISD::ADD;
24779 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24780 break;
24781 case ISD::SSUBO:
24782 BaseOp = X86ISD::SUB;
24783 Cond = X86::COND_O;
24784 break;
24785 case ISD::USUBO:
24786 BaseOp = X86ISD::SUB;
24787 Cond = X86::COND_B;
24788 break;
24789 case ISD::SMULO:
24790 BaseOp = X86ISD::SMUL;
24791 Cond = X86::COND_O;
24792 break;
24793 case ISD::UMULO:
24794 BaseOp = X86ISD::UMUL;
24795 Cond = X86::COND_O;
24796 break;
24797 }
24798
24799 if (BaseOp) {
24800 // Also sets EFLAGS.
24801 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24802 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24803 Overflow = Value.getValue(1);
24804 }
24805
24806 return std::make_pair(Value, Overflow);
24807}
24808
24809static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24810 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24811 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24812 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24813 // has only one use.
24814 SDLoc DL(Op);
24815 X86::CondCode Cond;
24816 SDValue Value, Overflow;
24817 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24818
24819 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24820 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24821 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24822}
24823
24824/// Return true if opcode is an X86 logical comparison.
24825static bool isX86LogicalCmp(SDValue Op) {
24826 unsigned Opc = Op.getOpcode();
24827 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24828 Opc == X86ISD::FCMP)
24829 return true;
24830 if (Op.getResNo() == 1 &&
24831 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24832 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24833 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24834 return true;
24835
24836 return false;
24837}
24838
24839static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24840 if (V.getOpcode() != ISD::TRUNCATE)
24841 return false;
24842
24843 SDValue VOp0 = V.getOperand(0);
24844 unsigned InBits = VOp0.getValueSizeInBits();
24845 unsigned Bits = V.getValueSizeInBits();
24846 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24847}
24848
24849// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24850static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24851 unsigned X86CC, const SDLoc &DL,
24852 SelectionDAG &DAG,
24853 const X86Subtarget &Subtarget) {
24854 EVT CmpVT = CmpVal.getValueType();
24855 EVT VT = LHS.getValueType();
24856 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24857 return SDValue();
24858
24859 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24860 isOneConstant(CmpVal.getOperand(1))) {
24861 auto SplatLSB = [&](EVT SplatVT) {
24862 // We need a mask of all zeros or all ones with the same size as the other
24863 // operands.
24864 SDValue Neg = CmpVal;
24865 if (CmpVT.bitsGT(SplatVT))
24866 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24867 else if (CmpVT.bitsLT(SplatVT))
24868 Neg = DAG.getNode(
24869 ISD::AND, DL, SplatVT,
24870 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24871 DAG.getConstant(1, DL, SplatVT));
24872 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24873 };
24874
24875 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24876 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24877 return SplatLSB(VT);
24878
24879 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24880 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24881 isa<ConstantSDNode>(RHS)) {
24882 SDValue Mask = SplatLSB(VT);
24883 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24884 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24885 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24886 }
24887
24888 SDValue Src1, Src2;
24889 auto isIdentityPatternZero = [&]() {
24890 switch (RHS.getOpcode()) {
24891 default:
24892 break;
24893 case ISD::OR:
24894 case ISD::XOR:
24895 case ISD::ADD:
24896 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24897 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24898 Src2 = LHS;
24899 return true;
24900 }
24901 break;
24902 case ISD::SHL:
24903 case ISD::SRA:
24904 case ISD::SRL:
24905 case ISD::SUB:
24906 if (RHS.getOperand(0) == LHS) {
24907 Src1 = RHS.getOperand(1);
24908 Src2 = LHS;
24909 return true;
24910 }
24911 break;
24912 }
24913 return false;
24914 };
24915
24916 auto isIdentityPatternOnes = [&]() {
24917 switch (LHS.getOpcode()) {
24918 default:
24919 break;
24920 case ISD::AND:
24921 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24922 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24923 Src2 = RHS;
24924 return true;
24925 }
24926 break;
24927 }
24928 return false;
24929 };
24930
24931 // Convert 'identity' patterns (iff X is 0 or 1):
24932 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24933 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24934 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24935 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24936 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24937 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24938 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24939 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24940 SDValue Mask = SplatLSB(Src1.getValueType());
24941 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24942 Src1); // Mask & z
24943 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24944 }
24945 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24946 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24947 SDValue Mask = SplatLSB(VT);
24948 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24949 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24950 }
24951 }
24952
24953 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24954 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24955 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24956 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24957
24958 // 'X - 1' sets the carry flag if X == 0.
24959 // '0 - X' sets the carry flag if X != 0.
24960 // Convert the carry flag to a -1/0 mask with sbb:
24961 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24962 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24963 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24964 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
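// At the machine level this typically becomes a 'neg' or 'sub $1' to set CF,
// an 'sbb r, r' to splat the carry into a 0/-1 mask, and an 'or' with Y.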
24965 SDValue Sub;
24966 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24967 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24968 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24969 } else {
24970 SDValue One = DAG.getConstant(1, DL, CmpVT);
24971 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24972 }
24973 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24974 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24975 Sub.getValue(1));
24976 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24977 }
24978
24979 return SDValue();
24980}
24981
24982SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24983 bool AddTest = true;
24984 SDValue Cond = Op.getOperand(0);
24985 SDValue Op1 = Op.getOperand(1);
24986 SDValue Op2 = Op.getOperand(2);
24987 SDLoc DL(Op);
24988 MVT VT = Op1.getSimpleValueType();
24989 SDValue CC;
24990
24991 if (isSoftF16(VT, Subtarget)) {
24992 MVT NVT = VT.changeTypeToInteger();
24993 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24994 DAG.getBitcast(NVT, Op1),
24995 DAG.getBitcast(NVT, Op2)));
24996 }
24997
24998 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24999 // are available or VBLENDV if AVX is available.
25000 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25001 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25002 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25003 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25004 bool IsAlwaysSignaling;
25005 unsigned SSECC =
25006 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25007 CondOp0, CondOp1, IsAlwaysSignaling);
25008
25009 if (Subtarget.hasAVX512()) {
25010 SDValue Cmp =
25011 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25012 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25013 assert(!VT.isVector() && "Not a scalar type?");
25014 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25015 }
25016
25017 if (SSECC < 8 || Subtarget.hasAVX()) {
25018 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25019 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25020
25021 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25022 // instead of 3 logic instructions for size savings and potentially speed.
25023 // Unfortunately, there is no scalar form of VBLENDV.
25024 //
25025 // If either operand is a +0.0 constant, don't try this. We can expect to
25026 // optimize away at least one of the logic instructions later in that
25027 // case, so that sequence would be faster than a variable blend.
25028 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25029 !isNullFPConstant(Op2)) {
25030 // Convert to vectors, do a VSELECT, and convert back to scalar.
25031 // All of the conversions should be optimized away.
25032 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25033 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25034 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25035 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25036
25037 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25038 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25039
25040 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25041
25042 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25043 DAG.getVectorIdxConstant(0, DL));
25044 }
25045 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25046 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25047 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25048 }
25049 }
25050
25051 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25052 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25053 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25054 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25055 }
25056
25057 if (Cond.getOpcode() == ISD::SETCC &&
25058 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25059 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25060 Cond = NewCond;
25061 // If the condition was updated, it's possible that the operands of the
25062 // select were also updated (for example, EmitTest has a RAUW). Refresh
25063 // the local references to the select operands in case they got stale.
25064 Op1 = Op.getOperand(1);
25065 Op2 = Op.getOperand(2);
25066 }
25067 }
25068
25069 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25070 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25071 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25072 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25073 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25074 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25075 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25076 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25077 if (Cond.getOpcode() == X86ISD::SETCC &&
25078 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25079 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25080 SDValue Cmp = Cond.getOperand(1);
25081 SDValue CmpOp0 = Cmp.getOperand(0);
25082 unsigned CondCode = Cond.getConstantOperandVal(0);
25083
25084 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25085 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25086 // handling to keep the CMP with 0. This should be removed by
25087 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25088 // cttz_zero_undef.
25089 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25090 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25091 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25092 };
25093 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25094 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25095 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25096 // Keep Cmp.
25097 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25098 DL, DAG, Subtarget)) {
25099 return R;
25100 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
25101 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25102 ((CondCode == X86::COND_S) || // smin(x, 0)
25103 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25104 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25105 //
25106 // If the comparison is testing for a positive value, we have to invert
25107 // the sign bit mask, so only do that transform if the target has a
25108 // bitwise 'and not' instruction (the invert is free).
25109 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25110 unsigned ShCt = VT.getSizeInBits() - 1;
25111 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25112 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25113 if (CondCode == X86::COND_G)
25114 Shift = DAG.getNOT(DL, Shift, VT);
25115 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25116 }
25117 }
25118
25119 // Look past (and (setcc_carry (cmp ...)), 1).
25120 if (Cond.getOpcode() == ISD::AND &&
25121 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25122 isOneConstant(Cond.getOperand(1)))
25123 Cond = Cond.getOperand(0);
25124
25125 // Attempt to fold "raw cond" cases by treating them as:
25126 // (select (and X, 1), Op1, Op2) --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25127 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25128 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25129 Subtarget))
25130 return R;
25131
25132 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25133 // setting operand in place of the X86ISD::SETCC.
25134 unsigned CondOpcode = Cond.getOpcode();
25135 if (CondOpcode == X86ISD::SETCC ||
25136 CondOpcode == X86ISD::SETCC_CARRY) {
25137 CC = Cond.getOperand(0);
25138
25139 SDValue Cmp = Cond.getOperand(1);
25140 bool IllegalFPCMov = false;
25141 if (VT.isFloatingPoint() && !VT.isVector() &&
25142 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25143 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25144
25145 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25146 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25147 Cond = Cmp;
25148 AddTest = false;
25149 }
25150 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25151 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25152 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25153 SDValue Value;
25154 X86::CondCode X86Cond;
25155 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25156
25157 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25158 AddTest = false;
25159 }
25160
25161 if (AddTest) {
25162 // Look past the truncate if the high bits are known zero.
25163 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25164 Cond = Cond.getOperand(0);
25165
25166 // We know the result of AND is compared against zero. Try to match
25167 // it to BT.
25168 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25169 X86::CondCode X86CondCode;
25170 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25171 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25172 Cond = BT;
25173 AddTest = false;
25174 }
25175 }
25176 }
25177
25178 if (AddTest) {
25179 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25180 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25181 }
25182
25183 // a < b ? -1 : 0 -> RES = ~setcc_carry
25184 // a < b ? 0 : -1 -> RES = setcc_carry
25185 // a >= b ? -1 : 0 -> RES = setcc_carry
25186 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25187 if (Cond.getOpcode() == X86ISD::SUB) {
25188 unsigned CondCode = CC->getAsZExtVal();
25189
25190 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25191 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25192 (isNullConstant(Op1) || isNullConstant(Op2))) {
25193 SDValue Res =
25194 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25195 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25196 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25197 return DAG.getNOT(DL, Res, Res.getValueType());
25198 return Res;
25199 }
25200 }
25201
25202 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
25203 // widen the cmov and push the truncate through. This avoids introducing a new
25204 // branch during isel and doesn't add any extensions.
25205 if (Op.getValueType() == MVT::i8 &&
25206 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25207 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25208 if (T1.getValueType() == T2.getValueType() &&
25209 // Exclude CopyFromReg to avoid partial register stalls.
25210 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25211 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25212 CC, Cond);
25213 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25214 }
25215 }
25216
25217 // Or finally, promote i8 cmovs if we have CMOV,
25218 // or i16 cmovs if it won't prevent folding a load.
25219 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25220 // legal, but EmitLoweredSelect() cannot deal with these extensions
25221 // being inserted between two CMOVs (the i16 case has the same limitation).
25222 // https://bugs.llvm.org/show_bug.cgi?id=40974
25223 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25224 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25225 !X86::mayFoldLoad(Op2, Subtarget))) {
25226 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25227 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25228 SDValue Ops[] = { Op2, Op1, CC, Cond };
25229 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25230 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25231 }
25232
25233 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25234 // condition is true.
25235 SDValue Ops[] = { Op2, Op1, CC, Cond };
25236 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25237}
25238
25239static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25240 const X86Subtarget &Subtarget,
25241 SelectionDAG &DAG) {
25242 MVT VT = Op->getSimpleValueType(0);
25243 SDValue In = Op->getOperand(0);
25244 MVT InVT = In.getSimpleValueType();
25245 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25246 MVT VTElt = VT.getVectorElementType();
25247 unsigned NumElts = VT.getVectorNumElements();
25248
25249 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25250 MVT ExtVT = VT;
25251 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25252 // If v16i32 is to be avoided, we'll need to split and concatenate.
25253 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25254 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25255
25256 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25257 }
25258
25259 // Widen to 512-bits if VLX is not supported.
25260 MVT WideVT = ExtVT;
25261 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25262 NumElts *= 512 / ExtVT.getSizeInBits();
25263 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25264 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25265 DAG.getVectorIdxConstant(0, dl));
25266 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25267 }
25268
25269 SDValue V;
25270 MVT WideEltVT = WideVT.getVectorElementType();
25271 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25272 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25273 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25274 } else {
25275 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25276 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25277 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25278 }
25279
25280 // Truncate if we had to extend i16/i8 above.
25281 if (VT != ExtVT) {
25282 WideVT = MVT::getVectorVT(VTElt, NumElts);
25283 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25284 }
25285
25286 // Extract back to 128/256-bit if we widened.
25287 if (WideVT != VT)
25288 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25289 DAG.getVectorIdxConstant(0, dl));
25290
25291 return V;
25292}
25293
25294static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25295 SelectionDAG &DAG) {
25296 SDValue In = Op->getOperand(0);
25297 MVT InVT = In.getSimpleValueType();
25298 SDLoc DL(Op);
25299
25300 if (InVT.getVectorElementType() == MVT::i1)
25301 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25302
25303 assert(Subtarget.hasAVX() && "Expected AVX support");
25304 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25305}
25306
25307// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25308// For sign extend this needs to handle all vector sizes and SSE4.1 and
25309// non-SSE4.1 targets. For zero extend this should only handle inputs of
25310// MVT::v64i8 when BWI is not supported, but AVX512 is.
25311static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25312 const X86Subtarget &Subtarget,
25313 SelectionDAG &DAG) {
25314 SDValue In = Op->getOperand(0);
25315 MVT VT = Op->getSimpleValueType(0);
25316 MVT InVT = In.getSimpleValueType();
25317
25318 MVT SVT = VT.getVectorElementType();
25319 MVT InSVT = InVT.getVectorElementType();
25320 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25321
25322 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25323 return SDValue();
25324 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25325 return SDValue();
25326 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25327 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25328 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25329 return SDValue();
25330
25331 SDLoc dl(Op);
25332 unsigned Opc = Op.getOpcode();
25333 unsigned NumElts = VT.getVectorNumElements();
25334
25335 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25336 // For 512-bit vectors, we need 128-bits or 256-bits.
25337 if (InVT.getSizeInBits() > 128) {
25338 // Input needs to be at least the same number of elements as output, and
25339 // at least 128-bits.
25340 int InSize = InSVT.getSizeInBits() * NumElts;
25341 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25342 InVT = In.getSimpleValueType();
25343 }
25344
25345 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25346 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25347 // need to be handled here for 256/512-bit results.
25348 if (Subtarget.hasInt256()) {
25349 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25350
25351 if (InVT.getVectorNumElements() != NumElts)
25352 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25353
25354 // FIXME: Apparently we create inreg operations that could be regular
25355 // extends.
25356 unsigned ExtOpc =
25357 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25358 : ISD::ZERO_EXTEND;
25359 return DAG.getNode(ExtOpc, dl, VT, In);
25360 }
25361
25362 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25363 if (Subtarget.hasAVX()) {
25364 assert(VT.is256BitVector() && "256-bit vector expected");
25365 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25366 int HalfNumElts = HalfVT.getVectorNumElements();
25367
25368 unsigned NumSrcElts = InVT.getVectorNumElements();
25369 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25370 for (int i = 0; i != HalfNumElts; ++i)
25371 HiMask[i] = HalfNumElts + i;
25372
25373 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25374 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25375 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25376 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25377 }
25378
25379 // We should only get here for sign extend.
25380 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25381 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25382 unsigned InNumElts = InVT.getVectorNumElements();
25383
25384 // If the source elements are already all-signbits, we don't need to extend,
25385 // just splat the elements.
25386 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25387 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25388 unsigned Scale = InNumElts / NumElts;
25389 SmallVector<int, 16> ShuffleMask;
25390 for (unsigned I = 0; I != NumElts; ++I)
25391 ShuffleMask.append(Scale, I);
25392 return DAG.getBitcast(VT,
25393 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25394 }
25395
25396 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25397 SDValue Curr = In;
25398 SDValue SignExt = Curr;
25399
25400 // As SRAI is only available on i16/i32 types, we expand only up to i32
25401 // and handle i64 separately.
25402 if (InVT != MVT::v4i32) {
25403 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25404
25405 unsigned DestWidth = DestVT.getScalarSizeInBits();
25406 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25407 unsigned DestElts = DestVT.getVectorNumElements();
25408
25409 // Build a shuffle mask that takes each input element and places it in the
25410 // MSBs of the new element size.
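// e.g. for v8i16 -> v4i32 the mask is {-1, 0, -1, 1, -1, 2, -1, 3} and the
// arithmetic shift below is by 16 bits.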
25411 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25412 for (unsigned i = 0; i != DestElts; ++i)
25413 Mask[i * Scale + (Scale - 1)] = i;
25414
25415 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25416 Curr = DAG.getBitcast(DestVT, Curr);
25417
25418 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25419 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25420 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25421 }
25422
25423 if (VT == MVT::v2i64) {
25424 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25425 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25426 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25427 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25428 SignExt = DAG.getBitcast(VT, SignExt);
25429 }
25430
25431 return SignExt;
25432}
25433
25434static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25435 SelectionDAG &DAG) {
25436 MVT VT = Op->getSimpleValueType(0);
25437 SDValue In = Op->getOperand(0);
25438 MVT InVT = In.getSimpleValueType();
25439 SDLoc dl(Op);
25440
25441 if (InVT.getVectorElementType() == MVT::i1)
25442 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25443
25444 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25446 "Expected same number of elements");
25447 assert((VT.getVectorElementType() == MVT::i16 ||
25448 VT.getVectorElementType() == MVT::i32 ||
25449 VT.getVectorElementType() == MVT::i64) &&
25450 "Unexpected element type");
25451 assert((InVT.getVectorElementType() == MVT::i8 ||
25452 InVT.getVectorElementType() == MVT::i16 ||
25453 InVT.getVectorElementType() == MVT::i32) &&
25454 "Unexpected element type");
25455
25456 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25457 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25458 return splitVectorIntUnary(Op, DAG, dl);
25459 }
25460
25461 if (Subtarget.hasInt256())
25462 return Op;
25463
25464 // Optimize vectors in AVX mode
25465 // Sign extend v8i16 to v8i32 and
25466 // v4i32 to v4i64
25467 //
25468 // Divide input vector into two parts
25469 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25470 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25471 // concat the vectors to original VT
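// e.g. v8i16 -> v8i32 on AVX1: sign-extend the low half (vpmovsxwd), shuffle
// the high half down and sign-extend it the same way, then concatenate the two
// v4i32 results (vinsertf128).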
25472 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25473 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25474
25475 unsigned NumElems = InVT.getVectorNumElements();
25476 SmallVector<int,8> ShufMask(NumElems, -1);
25477 for (unsigned i = 0; i != NumElems/2; ++i)
25478 ShufMask[i] = i + NumElems/2;
25479
25480 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25481 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25482
25483 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25484}
25485
25486/// Change a vector store into a pair of half-size vector stores.
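/// e.g. a 256-bit store becomes two 128-bit stores at offsets 0 and 16,
/// chained together with a TokenFactor.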
25487static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25488 SDValue StoredVal = Store->getValue();
25489 assert((StoredVal.getValueType().is256BitVector() ||
25490 StoredVal.getValueType().is512BitVector()) &&
25491 "Expecting 256/512-bit op");
25492
25493 // Splitting volatile memory ops is not allowed unless the operation was not
25494 // legal to begin with. Assume the input store is legal (this transform is
25495 // only used for targets with AVX). Note: It is possible that we have an
25496 // illegal type like v2i128, and so we could allow splitting a volatile store
25497 // in that case if that is important.
25498 if (!Store->isSimple())
25499 return SDValue();
25500
25501 SDLoc DL(Store);
25502 SDValue Value0, Value1;
25503 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25504 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25505 SDValue Ptr0 = Store->getBasePtr();
25506 SDValue Ptr1 =
25507 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25508 SDValue Ch0 =
25509 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25510 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25511 SDValue Ch1 =
25512 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25513 Store->getPointerInfo().getWithOffset(HalfOffset),
25514 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25515 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25516}
25517
25518/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25519/// type.
25520static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25521 SelectionDAG &DAG) {
25522 SDValue StoredVal = Store->getValue();
25523 assert(StoreVT.is128BitVector() &&
25524 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25525 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25526
25527 // Splitting volatile memory ops is not allowed unless the operation was not
25528 // legal to begin with. We are assuming the input op is legal (this transform
25529 // is only used for targets with AVX).
25530 if (!Store->isSimple())
25531 return SDValue();
25532
25533 MVT StoreSVT = StoreVT.getScalarType();
25534 unsigned NumElems = StoreVT.getVectorNumElements();
25535 unsigned ScalarSize = StoreSVT.getStoreSize();
25536
25537 SDLoc DL(Store);
25538 SmallVector<SDValue, 4> Stores;
25539 for (unsigned i = 0; i != NumElems; ++i) {
25540 unsigned Offset = i * ScalarSize;
25541 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25542 TypeSize::getFixed(Offset), DL);
25543 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25544 DAG.getVectorIdxConstant(i, DL));
25545 SDValue Ch =
25546 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25547 Store->getPointerInfo().getWithOffset(Offset),
25548 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25549 Stores.push_back(Ch);
25550 }
25551 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25552}
25553
25554static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25555 SelectionDAG &DAG) {
25556 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25557 SDLoc dl(St);
25558 SDValue StoredVal = St->getValue();
25559
25560 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25561 if (StoredVal.getValueType().isVector() &&
25562 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25563 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25564 assert(NumElts <= 8 && "Unexpected VT");
25565 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25566 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25567 "Expected AVX512F without AVX512DQI");
25568
25569 // We must pad with zeros to ensure we store zeroes to any unused bits.
25570 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25571 DAG.getUNDEF(MVT::v16i1), StoredVal,
25572 DAG.getVectorIdxConstant(0, dl));
25573 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25574 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25575 // Make sure we store zeros in the extra bits.
25576 if (NumElts < 8)
25577 StoredVal = DAG.getZeroExtendInReg(
25578 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25579
25580 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25581 St->getPointerInfo(), St->getBaseAlign(),
25582 St->getMemOperand()->getFlags());
25583 }
25584
25585 if (St->isTruncatingStore())
25586 return SDValue();
25587
25588 // If this is a 256/512-bit store of concatenated ops, we are better off
25589 // splitting that store into two half-size stores. This avoids spurious use of
25590 // concatenated ops and each half can execute independently. Some cores would
25591 // split the op into halves anyway, so the concat is purely an extra op.
25592 MVT StoreVT = StoredVal.getSimpleValueType();
25593 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25594 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25595 return splitVectorStore(St, DAG);
25596 return SDValue();
25597 }
25598
25599 if (StoreVT.is32BitVector())
25600 return SDValue();
25601
25602 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25603 assert(StoreVT.is64BitVector() && "Unexpected VT");
25604 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25606 "Unexpected type action!");
25607
25608 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25609 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25610 DAG.getUNDEF(StoreVT));
25611
25612 if (Subtarget.hasSSE2()) {
25613 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25614 // and store it.
25615 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25616 MVT CastVT = MVT::getVectorVT(StVT, 2);
25617 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25618 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25619 DAG.getVectorIdxConstant(0, dl));
25620
25621 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25622 St->getPointerInfo(), St->getBaseAlign(),
25623 St->getMemOperand()->getFlags());
25624 }
25625 assert(Subtarget.hasSSE1() && "Expected SSE");
25626 SDVTList Tys = DAG.getVTList(MVT::Other);
25627 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25628 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25629 St->getMemOperand());
25630}
25631
25632// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25633// may emit an illegal shuffle but the expansion is still better than scalar
25634// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25635// we'll emit a shuffle and an arithmetic shift.
25636// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25637// TODO: It is possible to support ZExt by zeroing the undef values during
25638// the shuffle phase or after the shuffle.
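// e.g. a sign-extending load of v4i8 to v4i32 can be emitted as a byte shuffle
// that places each input byte in the top byte of its 32-bit lane, followed by
// an arithmetic shift right by 24.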
25639static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25640 SelectionDAG &DAG) {
25641 MVT RegVT = Op.getSimpleValueType();
25642 assert(RegVT.isVector() && "We only custom lower vector loads.");
25643 assert(RegVT.isInteger() &&
25644 "We only custom lower integer vector loads.");
25645
25646 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25647 SDLoc dl(Ld);
25648
25649 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25650 if (RegVT.getVectorElementType() == MVT::i1) {
25651 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25652 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25653 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25654 "Expected AVX512F without AVX512DQI");
25655
25656 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25657 Ld->getPointerInfo(), Ld->getBaseAlign(),
25658 Ld->getMemOperand()->getFlags());
25659
25660 // Replace chain users with the new chain.
25661 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25662
25663 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25664 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25665 DAG.getBitcast(MVT::v16i1, Val),
25666 DAG.getVectorIdxConstant(0, dl));
25667 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25668 }
25669
25670 return SDValue();
25671}
25672
25673/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25674/// each of which has no other use apart from the AND / OR.
25675static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25676 Opc = Op.getOpcode();
25677 if (Opc != ISD::OR && Opc != ISD::AND)
25678 return false;
25679 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25680 Op.getOperand(0).hasOneUse() &&
25681 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25682 Op.getOperand(1).hasOneUse());
25683}
25684
25685SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25686 SDValue Chain = Op.getOperand(0);
25687 SDValue Cond = Op.getOperand(1);
25688 SDValue Dest = Op.getOperand(2);
25689 SDLoc dl(Op);
25690
25691 // Bail out when we don't have native compare instructions.
25692 if (Cond.getOpcode() == ISD::SETCC &&
25693 Cond.getOperand(0).getValueType() != MVT::f128 &&
25694 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25695 SDValue LHS = Cond.getOperand(0);
25696 SDValue RHS = Cond.getOperand(1);
25697 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25698
25699 // Special case for
25700 // setcc([su]{add,sub,mul}o == 0)
25701 // setcc([su]{add,sub,mul}o != 1)
25702 if (ISD::isOverflowIntrOpRes(LHS) &&
25703 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25704 (isNullConstant(RHS) || isOneConstant(RHS))) {
25705 SDValue Value, Overflow;
25706 X86::CondCode X86Cond;
25707 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25708
25709 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25710 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25711
25712 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25713 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25714 Overflow, Op->getFlags());
25715 }
25716
25717 if (LHS.getSimpleValueType().isInteger()) {
25718 SDValue CCVal;
25719 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25720 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25721 EFLAGS, Op->getFlags());
25722 }
25723
25724 if (CC == ISD::SETOEQ) {
25725 // For FCMP_OEQ, we can emit
25726 // two branches instead of an explicit AND instruction with a
25727 // separate test. However, we only do this if this block doesn't
25728 // have a fall-through edge, because this requires an explicit
25729 // jmp when the condition is false.
25730 if (Op.getNode()->hasOneUse()) {
25731 SDNode *User = *Op.getNode()->user_begin();
25732 // Look for an unconditional branch following this conditional branch.
25733 // We need this because we need to reverse the successors in order
25734 // to implement FCMP_OEQ.
25735 if (User->getOpcode() == ISD::BR) {
25736 SDValue FalseBB = User->getOperand(1);
25737 SDNode *NewBR =
25738 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25739 assert(NewBR == User);
25740 (void)NewBR;
25741 Dest = FalseBB;
25742
25743 SDValue Cmp =
25744 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25745 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25746 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25747 CCVal, Cmp, Op->getFlags());
25748 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25749 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25750 Cmp, Op->getFlags());
25751 }
25752 }
25753 } else if (CC == ISD::SETUNE) {
25754 // For FCMP_UNE, we can emit
25755 // two branches instead of an explicit OR instruction with a
25756 // separate test.
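// e.g. 'ucomiss; jne dest; jp dest' rather than materializing (setne | setp)
// and branching on the combined result.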
25757 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25758 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25759 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25760 Cmp, Op->getFlags());
25761 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25762 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25763 Cmp, Op->getFlags());
25764 } else {
25765 X86::CondCode X86Cond =
25766 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25767 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25768 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25769 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25770 Cmp, Op->getFlags());
25771 }
25772 }
25773
25774 if (ISD::isOverflowIntrOpRes(Cond)) {
25775 SDValue Value, Overflow;
25776 X86::CondCode X86Cond;
25777 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25778
25779 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25780 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25781 Overflow, Op->getFlags());
25782 }
25783
25784 // Look past the truncate if the high bits are known zero.
25785 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25786 Cond = Cond.getOperand(0);
25787
25788 EVT CondVT = Cond.getValueType();
25789
25790 // Add an AND with 1 if we don't already have one.
25791 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25792 Cond =
25793 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25794
25795 SDValue LHS = Cond;
25796 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25797
25798 SDValue CCVal;
25799 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25800 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25801 Op->getFlags());
25802}
25803
25804// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25805// Calls to _alloca are needed to probe the stack when allocating more than 4k
25806// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25807// that the guard pages used by the OS virtual memory manager are allocated in
25808// correct sequence.
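// For example, a dynamic allocation of 32 KiB on such targets is lowered to a
// call to the stack probe routine (e.g. __chkstk or ___chkstk_ms) rather than
// a plain stack-pointer adjustment.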
25809SDValue
25810X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25811 SelectionDAG &DAG) const {
25812 MachineFunction &MF = DAG.getMachineFunction();
25813 bool SplitStack = MF.shouldSplitStack();
25814 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25815 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25816 SplitStack || EmitStackProbeCall;
25817 SDLoc dl(Op);
25818
25819 // Get the inputs.
25820 SDNode *Node = Op.getNode();
25821 SDValue Chain = Op.getOperand(0);
25822 SDValue Size = Op.getOperand(1);
25823 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25824 EVT VT = Node->getValueType(0);
25825
25826 // Chain the dynamic stack allocation so that it doesn't modify the stack
25827 // pointer when other instructions are using the stack.
25828 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25829
25830 bool Is64Bit = Subtarget.is64Bit();
25831 MVT SPTy = Op.getValueType().getSimpleVT();
25832
25833 SDValue Result;
25834 if (!Lower) {
25835 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25836 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25837 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25838 " not tell us which reg is the stack pointer!");
25839
25840 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25841 const Align StackAlign = TFI.getStackAlign();
25842 if (hasInlineStackProbe(MF)) {
25843 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25844 {Chain, Size});
25845 Chain = Result.getValue(1);
25846 } else {
25847 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25848 Chain = SP.getValue(1);
25849 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25850 }
25851 if (Alignment && *Alignment > StackAlign)
25852 Result = DAG.getNode(
25853 ISD::AND, dl, VT, Result,
25854 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25855 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25856 } else if (SplitStack) {
25857 if (Is64Bit) {
25858 // The 64-bit implementation of segmented stacks needs to clobber both r10
25859 // and r11. This makes it impossible to use it along with nested parameters.
25860 const Function &F = MF.getFunction();
25861 for (const auto &A : F.args()) {
25862 if (A.hasNestAttr())
25863 report_fatal_error("Cannot use segmented stacks with functions that "
25864 "have nested arguments.");
25865 }
25866 }
25867
25868 Result =
25869 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25870 Chain = Result.getValue(1);
25871 } else {
25872 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25873 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25874 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25875
25876 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25877 Register SPReg = RegInfo->getStackRegister();
25878 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25879 Chain = SP.getValue(1);
25880
25881 if (Alignment) {
25882 SP = DAG.getNode(
25883 ISD::AND, dl, VT, SP.getValue(0),
25884 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25885 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25886 }
25887
25888 Result = SP;
25889 }
25890
25891 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25892
25893 SDValue Ops[2] = {Result, Chain};
25894 return DAG.getMergeValues(Ops, dl);
25895}
25896
25897SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25898 MachineFunction &MF = DAG.getMachineFunction();
25899 SDValue Ptr = Op.getOperand(1);
25900 EVT PtrVT = Ptr.getValueType();
25901
25902 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25903
25904 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25905 SDLoc DL(Op);
25906
25907 if (!Subtarget.is64Bit() ||
25908 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25909 // vastart just stores the address of the VarArgsFrameIndex slot into the
25910 // memory location argument.
25911 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25912 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25913 }
25914
25915 // __va_list_tag:
25916 // gp_offset (0 - 6 * 8)
25917 // fp_offset (48 - 48 + 8 * 16)
25918 // overflow_arg_area (point to parameters coming in memory).
25919 // reg_save_area
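// In C terms (SysV x86-64 ABI, matching the offsets stored below):
//   struct __va_list_tag {
//     unsigned gp_offset;       // byte 0
//     unsigned fp_offset;       // byte 4
//     void *overflow_arg_area;  // byte 8
//     void *reg_save_area;      // byte 16 (12 on x32)
//   };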
25920 SmallVector<SDValue, 8> MemOps;
25921 SDValue FIN = Op.getOperand(1);
25922 // Store gp_offset
25923 SDValue Store = DAG.getStore(
25924 Op.getOperand(0), DL,
25925 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25926 MachinePointerInfo(SV));
25927 MemOps.push_back(Store);
25928
25929 // Store fp_offset
25930 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25931 Store = DAG.getStore(
25932 Op.getOperand(0), DL,
25933 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25934 MachinePointerInfo(SV, 4));
25935 MemOps.push_back(Store);
25936
25937 // Store ptr to overflow_arg_area
25938 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25939 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25940 Store =
25941 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25942 MemOps.push_back(Store);
25943
25944 // Store ptr to reg_save_area.
25945 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25946 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25947 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25948 Store = DAG.getStore(
25949 Op.getOperand(0), DL, RSFIN, FIN,
25950 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25951 MemOps.push_back(Store);
25952 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25953}
25954
25955SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25956 assert(Subtarget.is64Bit() &&
25957 "LowerVAARG only handles 64-bit va_arg!");
25958 assert(Op.getNumOperands() == 4);
25959
25960 MachineFunction &MF = DAG.getMachineFunction();
25961 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25962 // The Win64 ABI uses char* instead of a structure.
25963 return DAG.expandVAArg(Op.getNode());
25964
25965 SDValue Chain = Op.getOperand(0);
25966 SDValue SrcPtr = Op.getOperand(1);
25967 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25968 unsigned Align = Op.getConstantOperandVal(3);
25969 SDLoc dl(Op);
25970
25971 EVT ArgVT = Op.getNode()->getValueType(0);
25972 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25973 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25974 uint8_t ArgMode;
25975
25976 // Decide which area this value should be read from.
25977 // TODO: Implement the AMD64 ABI in its entirety. This simple
25978 // selection mechanism works only for the basic types.
25979 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25980 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25981 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25982 } else {
25983 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25984 "Unhandled argument type in LowerVAARG");
25985 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25986 }
25987
25988 if (ArgMode == 2) {
25989 // Make sure using fp_offset makes sense.
25990 assert(!Subtarget.useSoftFloat() &&
25991 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25992 Subtarget.hasSSE1());
25993 }
25994
25995 // Insert VAARG node into the DAG
25996 // VAARG returns two values: Variable Argument Address, Chain
25997 SDValue InstOps[] = {Chain, SrcPtr,
25998 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25999 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26000 DAG.getTargetConstant(Align, dl, MVT::i32)};
26001 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26002 SDValue VAARG = DAG.getMemIntrinsicNode(
26003 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26004 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26005 /*Alignment=*/std::nullopt,
26006 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26007 Chain = VAARG.getValue(1);
26008
26009 // Load the next argument and return it
26010 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26011}
26012
26013static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26014 SelectionDAG &DAG) {
26015 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26016 // where a va_list is still an i8*.
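// A va_copy therefore reduces to a 24-byte memcpy of the whole struct
// (16 bytes on x32).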
26017 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26018 if (Subtarget.isCallingConvWin64(
26019 DAG.getMachineFunction().getFunction().getCallingConv()))
26020 // Probably a Win64 va_copy.
26021 return DAG.expandVACopy(Op.getNode());
26022
26023 SDValue Chain = Op.getOperand(0);
26024 SDValue DstPtr = Op.getOperand(1);
26025 SDValue SrcPtr = Op.getOperand(2);
26026 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26027 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26028 SDLoc DL(Op);
26029
26030 return DAG.getMemcpy(
26031 Chain, DL, DstPtr, SrcPtr,
26032 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26033 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26034 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26035 MachinePointerInfo(SrcSV));
26036}
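// Illustrative sketch (not part of the lowering above): the 24- or 16-byte
// memcpy in LowerVACOPY is simply sizeof of the SysV x86-64 __va_list_tag
// record, written here as a plain C++ struct under an LP64 assumption.
#include <cstdint>

struct VaListTag {            // struct { i32, i32, i8*, i8* }
  uint32_t gp_offset;         // Next GPR argument offset into reg_save_area.
  uint32_t fp_offset;         // Next XMM argument offset into reg_save_area.
  void *overflow_arg_area;    // Stack area for arguments passed in memory.
  void *reg_save_area;        // Register save area written in the prologue.
};

static_assert(sizeof(VaListTag) == 24,
              "LP64: 4 + 4 + 8 + 8 bytes, the size copied above");
// Under the X32 ABI the two pointers are 4 bytes each, giving the 16-byte
// variant selected by !isTarget64BitLP64().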
26037
26038// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26039static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26040 switch (Opc) {
26041 case ISD::SHL:
26042 case X86ISD::VSHL:
26043 case X86ISD::VSHLI:
26044 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26045 case ISD::SRL:
26046 case X86ISD::VSRL:
26047 case X86ISD::VSRLI:
26048 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26049 case ISD::SRA:
26050 case X86ISD::VSRA:
26051 case X86ISD::VSRAI:
26052 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26053 }
26054 llvm_unreachable("Unknown target vector shift node");
26055}
26056
26057/// Handle vector element shifts where the shift amount is a constant.
26058/// Takes immediate version of shift as input.
26059static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26060 SDValue SrcOp, uint64_t ShiftAmt,
26061 SelectionDAG &DAG) {
26062 MVT ElementType = VT.getVectorElementType();
26063
26064 // Bitcast the source vector to the output type; this is mainly necessary for
26065 // vXi8/vXi64 shifts.
26066 if (VT != SrcOp.getSimpleValueType())
26067 SrcOp = DAG.getBitcast(VT, SrcOp);
26068
26069 // Fold this packed shift into its first operand if ShiftAmt is 0.
26070 if (ShiftAmt == 0)
26071 return SrcOp;
26072
26073 // Check for ShiftAmt >= element width
26074 if (ShiftAmt >= ElementType.getSizeInBits()) {
26075 if (Opc == X86ISD::VSRAI)
26076 ShiftAmt = ElementType.getSizeInBits() - 1;
26077 else
26078 return DAG.getConstant(0, dl, VT);
26079 }
26080
26081 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26082 && "Unknown target vector shift-by-constant node");
26083
26084 // Fold this packed vector shift into a build vector if SrcOp is a
26085 // vector of Constants or UNDEFs.
26086 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26087 unsigned ShiftOpc;
26088 switch (Opc) {
26089 default: llvm_unreachable("Unknown opcode!");
26090 case X86ISD::VSHLI:
26091 ShiftOpc = ISD::SHL;
26092 break;
26093 case X86ISD::VSRLI:
26094 ShiftOpc = ISD::SRL;
26095 break;
26096 case X86ISD::VSRAI:
26097 ShiftOpc = ISD::SRA;
26098 break;
26099 }
26100
26101 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26102 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26103 return C;
26104 }
26105
26106 return DAG.getNode(Opc, dl, VT, SrcOp,
26107 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26108}
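// Illustrative sketch (not part of the lowering above): the out-of-range rule
// applied by getTargetVShiftByConstNode, modelled on one 32-bit lane. Logical
// shifts by >= the element width fold to an all-zero vector, while arithmetic
// right shifts clamp the amount to BitWidth - 1 so the sign bit is replicated.
#include <cstdint>

inline int32_t modelVSRAILane(int32_t Lane, uint64_t Amt) {
  if (Amt >= 32)
    Amt = 31;               // Clamp, matching the X86ISD::VSRAI special case.
  return Lane >> Amt;       // Arithmetic shift preserves the sign bit.
}

inline uint32_t modelVSRLILane(uint32_t Lane, uint64_t Amt) {
  return Amt >= 32 ? 0u : Lane >> Amt; // Logical shift saturates to zero.
}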
26109
26110/// Handle vector element shifts by a splat shift amount
26111static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26112 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26113 const X86Subtarget &Subtarget,
26114 SelectionDAG &DAG) {
26115 MVT AmtVT = ShAmt.getSimpleValueType();
26116 assert(AmtVT.isVector() && "Vector shift type mismatch");
26117 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26118 "Illegal vector splat index");
26119
26120 // Move the splat element to the bottom element.
26121 if (ShAmtIdx != 0) {
26122 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26123 Mask[0] = ShAmtIdx;
26124 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26125 }
26126
26127 // Peek through any zext node if we can get back to a 128-bit source.
26128 if (AmtVT.getScalarSizeInBits() == 64 &&
26129 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26130 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26131 ShAmt.getOperand(0).getValueType().isSimple() &&
26132 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26133 ShAmt = ShAmt.getOperand(0);
26134 AmtVT = ShAmt.getSimpleValueType();
26135 }
26136
26137 // See if we can mask off the upper elements using the existing source node.
26138 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26139 // do this for vXi64 types.
26140 bool IsMasked = false;
26141 if (AmtVT.getScalarSizeInBits() < 64) {
26142 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26143 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26144 // If the shift amount has come from a scalar, then zero-extend the scalar
26145 // before moving to the vector.
26146 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26147 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26148 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26149 AmtVT = MVT::v4i32;
26150 IsMasked = true;
26151 } else if (ShAmt.getOpcode() == ISD::AND) {
26152 // See if the shift amount is already masked (e.g. for rotation modulo),
26153 // then we can zero-extend it by setting all the other mask elements to
26154 // zero.
26155 SmallVector<SDValue> MaskElts(
26156 AmtVT.getVectorNumElements(),
26157 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26158 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26159 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26160 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26161 {ShAmt.getOperand(1), Mask}))) {
26162 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26163 IsMasked = true;
26164 }
26165 }
26166 }
26167
26168 // Extract if the shift amount vector is larger than 128-bits.
26169 if (AmtVT.getSizeInBits() > 128) {
26170 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26171 AmtVT = ShAmt.getSimpleValueType();
26172 }
26173
26174 // Zero-extend bottom element to v2i64 vector type, either by extension or
26175 // shuffle masking.
26176 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26177 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26178 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26179 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26180 } else if (Subtarget.hasSSE41()) {
26181 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26182 MVT::v2i64, ShAmt);
26183 } else {
26184 SDValue ByteShift = DAG.getTargetConstant(
26185 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26186 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26187 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26188 ByteShift);
26189 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26190 ByteShift);
26191 }
26192 }
26193
26194 // Change opcode to non-immediate version.
26195 Opc = getTargetVShiftUniformOpcode(Opc, true);
26196
26197 // The return type has to be a 128-bit type with the same element
26198 // type as the input type.
26199 MVT EltVT = VT.getVectorElementType();
26200 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26201
26202 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26203 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26204}
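// Illustrative sketch (not part of the lowering above): getTargetVShiftNode has
// to zero-extend the bottom amount element because the SSE/AVX uniform shifts
// (PSLLW, PSRLD, ...) consume the entire low 64 bits of the amount operand,
// not just the low element. A scalar model of one 16-bit lane:
#include <cstdint>

inline uint16_t modelUniformShlLane(uint16_t Lane, uint64_t Amt64) {
  // If any of the 64 amount bits push the value to >= 16, every lane becomes
  // zero, so stale data above the splat element would change the result.
  return Amt64 >= 16 ? uint16_t(0) : uint16_t(Lane << Amt64);
}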
26205
26206/// Return Mask with the necessary casting or extending
26207/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26208static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26209 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26210 const SDLoc &dl) {
26211
26212 if (isAllOnesConstant(Mask))
26213 return DAG.getConstant(1, dl, MaskVT);
26214 if (X86::isZeroNode(Mask))
26215 return DAG.getConstant(0, dl, MaskVT);
26216
26217 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26218
26219 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26220 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26221 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26222 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it instead.
26223 SDValue Lo, Hi;
26224 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26225 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26226 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26227 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26228 } else {
26229 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26230 Mask.getSimpleValueType().getSizeInBits());
26231 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
26232 // are extracted by EXTRACT_SUBVECTOR.
26233 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26234 DAG.getBitcast(BitcastVT, Mask),
26235 DAG.getVectorIdxConstant(0, dl));
26236 }
26237}
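// Illustrative sketch (not part of the helper above): on 32-bit targets an i64
// mask cannot be bitcast to v64i1 directly, so getMaskNode splits it into two
// i32 halves and concatenates the two v32i1 values. The same split on plain
// integers:
#include <cstdint>
#include <utility>

inline std::pair<uint32_t, uint32_t> splitMask64(uint64_t Mask) {
  uint32_t Lo = uint32_t(Mask);        // Bits 0..31  -> low  v32i1 half.
  uint32_t Hi = uint32_t(Mask >> 32);  // Bits 32..63 -> high v32i1 half.
  return {Lo, Hi};                     // CONCAT_VECTORS(Lo, Hi) gives v64i1.
}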
26238
26239/// Return (and \p Op, \p Mask) for compare instructions or
26240/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26241/// necessary casting or extending for \p Mask when lowering masking intrinsics
26242 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26243 SDValue PreservedSrc,
26244 const X86Subtarget &Subtarget,
26245 SelectionDAG &DAG) {
26246 MVT VT = Op.getSimpleValueType();
26247 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26248 unsigned OpcodeSelect = ISD::VSELECT;
26249 SDLoc dl(Op);
26250
26251 if (isAllOnesConstant(Mask))
26252 return Op;
26253
26254 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26255
26256 if (PreservedSrc.isUndef())
26257 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26258 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26259}
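// Illustrative sketch (not part of the helper above): the VSELECT emitted by
// getVectorMaskingNode has simple per-lane semantics, modelled here on arrays
// with an integer bitmask standing in for the vXi1 mask operand.
#include <cstddef>
#include <cstdint>

template <typename T, size_t N>
void modelMaskedOp(const T (&OpResult)[N], const T (&PassThru)[N],
                   uint64_t MaskBits, T (&Out)[N]) {
  for (size_t I = 0; I != N; ++I)
    // Lane I takes the op result when mask bit I is set, otherwise the
    // preserved (pass-through) source, which is zero for maskz forms.
    Out[I] = ((MaskBits >> I) & 1) ? OpResult[I] : PassThru[I];
}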
26260
26261/// Creates an SDNode for a predicated scalar operation.
26262/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26263 /// The mask comes in as MVT::i8 and should be transformed
26264 /// to MVT::v1i1 while lowering masking intrinsics.
26265/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26266/// "X86select" instead of "vselect". We just can't create the "vselect" node
26267/// for a scalar instruction.
26268 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26269 SDValue PreservedSrc,
26270 const X86Subtarget &Subtarget,
26271 SelectionDAG &DAG) {
26272 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26273 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26274 return Op;
26275
26276 MVT VT = Op.getSimpleValueType();
26277 SDLoc dl(Op);
26278
26279 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
26280 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26281 DAG.getBitcast(MVT::v8i1, Mask),
26282 DAG.getVectorIdxConstant(0, dl));
26283 if (Op.getOpcode() == X86ISD::FSETCCM ||
26284 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26285 Op.getOpcode() == X86ISD::VFPCLASSS)
26286 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26287
26288 if (PreservedSrc.isUndef())
26289 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26290
26291 if (MaskConst) {
26292 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26293 // Discard op and blend passthrough with scalar op src/dst.
26294 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
26295 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26296 ShuffleMask[0] = VT.getVectorNumElements();
26297 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26298 ShuffleMask);
26299 }
26300
26301 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26302}
26303
26304 static int getSEHRegistrationNodeSize(const Function *Fn) {
26305 if (!Fn->hasPersonalityFn())
26306 report_fatal_error(
26307 "querying registration node size for function without personality");
26308 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26309 // WinEHStatePass for the full struct definition.
26310 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26311 case EHPersonality::MSVC_X86SEH: return 24;
26312 case EHPersonality::MSVC_CXX: return 16;
26313 default: break;
26314 }
26315 report_fatal_error(
26316 "can only recover FP for 32-bit MSVC EH personality functions");
26317}
26318
26319/// When the MSVC runtime transfers control to us, either to an outlined
26320/// function or when returning to a parent frame after catching an exception, we
26321/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26322/// Here's the math:
26323/// RegNodeBase = EntryEBP - RegNodeSize
26324/// ParentFP = RegNodeBase - ParentFrameOffset
26325/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26326/// subtracting the offset (negative on x86) takes us back to the parent FP.
26327 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26328 SDValue EntryEBP) {
26329 MachineFunction &MF = DAG.getMachineFunction();
26330 SDLoc dl;
26331
26332 // It's possible that the parent function no longer has a personality function
26333 // if the exceptional code was optimized away, in which case we just return
26334 // the incoming EBP.
26335 if (!Fn->hasPersonalityFn())
26336 return EntryEBP;
26337
26338 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26339 // registration, or the .set_setframe offset.
26340 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26341 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26342 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26343 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26344 SDValue ParentFrameOffset =
26345 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26346
26347 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26348 // prologue to RBP in the parent function.
26349 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26350 if (Subtarget.is64Bit())
26351 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26352
26353 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26354 // RegNodeBase = EntryEBP - RegNodeSize
26355 // ParentFP = RegNodeBase - ParentFrameOffset
26356 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26357 DAG.getConstant(RegNodeSize, dl, PtrVT));
26358 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26359}
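// Illustrative sketch (not part of the lowering above; the example values are
// made up): the 32-bit arithmetic performed by recoverFramePointer. With
// EntryEBP = 0x1000, an MSVC C++ EH personality (RegNodeSize == 16) and a
// recorded ParentFrameOffset of -64:
//   RegNodeBase = 0x1000 - 16   = 0xFF0
//   ParentFP    = 0xFF0 - (-64) = 0x1030
#include <cstdint>

inline intptr_t modelRecoverParentFP(intptr_t EntryEBP, int RegNodeSize,
                                     intptr_t ParentFrameOffset) {
  intptr_t RegNodeBase = EntryEBP - RegNodeSize;
  // ParentFrameOffset is negative on x86, so subtracting it moves back up to
  // the parent's frame pointer.
  return RegNodeBase - ParentFrameOffset;
}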
26360
26361SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26362 SelectionDAG &DAG) const {
26363 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26364 auto isRoundModeCurDirection = [](SDValue Rnd) {
26365 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26366 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26367
26368 return false;
26369 };
26370 auto isRoundModeSAE = [](SDValue Rnd) {
26371 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26372 unsigned RC = C->getZExtValue();
26373 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26374 // Clear the NO_EXC bit and check remaining bits.
26375 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26376 // As a convenience we allow no other bits or explicitly
26377 // current direction.
26378 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26379 }
26380 }
26381
26382 return false;
26383 };
26384 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26385 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26386 RC = C->getZExtValue();
26387 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26388 // Clear the NO_EXC bit and check remaining bits.
26389 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26390 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26391 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26392 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26393 RC == X86::STATIC_ROUNDING::TO_ZERO;
26394 }
26395 }
26396
26397 return false;
26398 };
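  // Worked example for the three helpers above, using the bit values defined
  // in X86::STATIC_ROUNDING (TO_NEAREST_INT=0, TO_NEG_INF=1, TO_POS_INF=2,
  // TO_ZERO=3, CUR_DIRECTION=4, NO_EXC=8): an immediate of 11 = NO_EXC|TO_ZERO
  // satisfies isRoundModeSAEToX() with RC=3, an immediate of 8 = NO_EXC alone
  // satisfies isRoundModeSAE(), and 4 = CUR_DIRECTION satisfies
  // isRoundModeCurDirection().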
26399
26400 SDLoc dl(Op);
26401 unsigned IntNo = Op.getConstantOperandVal(0);
26402 MVT VT = Op.getSimpleValueType();
26403 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26404
26405 // Propagate flags from original node to transformed node(s).
26406 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26407
26408 if (IntrData) {
26409 switch(IntrData->Type) {
26410 case INTR_TYPE_1OP: {
26411 // We specify 2 possible opcodes for intrinsics with rounding modes.
26412 // First, we check if the intrinsic may have non-default rounding mode,
26413 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26414 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26415 if (IntrWithRoundingModeOpcode != 0) {
26416 SDValue Rnd = Op.getOperand(2);
26417 unsigned RC = 0;
26418 if (isRoundModeSAEToX(Rnd, RC))
26419 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26420 Op.getOperand(1),
26421 DAG.getTargetConstant(RC, dl, MVT::i32));
26422 if (!isRoundModeCurDirection(Rnd))
26423 return SDValue();
26424 }
26425 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26426 Op.getOperand(1));
26427 }
26428 case INTR_TYPE_1OP_SAE: {
26429 SDValue Sae = Op.getOperand(2);
26430
26431 unsigned Opc;
26432 if (isRoundModeCurDirection(Sae))
26433 Opc = IntrData->Opc0;
26434 else if (isRoundModeSAE(Sae))
26435 Opc = IntrData->Opc1;
26436 else
26437 return SDValue();
26438
26439 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26440 }
26441 case INTR_TYPE_2OP: {
26442 SDValue Src2 = Op.getOperand(2);
26443
26444 // We specify 2 possible opcodes for intrinsics with rounding modes.
26445 // First, we check if the intrinsic may have non-default rounding mode,
26446 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26447 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26448 if (IntrWithRoundingModeOpcode != 0) {
26449 SDValue Rnd = Op.getOperand(3);
26450 unsigned RC = 0;
26451 if (isRoundModeSAEToX(Rnd, RC))
26452 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26453 Op.getOperand(1), Src2,
26454 DAG.getTargetConstant(RC, dl, MVT::i32));
26455 if (!isRoundModeCurDirection(Rnd))
26456 return SDValue();
26457 }
26458
26459 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26460 Op.getOperand(1), Src2);
26461 }
26462 case INTR_TYPE_2OP_SAE: {
26463 SDValue Sae = Op.getOperand(3);
26464
26465 unsigned Opc;
26466 if (isRoundModeCurDirection(Sae))
26467 Opc = IntrData->Opc0;
26468 else if (isRoundModeSAE(Sae))
26469 Opc = IntrData->Opc1;
26470 else
26471 return SDValue();
26472
26473 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26474 Op.getOperand(2));
26475 }
26476 case INTR_TYPE_3OP:
26477 case INTR_TYPE_3OP_IMM8: {
26478 SDValue Src1 = Op.getOperand(1);
26479 SDValue Src2 = Op.getOperand(2);
26480 SDValue Src3 = Op.getOperand(3);
26481
26482 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26483 Src3.getValueType() != MVT::i8) {
26484 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26485 }
26486
26487 // We specify 2 possible opcodes for intrinsics with rounding modes.
26488 // First, we check if the intrinsic may have non-default rounding mode,
26489 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26490 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26491 if (IntrWithRoundingModeOpcode != 0) {
26492 SDValue Rnd = Op.getOperand(4);
26493 unsigned RC = 0;
26494 if (isRoundModeSAEToX(Rnd, RC))
26495 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26496 Src1, Src2, Src3,
26497 DAG.getTargetConstant(RC, dl, MVT::i32));
26498 if (!isRoundModeCurDirection(Rnd))
26499 return SDValue();
26500 }
26501
26502 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26503 {Src1, Src2, Src3});
26504 }
26505 case INTR_TYPE_4OP_IMM8: {
26506 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26507 SDValue Src4 = Op.getOperand(4);
26508 if (Src4.getValueType() != MVT::i8) {
26509 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26510 }
26511
26512 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26513 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26514 Src4);
26515 }
26516 case INTR_TYPE_1OP_MASK: {
26517 SDValue Src = Op.getOperand(1);
26518 SDValue PassThru = Op.getOperand(2);
26519 SDValue Mask = Op.getOperand(3);
26520 // We add rounding mode to the Node when
26521 // - RC Opcode is specified and
26522 // - RC is not "current direction".
26523 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26524 if (IntrWithRoundingModeOpcode != 0) {
26525 SDValue Rnd = Op.getOperand(4);
26526 unsigned RC = 0;
26527 if (isRoundModeSAEToX(Rnd, RC))
26528 return getVectorMaskingNode(
26529 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26530 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26531 Mask, PassThru, Subtarget, DAG);
26532 if (!isRoundModeCurDirection(Rnd))
26533 return SDValue();
26534 }
26535 return getVectorMaskingNode(
26536 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26537 Subtarget, DAG);
26538 }
26539 case INTR_TYPE_1OP_MASK_SAE: {
26540 SDValue Src = Op.getOperand(1);
26541 SDValue PassThru = Op.getOperand(2);
26542 SDValue Mask = Op.getOperand(3);
26543 SDValue Rnd = Op.getOperand(4);
26544
26545 unsigned Opc;
26546 if (isRoundModeCurDirection(Rnd))
26547 Opc = IntrData->Opc0;
26548 else if (isRoundModeSAE(Rnd))
26549 Opc = IntrData->Opc1;
26550 else
26551 return SDValue();
26552
26553 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26554 Subtarget, DAG);
26555 }
26556 case INTR_TYPE_SCALAR_MASK: {
26557 SDValue Src1 = Op.getOperand(1);
26558 SDValue Src2 = Op.getOperand(2);
26559 SDValue passThru = Op.getOperand(3);
26560 SDValue Mask = Op.getOperand(4);
26561 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26562 // There are 2 kinds of intrinsics in this group:
26563 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26564 // (2) With rounding mode and sae - 7 operands.
26565 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26566 if (Op.getNumOperands() == (5U + HasRounding)) {
26567 if (HasRounding) {
26568 SDValue Rnd = Op.getOperand(5);
26569 unsigned RC = 0;
26570 if (isRoundModeSAEToX(Rnd, RC))
26571 return getScalarMaskingNode(
26572 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26573 DAG.getTargetConstant(RC, dl, MVT::i32)),
26574 Mask, passThru, Subtarget, DAG);
26575 if (!isRoundModeCurDirection(Rnd))
26576 return SDValue();
26577 }
26578 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26579 Src2),
26580 Mask, passThru, Subtarget, DAG);
26581 }
26582
26583 assert(Op.getNumOperands() == (6U + HasRounding) &&
26584 "Unexpected intrinsic form");
26585 SDValue RoundingMode = Op.getOperand(5);
26586 unsigned Opc = IntrData->Opc0;
26587 if (HasRounding) {
26588 SDValue Sae = Op.getOperand(6);
26589 if (isRoundModeSAE(Sae))
26590 Opc = IntrWithRoundingModeOpcode;
26591 else if (!isRoundModeCurDirection(Sae))
26592 return SDValue();
26593 }
26594 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26595 Src2, RoundingMode),
26596 Mask, passThru, Subtarget, DAG);
26597 }
26598 case INTR_TYPE_SCALAR_MASK_RND: {
26599 SDValue Src1 = Op.getOperand(1);
26600 SDValue Src2 = Op.getOperand(2);
26601 SDValue passThru = Op.getOperand(3);
26602 SDValue Mask = Op.getOperand(4);
26603 SDValue Rnd = Op.getOperand(5);
26604
26605 SDValue NewOp;
26606 unsigned RC = 0;
26607 if (isRoundModeCurDirection(Rnd))
26608 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26609 else if (isRoundModeSAEToX(Rnd, RC))
26610 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26611 DAG.getTargetConstant(RC, dl, MVT::i32));
26612 else
26613 return SDValue();
26614
26615 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26616 }
26617 case INTR_TYPE_SCALAR_MASK_SAE: {
26618 SDValue Src1 = Op.getOperand(1);
26619 SDValue Src2 = Op.getOperand(2);
26620 SDValue passThru = Op.getOperand(3);
26621 SDValue Mask = Op.getOperand(4);
26622 SDValue Sae = Op.getOperand(5);
26623 unsigned Opc;
26624 if (isRoundModeCurDirection(Sae))
26625 Opc = IntrData->Opc0;
26626 else if (isRoundModeSAE(Sae))
26627 Opc = IntrData->Opc1;
26628 else
26629 return SDValue();
26630
26631 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26632 Mask, passThru, Subtarget, DAG);
26633 }
26634 case INTR_TYPE_2OP_MASK: {
26635 SDValue Src1 = Op.getOperand(1);
26636 SDValue Src2 = Op.getOperand(2);
26637 SDValue PassThru = Op.getOperand(3);
26638 SDValue Mask = Op.getOperand(4);
26639 SDValue NewOp;
26640 if (IntrData->Opc1 != 0) {
26641 SDValue Rnd = Op.getOperand(5);
26642 unsigned RC = 0;
26643 if (isRoundModeSAEToX(Rnd, RC))
26644 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26645 DAG.getTargetConstant(RC, dl, MVT::i32));
26646 else if (!isRoundModeCurDirection(Rnd))
26647 return SDValue();
26648 }
26649 if (!NewOp)
26650 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26651 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26652 }
26653 case INTR_TYPE_2OP_MASK_SAE: {
26654 SDValue Src1 = Op.getOperand(1);
26655 SDValue Src2 = Op.getOperand(2);
26656 SDValue PassThru = Op.getOperand(3);
26657 SDValue Mask = Op.getOperand(4);
26658
26659 unsigned Opc = IntrData->Opc0;
26660 if (IntrData->Opc1 != 0) {
26661 SDValue Sae = Op.getOperand(5);
26662 if (isRoundModeSAE(Sae))
26663 Opc = IntrData->Opc1;
26664 else if (!isRoundModeCurDirection(Sae))
26665 return SDValue();
26666 }
26667
26668 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26669 Mask, PassThru, Subtarget, DAG);
26670 }
26671 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26672 SDValue Src1 = Op.getOperand(1);
26673 SDValue Src2 = Op.getOperand(2);
26674 SDValue Src3 = Op.getOperand(3);
26675 SDValue PassThru = Op.getOperand(4);
26676 SDValue Mask = Op.getOperand(5);
26677 SDValue Sae = Op.getOperand(6);
26678 unsigned Opc;
26679 if (isRoundModeCurDirection(Sae))
26680 Opc = IntrData->Opc0;
26681 else if (isRoundModeSAE(Sae))
26682 Opc = IntrData->Opc1;
26683 else
26684 return SDValue();
26685
26686 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26687 Mask, PassThru, Subtarget, DAG);
26688 }
26689 case INTR_TYPE_3OP_MASK_SAE: {
26690 SDValue Src1 = Op.getOperand(1);
26691 SDValue Src2 = Op.getOperand(2);
26692 SDValue Src3 = Op.getOperand(3);
26693 SDValue PassThru = Op.getOperand(4);
26694 SDValue Mask = Op.getOperand(5);
26695
26696 unsigned Opc = IntrData->Opc0;
26697 if (IntrData->Opc1 != 0) {
26698 SDValue Sae = Op.getOperand(6);
26699 if (isRoundModeSAE(Sae))
26700 Opc = IntrData->Opc1;
26701 else if (!isRoundModeCurDirection(Sae))
26702 return SDValue();
26703 }
26704 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26705 Mask, PassThru, Subtarget, DAG);
26706 }
26707 case BLENDV: {
26708 SDValue Src1 = Op.getOperand(1);
26709 SDValue Src2 = Op.getOperand(2);
26710 SDValue Src3 = Op.getOperand(3);
26711
26712 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26713 Src3 = DAG.getBitcast(MaskVT, Src3);
26714
26715 // Reverse the operands to match VSELECT order.
26716 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26717 }
26718 case VPERM_2OP : {
26719 SDValue Src1 = Op.getOperand(1);
26720 SDValue Src2 = Op.getOperand(2);
26721
26722 // Swap Src1 and Src2 in the node creation
26723 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26724 }
26725 case CFMA_OP_MASKZ:
26726 case CFMA_OP_MASK: {
26727 SDValue Src1 = Op.getOperand(1);
26728 SDValue Src2 = Op.getOperand(2);
26729 SDValue Src3 = Op.getOperand(3);
26730 SDValue Mask = Op.getOperand(4);
26731 MVT VT = Op.getSimpleValueType();
26732
26733 SDValue PassThru = Src3;
26734 if (IntrData->Type == CFMA_OP_MASKZ)
26735 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26736
26737 // We add rounding mode to the Node when
26738 // - RC Opcode is specified and
26739 // - RC is not "current direction".
26740 SDValue NewOp;
26741 if (IntrData->Opc1 != 0) {
26742 SDValue Rnd = Op.getOperand(5);
26743 unsigned RC = 0;
26744 if (isRoundModeSAEToX(Rnd, RC))
26745 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26746 DAG.getTargetConstant(RC, dl, MVT::i32));
26747 else if (!isRoundModeCurDirection(Rnd))
26748 return SDValue();
26749 }
26750 if (!NewOp)
26751 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26752 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26753 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26754 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26755 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26756 }
26757 case IFMA_OP:
26758 // NOTE: We need to swizzle the operands to pass the multiply operands
26759 // first.
26760 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26761 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26762 case FPCLASSS: {
26763 SDValue Src1 = Op.getOperand(1);
26764 SDValue Imm = Op.getOperand(2);
26765 SDValue Mask = Op.getOperand(3);
26766 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26767 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26768 Subtarget, DAG);
26769 // Need to fill with zeros to ensure the bitcast will produce zeroes
26770 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26771 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26772 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26773 DAG.getVectorIdxConstant(0, dl));
26774 return DAG.getBitcast(MVT::i8, Ins);
26775 }
26776
26777 case CMP_MASK_CC: {
26778 MVT MaskVT = Op.getSimpleValueType();
26779 SDValue CC = Op.getOperand(3);
26780 SDValue Mask = Op.getOperand(4);
26781 // We specify 2 possible opcodes for intrinsics with rounding modes.
26782 // First, we check if the intrinsic may have non-default rounding mode,
26783 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26784 if (IntrData->Opc1 != 0) {
26785 SDValue Sae = Op.getOperand(5);
26786 if (isRoundModeSAE(Sae))
26787 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26788 Op.getOperand(2), CC, Mask, Sae);
26789 if (!isRoundModeCurDirection(Sae))
26790 return SDValue();
26791 }
26792 // Default rounding mode.
26793 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26794 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26795 }
26796 case CMP_MASK_SCALAR_CC: {
26797 SDValue Src1 = Op.getOperand(1);
26798 SDValue Src2 = Op.getOperand(2);
26799 SDValue CC = Op.getOperand(3);
26800 SDValue Mask = Op.getOperand(4);
26801
26802 SDValue Cmp;
26803 if (IntrData->Opc1 != 0) {
26804 SDValue Sae = Op.getOperand(5);
26805 if (isRoundModeSAE(Sae))
26806 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26807 else if (!isRoundModeCurDirection(Sae))
26808 return SDValue();
26809 }
26810 // Default rounding mode.
26811 if (!Cmp.getNode())
26812 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26813
26814 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26815 Subtarget, DAG);
26816 // Need to fill with zeros to ensure the bitcast will produce zeroes
26817 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26818 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26819 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26820 DAG.getVectorIdxConstant(0, dl));
26821 return DAG.getBitcast(MVT::i8, Ins);
26822 }
26823 case COMI: { // Comparison intrinsics
26824 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26825 SDValue LHS = Op.getOperand(1);
26826 SDValue RHS = Op.getOperand(2);
26827 // Some conditions require the operands to be swapped.
26828 if (CC == ISD::SETLT || CC == ISD::SETLE)
26829 std::swap(LHS, RHS);
26830
26831 // For AVX10.2, support EQ and NE.
26832 bool HasAVX10_2_COMX =
26833 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26834
26835 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26836 // For BF type we need to fall back.
26837 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26838
26839 auto ComiOpCode = IntrData->Opc0;
26840 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26841
26842 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26843 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26844
26845 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26846
26847 SDValue SetCC;
26848 switch (CC) {
26849 case ISD::SETEQ: {
26850 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26851 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26852 break;
26853 // (ZF = 1 and PF = 0)
26854 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26855 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26856 break;
26857 }
26858 case ISD::SETNE: {
26859 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26860 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26861 break;
26862 // (ZF = 0 or PF = 1)
26863 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26864 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26865 break;
26866 }
26867 case ISD::SETGT: // (CF = 0 and ZF = 0)
26868 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26869 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26870 break;
26871 }
26872 case ISD::SETGE: // CF = 0
26873 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26874 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26875 break;
26876 default:
26877 llvm_unreachable("Unexpected illegal condition!");
26878 }
26879 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26880 }
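    // Note on the flag tests above: [U]COMIS* sets ZF/PF/CF to 1/1/1 for an
    // unordered comparison (a NaN operand), 0/0/0 for greater, 0/0/1 for less
    // and 1/0/0 for equal. That is why plain SETEQ must also require PF = 0
    // and SETNE must accept PF = 1, while the AVX10.2 COMX path above only
    // needs to test ZF.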
26881 case COMI_RM: { // Comparison intrinsics with Sae
26882 SDValue LHS = Op.getOperand(1);
26883 SDValue RHS = Op.getOperand(2);
26884 unsigned CondVal = Op.getConstantOperandVal(3);
26885 SDValue Sae = Op.getOperand(4);
26886
26887 SDValue FCmp;
26888 if (isRoundModeCurDirection(Sae))
26889 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26890 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26891 else if (isRoundModeSAE(Sae))
26892 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26893 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26894 else
26895 return SDValue();
26896 // Need to fill with zeros to ensure the bitcast will produce zeroes
26897 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26898 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26899 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26900 DAG.getVectorIdxConstant(0, dl));
26901 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26902 DAG.getBitcast(MVT::i16, Ins));
26903 }
26904 case VSHIFT: {
26905 SDValue SrcOp = Op.getOperand(1);
26906 SDValue ShAmt = Op.getOperand(2);
26907 assert(ShAmt.getValueType() == MVT::i32 &&
26908 "Unexpected VSHIFT amount type");
26909
26910 // Catch shift-by-constant.
26911 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26912 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26913 Op.getSimpleValueType(), SrcOp,
26914 CShAmt->getZExtValue(), DAG);
26915
26916 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26917 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26918 SrcOp, ShAmt, 0, Subtarget, DAG);
26919 }
26920 case COMPRESS_EXPAND_IN_REG: {
26921 SDValue Mask = Op.getOperand(3);
26922 SDValue DataToCompress = Op.getOperand(1);
26923 SDValue PassThru = Op.getOperand(2);
26924 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26925 return Op.getOperand(1);
26926
26927 // Avoid false dependency.
26928 if (PassThru.isUndef())
26929 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26930
26931 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26932 Mask);
26933 }
26934 case FIXUPIMM:
26935 case FIXUPIMM_MASKZ: {
26936 SDValue Src1 = Op.getOperand(1);
26937 SDValue Src2 = Op.getOperand(2);
26938 SDValue Src3 = Op.getOperand(3);
26939 SDValue Imm = Op.getOperand(4);
26940 SDValue Mask = Op.getOperand(5);
26941 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26942 ? Src1
26943 : getZeroVector(VT, Subtarget, DAG, dl);
26944
26945 unsigned Opc = IntrData->Opc0;
26946 if (IntrData->Opc1 != 0) {
26947 SDValue Sae = Op.getOperand(6);
26948 if (isRoundModeSAE(Sae))
26949 Opc = IntrData->Opc1;
26950 else if (!isRoundModeCurDirection(Sae))
26951 return SDValue();
26952 }
26953
26954 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26955
26955
26956 if (VT.isVector())
26957 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26958
26959 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26960 }
26961 case ROUNDP: {
26962 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26963 // Clear the upper bits of the rounding immediate so that the legacy
26964 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26965 uint64_t Round = Op.getConstantOperandVal(2);
26966 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26967 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26968 Op.getOperand(1), RoundingMode);
26969 }
26970 case ROUNDS: {
26971 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26972 // Clear the upper bits of the rounding immediate so that the legacy
26973 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26974 uint64_t Round = Op.getConstantOperandVal(3);
26975 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26976 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26977 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26978 }
26979 case BEXTRI: {
26980 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26981
26982 uint64_t Imm = Op.getConstantOperandVal(2);
26983 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26984 Op.getValueType());
26985 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26986 Op.getOperand(1), Control);
26987 }
26988 // ADC/SBB
26989 case ADX: {
26990 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26991 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26992
26993 SDValue Res;
26994 // If the carry in is zero, then we should just use ADD/SUB instead of
26995 // ADC/SBB.
26996 if (isNullConstant(Op.getOperand(1))) {
26997 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26998 Op.getOperand(3));
26999 } else {
27000 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27001 DAG.getAllOnesConstant(dl, MVT::i8));
27002 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27003 Op.getOperand(3), GenCF.getValue(1));
27004 }
27005 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27006 SDValue Results[] = { SetCC, Res };
27007 return DAG.getMergeValues(Results, dl);
27008 }
27009 case CVTPD2PS_MASK:
27010 case CVTPD2DQ_MASK:
27011 case CVTQQ2PS_MASK:
27012 case TRUNCATE_TO_REG: {
27013 SDValue Src = Op.getOperand(1);
27014 SDValue PassThru = Op.getOperand(2);
27015 SDValue Mask = Op.getOperand(3);
27016
27017 if (isAllOnesConstant(Mask))
27018 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27019
27020 MVT SrcVT = Src.getSimpleValueType();
27021 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27022 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27023 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27024 {Src, PassThru, Mask});
27025 }
27026 case TRUNCATE2_TO_REG: {
27027 SDValue Src = Op.getOperand(1);
27028 SDValue Src2 = Op.getOperand(2);
27029 SDValue PassThru = Op.getOperand(3);
27030 SDValue Mask = Op.getOperand(4);
27031
27032 if (isAllOnesConstant(Mask))
27033 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27034
27035 MVT Src2VT = Src2.getSimpleValueType();
27036 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27037 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27038 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27039 {Src, Src2, PassThru, Mask});
27040 }
27041 case CVTPS2PH_MASK: {
27042 SDValue Src = Op.getOperand(1);
27043 SDValue Rnd = Op.getOperand(2);
27044 SDValue PassThru = Op.getOperand(3);
27045 SDValue Mask = Op.getOperand(4);
27046
27047 unsigned RC = 0;
27048 unsigned Opc = IntrData->Opc0;
27049 bool SAE = Src.getValueType().is512BitVector() &&
27050 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27051 if (SAE) {
27052 Opc = X86ISD::CVTPS2PH_SAE;
27053 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27054 }
27055
27056 if (isAllOnesConstant(Mask))
27057 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27058
27059 if (SAE)
27060 Opc = X86ISD::MCVTPS2PH_SAE;
27061 else
27062 Opc = IntrData->Opc1;
27063 MVT SrcVT = Src.getSimpleValueType();
27064 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27065 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27066 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27067 }
27068 case CVTNEPS2BF16_MASK: {
27069 SDValue Src = Op.getOperand(1);
27070 SDValue PassThru = Op.getOperand(2);
27071 SDValue Mask = Op.getOperand(3);
27072
27073 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27074 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27075
27076 // Break false dependency.
27077 if (PassThru.isUndef())
27078 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27079
27080 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27081 Mask);
27082 }
27083 default:
27084 break;
27085 }
27086 }
27087
27088 switch (IntNo) {
27089 default: return SDValue(); // Don't custom lower most intrinsics.
27090
27091 // ptest and testp intrinsics. The intrinsics these come from are designed to
27092 // return an integer value rather than just an instruction, so lower them to the
27093 // ptest or testp pattern plus a setcc for the result.
27094 case Intrinsic::x86_avx512_ktestc_b:
27095 case Intrinsic::x86_avx512_ktestc_w:
27096 case Intrinsic::x86_avx512_ktestc_d:
27097 case Intrinsic::x86_avx512_ktestc_q:
27098 case Intrinsic::x86_avx512_ktestz_b:
27099 case Intrinsic::x86_avx512_ktestz_w:
27100 case Intrinsic::x86_avx512_ktestz_d:
27101 case Intrinsic::x86_avx512_ktestz_q:
27102 case Intrinsic::x86_sse41_ptestz:
27103 case Intrinsic::x86_sse41_ptestc:
27104 case Intrinsic::x86_sse41_ptestnzc:
27105 case Intrinsic::x86_avx_ptestz_256:
27106 case Intrinsic::x86_avx_ptestc_256:
27107 case Intrinsic::x86_avx_ptestnzc_256:
27108 case Intrinsic::x86_avx_vtestz_ps:
27109 case Intrinsic::x86_avx_vtestc_ps:
27110 case Intrinsic::x86_avx_vtestnzc_ps:
27111 case Intrinsic::x86_avx_vtestz_pd:
27112 case Intrinsic::x86_avx_vtestc_pd:
27113 case Intrinsic::x86_avx_vtestnzc_pd:
27114 case Intrinsic::x86_avx_vtestz_ps_256:
27115 case Intrinsic::x86_avx_vtestc_ps_256:
27116 case Intrinsic::x86_avx_vtestnzc_ps_256:
27117 case Intrinsic::x86_avx_vtestz_pd_256:
27118 case Intrinsic::x86_avx_vtestc_pd_256:
27119 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27120 unsigned TestOpc = X86ISD::PTEST;
27121 X86::CondCode X86CC;
27122 switch (IntNo) {
27123 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27124 case Intrinsic::x86_avx512_ktestc_b:
27125 case Intrinsic::x86_avx512_ktestc_w:
27126 case Intrinsic::x86_avx512_ktestc_d:
27127 case Intrinsic::x86_avx512_ktestc_q:
27128 // CF = 1
27129 TestOpc = X86ISD::KTEST;
27130 X86CC = X86::COND_B;
27131 break;
27132 case Intrinsic::x86_avx512_ktestz_b:
27133 case Intrinsic::x86_avx512_ktestz_w:
27134 case Intrinsic::x86_avx512_ktestz_d:
27135 case Intrinsic::x86_avx512_ktestz_q:
27136 TestOpc = X86ISD::KTEST;
27137 X86CC = X86::COND_E;
27138 break;
27139 case Intrinsic::x86_avx_vtestz_ps:
27140 case Intrinsic::x86_avx_vtestz_pd:
27141 case Intrinsic::x86_avx_vtestz_ps_256:
27142 case Intrinsic::x86_avx_vtestz_pd_256:
27143 TestOpc = X86ISD::TESTP;
27144 [[fallthrough]];
27145 case Intrinsic::x86_sse41_ptestz:
27146 case Intrinsic::x86_avx_ptestz_256:
27147 // ZF = 1
27148 X86CC = X86::COND_E;
27149 break;
27150 case Intrinsic::x86_avx_vtestc_ps:
27151 case Intrinsic::x86_avx_vtestc_pd:
27152 case Intrinsic::x86_avx_vtestc_ps_256:
27153 case Intrinsic::x86_avx_vtestc_pd_256:
27154 TestOpc = X86ISD::TESTP;
27155 [[fallthrough]];
27156 case Intrinsic::x86_sse41_ptestc:
27157 case Intrinsic::x86_avx_ptestc_256:
27158 // CF = 1
27159 X86CC = X86::COND_B;
27160 break;
27161 case Intrinsic::x86_avx_vtestnzc_ps:
27162 case Intrinsic::x86_avx_vtestnzc_pd:
27163 case Intrinsic::x86_avx_vtestnzc_ps_256:
27164 case Intrinsic::x86_avx_vtestnzc_pd_256:
27165 TestOpc = X86ISD::TESTP;
27166 [[fallthrough]];
27167 case Intrinsic::x86_sse41_ptestnzc:
27168 case Intrinsic::x86_avx_ptestnzc_256:
27169 // ZF and CF = 0
27170 X86CC = X86::COND_A;
27171 break;
27172 }
27173
27174 SDValue LHS = Op.getOperand(1);
27175 SDValue RHS = Op.getOperand(2);
27176 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27177 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27178 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27179 }
27180
27181 case Intrinsic::x86_sse42_pcmpistria128:
27182 case Intrinsic::x86_sse42_pcmpestria128:
27183 case Intrinsic::x86_sse42_pcmpistric128:
27184 case Intrinsic::x86_sse42_pcmpestric128:
27185 case Intrinsic::x86_sse42_pcmpistrio128:
27186 case Intrinsic::x86_sse42_pcmpestrio128:
27187 case Intrinsic::x86_sse42_pcmpistris128:
27188 case Intrinsic::x86_sse42_pcmpestris128:
27189 case Intrinsic::x86_sse42_pcmpistriz128:
27190 case Intrinsic::x86_sse42_pcmpestriz128: {
27191 unsigned Opcode;
27192 X86::CondCode X86CC;
27193 switch (IntNo) {
27194 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27195 case Intrinsic::x86_sse42_pcmpistria128:
27196 Opcode = X86ISD::PCMPISTR;
27197 X86CC = X86::COND_A;
27198 break;
27199 case Intrinsic::x86_sse42_pcmpestria128:
27200 Opcode = X86ISD::PCMPESTR;
27201 X86CC = X86::COND_A;
27202 break;
27203 case Intrinsic::x86_sse42_pcmpistric128:
27204 Opcode = X86ISD::PCMPISTR;
27205 X86CC = X86::COND_B;
27206 break;
27207 case Intrinsic::x86_sse42_pcmpestric128:
27208 Opcode = X86ISD::PCMPESTR;
27209 X86CC = X86::COND_B;
27210 break;
27211 case Intrinsic::x86_sse42_pcmpistrio128:
27212 Opcode = X86ISD::PCMPISTR;
27213 X86CC = X86::COND_O;
27214 break;
27215 case Intrinsic::x86_sse42_pcmpestrio128:
27216 Opcode = X86ISD::PCMPESTR;
27217 X86CC = X86::COND_O;
27218 break;
27219 case Intrinsic::x86_sse42_pcmpistris128:
27220 Opcode = X86ISD::PCMPISTR;
27221 X86CC = X86::COND_S;
27222 break;
27223 case Intrinsic::x86_sse42_pcmpestris128:
27224 Opcode = X86ISD::PCMPESTR;
27225 X86CC = X86::COND_S;
27226 break;
27227 case Intrinsic::x86_sse42_pcmpistriz128:
27228 Opcode = X86ISD::PCMPISTR;
27229 X86CC = X86::COND_E;
27230 break;
27231 case Intrinsic::x86_sse42_pcmpestriz128:
27232 Opcode = X86ISD::PCMPESTR;
27233 X86CC = X86::COND_E;
27234 break;
27235 }
27236 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27237 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27238 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27239 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27240 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27241 }
27242
27243 case Intrinsic::x86_sse42_pcmpistri128:
27244 case Intrinsic::x86_sse42_pcmpestri128: {
27245 unsigned Opcode;
27246 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27247 Opcode = X86ISD::PCMPISTR;
27248 else
27249 Opcode = X86ISD::PCMPESTR;
27250
27251 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27252 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27253 return DAG.getNode(Opcode, dl, VTs, NewOps);
27254 }
27255
27256 case Intrinsic::x86_sse42_pcmpistrm128:
27257 case Intrinsic::x86_sse42_pcmpestrm128: {
27258 unsigned Opcode;
27259 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27260 Opcode = X86ISD::PCMPISTR;
27261 else
27262 Opcode = X86ISD::PCMPESTR;
27263
27264 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27265 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27266 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27267 }
27268
27269 case Intrinsic::eh_sjlj_lsda: {
27270 MachineFunction &MF = DAG.getMachineFunction();
27271 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27272 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27273 auto &Context = MF.getContext();
27274 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27275 Twine(MF.getFunctionNumber()));
27276 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27277 DAG.getMCSymbol(S, PtrVT));
27278 }
27279
27280 case Intrinsic::x86_seh_lsda: {
27281 // Compute the symbol for the LSDA. We know it'll get emitted later.
27282 MachineFunction &MF = DAG.getMachineFunction();
27283 SDValue Op1 = Op.getOperand(1);
27284 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27285 MCSymbol *LSDASym = MF.getContext().getOrCreateLSDASymbol(
27286 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27287
27288 // Generate a simple absolute symbol reference. This intrinsic is only
27289 // supported on 32-bit Windows, which isn't PIC.
27290 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27291 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27292 }
27293
27294 case Intrinsic::eh_recoverfp: {
27295 SDValue FnOp = Op.getOperand(1);
27296 SDValue IncomingFPOp = Op.getOperand(2);
27297 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27298 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27299 if (!Fn)
27300 report_fatal_error(
27301 "llvm.eh.recoverfp must take a function as the first argument");
27302 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27303 }
27304
27305 case Intrinsic::localaddress: {
27306 // Returns one of the stack, base, or frame pointer registers, depending on
27307 // which is used to reference local variables.
27308 MachineFunction &MF = DAG.getMachineFunction();
27309 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27310 Register Reg;
27311 if (RegInfo->hasBasePointer(MF))
27312 Reg = RegInfo->getBaseRegister();
27313 else { // Handles the SP or FP case.
27314 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27315 if (CantUseFP)
27316 Reg = RegInfo->getPtrSizedStackRegister(MF);
27317 else
27318 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27319 }
27320 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27321 }
27322 case Intrinsic::x86_avx512_vp2intersect_q_512:
27323 case Intrinsic::x86_avx512_vp2intersect_q_256:
27324 case Intrinsic::x86_avx512_vp2intersect_q_128:
27325 case Intrinsic::x86_avx512_vp2intersect_d_512:
27326 case Intrinsic::x86_avx512_vp2intersect_d_256:
27327 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27328 SDLoc DL(Op);
27329 MVT MaskVT = Op.getSimpleValueType();
27330 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27331 SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27332 Op.getOperand(1), Op.getOperand(2));
27333 SDValue Result0 =
27334 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27335 SDValue Result1 =
27336 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27337 return DAG.getMergeValues({Result0, Result1}, DL);
27338 }
27339 case Intrinsic::x86_mmx_pslli_w:
27340 case Intrinsic::x86_mmx_pslli_d:
27341 case Intrinsic::x86_mmx_pslli_q:
27342 case Intrinsic::x86_mmx_psrli_w:
27343 case Intrinsic::x86_mmx_psrli_d:
27344 case Intrinsic::x86_mmx_psrli_q:
27345 case Intrinsic::x86_mmx_psrai_w:
27346 case Intrinsic::x86_mmx_psrai_d: {
27347 SDLoc DL(Op);
27348 SDValue ShAmt = Op.getOperand(2);
27349 // If the argument is a constant, convert it to a target constant.
27350 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27351 // Clamp out of bounds shift amounts since they will otherwise be masked
27352 // to 8-bits which may make it no longer out of bounds.
27353 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27354 if (ShiftAmount == 0)
27355 return Op.getOperand(1);
27356
27357 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27358 Op.getOperand(0), Op.getOperand(1),
27359 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27360 }
27361
27362 unsigned NewIntrinsic;
27363 switch (IntNo) {
27364 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27365 case Intrinsic::x86_mmx_pslli_w:
27366 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27367 break;
27368 case Intrinsic::x86_mmx_pslli_d:
27369 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27370 break;
27371 case Intrinsic::x86_mmx_pslli_q:
27372 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27373 break;
27374 case Intrinsic::x86_mmx_psrli_w:
27375 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27376 break;
27377 case Intrinsic::x86_mmx_psrli_d:
27378 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27379 break;
27380 case Intrinsic::x86_mmx_psrli_q:
27381 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27382 break;
27383 case Intrinsic::x86_mmx_psrai_w:
27384 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27385 break;
27386 case Intrinsic::x86_mmx_psrai_d:
27387 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27388 break;
27389 }
27390
27391 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
27392 // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27393 // MMX register.
27394 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27395 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27396 DAG.getTargetConstant(NewIntrinsic, DL,
27397 getPointerTy(DAG.getDataLayout())),
27398 Op.getOperand(1), ShAmt);
27399 }
27400 case Intrinsic::thread_pointer: {
27401 if (Subtarget.isTargetELF()) {
27402 SDLoc dl(Op);
27403 EVT PtrVT = Op.getValueType();
27404 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27405 Value *Ptr = Constant::getNullValue(PointerType::get(
27406 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27407 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27408 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27409 }
27410 report_fatal_error(
27411 "Target OS doesn't support __builtin_thread_pointer() yet.");
27412 }
27413 }
27414}
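// Illustrative sketch (not part of the lowering above): the ptest/testp
// intrinsics handled in LowerINTRINSIC_WO_CHAIN all become one flag-setting
// node plus a setcc. A scalar model of the PTEST flag semantics on a 128-bit
// value held as two 64-bit halves:
#include <cstdint>

struct V128 { uint64_t Lo, Hi; };

// testz -> ZF, testc -> CF, testnzc -> "above" (ZF == 0 && CF == 0).
inline bool modelPTestZ(V128 A, V128 B) {
  return ((A.Lo & B.Lo) | (A.Hi & B.Hi)) == 0;       // ZF: (A AND B) == 0
}
inline bool modelPTestC(V128 A, V128 B) {
  return ((~A.Lo & B.Lo) | (~A.Hi & B.Hi)) == 0;     // CF: (~A AND B) == 0
}
inline bool modelPTestNZC(V128 A, V128 B) {
  return !modelPTestZ(A, B) && !modelPTestC(A, B);   // COND_A above
}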
27415
27416 static SDValue getAVX2GatherNode(SDValue Op, SelectionDAG &DAG,
27417 SDValue Src, SDValue Mask, SDValue Base,
27418 SDValue Index, SDValue ScaleOp, SDValue Chain,
27419 const X86Subtarget &Subtarget) {
27420 SDLoc dl(Op);
27421 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27422 // Scale must be constant.
27423 if (!C)
27424 return SDValue();
27425 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27426 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27427 TLI.getPointerTy(DAG.getDataLayout()));
27428 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27429 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27430 // If source is undef or we know it won't be used, use a zero vector
27431 // to break register dependency.
27432 // TODO: use undef instead and let BreakFalseDeps deal with it?
27433 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27434 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27435
27436 // Cast mask to an integer type.
27437 Mask = DAG.getBitcast(MaskVT, Mask);
27438
27439 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27440
27441 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27442 SDValue Res =
27443 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27444 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27445 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27446}
27447
27448 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27449 SDValue Src, SDValue Mask, SDValue Base,
27450 SDValue Index, SDValue ScaleOp, SDValue Chain,
27451 const X86Subtarget &Subtarget) {
27452 MVT VT = Op.getSimpleValueType();
27453 SDLoc dl(Op);
27454 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27455 // Scale must be constant.
27456 if (!C)
27457 return SDValue();
27458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27459 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27460 TLI.getPointerTy(DAG.getDataLayout()));
27461 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27462 VT.getVectorNumElements());
27463 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27464
27465 // We support two versions of the gather intrinsics. One with scalar mask and
27466 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27467 if (Mask.getValueType() != MaskVT)
27468 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27469
27470 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27471 // If source is undef or we know it won't be used, use a zero vector
27472 // to break register dependency.
27473 // TODO: use undef instead and let BreakFalseDeps deal with it?
27474 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27475 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27476
27477 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27478
27479 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27480 SDValue Res =
27481 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27482 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27483 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27484}
27485
27486 static SDValue getScatterNode(SDValue Op, SelectionDAG &DAG,
27487 SDValue Src, SDValue Mask, SDValue Base,
27488 SDValue Index, SDValue ScaleOp, SDValue Chain,
27489 const X86Subtarget &Subtarget) {
27490 SDLoc dl(Op);
27491 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27492 // Scale must be constant.
27493 if (!C)
27494 return SDValue();
27495 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27496 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27497 TLI.getPointerTy(DAG.getDataLayout()));
27498 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27499 Src.getSimpleValueType().getVectorNumElements());
27500 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27501
27502 // We support two versions of the scatter intrinsics. One with scalar mask and
27503 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27504 if (Mask.getValueType() != MaskVT)
27505 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27506
27507 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27508
27509 SDVTList VTs = DAG.getVTList(MVT::Other);
27510 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27511 SDValue Res =
27512 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27513 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27514 return Res;
27515}
27516
27517 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27518 SDValue Mask, SDValue Base, SDValue Index,
27519 SDValue ScaleOp, SDValue Chain,
27520 const X86Subtarget &Subtarget) {
27521 SDLoc dl(Op);
27522 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27523 // Scale must be constant.
27524 if (!C)
27525 return SDValue();
27526 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27527 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27528 TLI.getPointerTy(DAG.getDataLayout()));
27529 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27530 SDValue Segment = DAG.getRegister(0, MVT::i32);
27531 MVT MaskVT =
27532 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27533 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27534 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27535 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27536 return SDValue(Res, 0);
27537}
27538
27539/// Handles the lowering of builtin intrinsics with chain that return their
27540/// value into registers EDX:EAX.
27541/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27542/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27543/// TargetOpcode.
27544/// Returns a Glue value which can be used to add extra copy-from-reg if the
27545/// expanded intrinsic implicitly defines extra registers (i.e. not just
27546/// EDX:EAX).
27547static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27548 SelectionDAG &DAG,
27549 unsigned TargetOpcode,
27550 unsigned SrcReg,
27551 const X86Subtarget &Subtarget,
27552 SmallVectorImpl<SDValue> &Results) {
27553 SDValue Chain = N->getOperand(0);
27554 SDValue Glue;
27555
27556 if (SrcReg) {
27557 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27558 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27559 Glue = Chain.getValue(1);
27560 }
27561
27562 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27563 SDValue N1Ops[] = {Chain, Glue};
27564 SDNode *N1 = DAG.getMachineNode(
27565 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27566 Chain = SDValue(N1, 0);
27567
27568 // Reads the content of XCR and returns it in registers EDX:EAX.
27569 SDValue LO, HI;
27570 if (Subtarget.is64Bit()) {
27571 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27572 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27573 LO.getValue(2));
27574 } else {
27575 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27576 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27577 LO.getValue(2));
27578 }
27579 Chain = HI.getValue(1);
27580 Glue = HI.getValue(2);
27581
27582 if (Subtarget.is64Bit()) {
27583 // Merge the two 32-bit values into a 64-bit one.
27584 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27585 DAG.getConstant(32, DL, MVT::i8));
27586 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27587 Results.push_back(Chain);
27588 return Glue;
27589 }
27590
27591 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27592 SDValue Ops[] = { LO, HI };
27593 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27594 Results.push_back(Pair);
27595 Results.push_back(Chain);
27596 return Glue;
27597}
27598
27599/// Handles the lowering of builtin intrinsics that read the time stamp counter
27600/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27601/// READCYCLECOUNTER nodes.
27602static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27603 SelectionDAG &DAG,
27604 const X86Subtarget &Subtarget,
27605 SmallVectorImpl<SDValue> &Results) {
27606 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27607 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27608 // and the EAX register is loaded with the low-order 32 bits.
27609 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27610 /* NoRegister */0, Subtarget,
27611 Results);
27612 if (Opcode != X86::RDTSCP)
27613 return;
27614
27615 SDValue Chain = Results[1];
27616 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
27617 // the ECX register. Add 'ecx' explicitly to the chain.
27618 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27619 Results[1] = ecx;
27620 Results.push_back(ecx.getValue(1));
27621}
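// Usage sketch (illustrative): IR such as
//   %t = call i64 @llvm.readcyclecounter()
// is lowered by LowerREADCYCLECOUNTER below, which routes through this helper
// and expandIntrinsicWChainHelper to an RDTSC plus the EDX:EAX merge.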
27622
27623static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27624 SelectionDAG &DAG) {
27625 SmallVector<SDValue, 3> Results;
27626 SDLoc DL(Op);
27627 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27628 Results);
27629 return DAG.getMergeValues(Results, DL);
27630}
27631
27632static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27633 MachineFunction &MF = DAG.getMachineFunction();
27634 SDValue Chain = Op.getOperand(0);
27635 SDValue RegNode = Op.getOperand(2);
27636 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27637 if (!EHInfo)
27638 report_fatal_error("EH registrations only live in functions using WinEH");
27639
27640 // Cast the operand to an alloca, and remember the frame index.
27641 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27642 if (!FINode)
27643 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27644 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27645
27646 // Return the chain operand without making any DAG nodes.
27647 return Chain;
27648}
27649
27650static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27651 MachineFunction &MF = DAG.getMachineFunction();
27652 SDValue Chain = Op.getOperand(0);
27653 SDValue EHGuard = Op.getOperand(2);
27654 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27655 if (!EHInfo)
27656 report_fatal_error("EHGuard only live in functions using WinEH");
27657
27658 // Cast the operand to an alloca, and remember the frame index.
27659 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27660 if (!FINode)
27661 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27662 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27663
27664 // Return the chain operand without making any DAG nodes.
27665 return Chain;
27666}
27667
27668/// Emit Truncating Store with signed or unsigned saturation.
27669static SDValue
27670EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27671 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27672 SelectionDAG &DAG) {
27673 SDVTList VTs = DAG.getVTList(MVT::Other);
27674 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27675 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27676 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27677 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27678}
27679
27680/// Emit Masked Truncating Store with signed or unsigned saturation.
27681static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27682 const SDLoc &DL,
27683 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27684 MachineMemOperand *MMO, SelectionDAG &DAG) {
27685 SDVTList VTs = DAG.getVTList(MVT::Other);
27686 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27687 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27688 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27689}
27690
27691bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27692 const MachineFunction &MF) {
27693 if (!Subtarget.is64Bit())
27694 return false;
27695 // 64-bit targets support extended Swift async frame setup,
27696 // except for targets that use the windows 64 prologue.
27697 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27698}
27699
27700static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27701 SelectionDAG &DAG) {
27702 unsigned IntNo = Op.getConstantOperandVal(1);
27703 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27704 if (!IntrData) {
27705 switch (IntNo) {
27706
27707 case Intrinsic::swift_async_context_addr: {
27708 SDLoc dl(Op);
27709 auto &MF = DAG.getMachineFunction();
27710 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27711 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27712 MF.getFrameInfo().setFrameAddressIsTaken(true);
27713 X86FI->setHasSwiftAsyncContext(true);
27714 SDValue Chain = Op->getOperand(0);
27715 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27716 SDValue Result =
27717 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27718 DAG.getTargetConstant(8, dl, MVT::i32)),
27719 0);
27720 // Return { result, chain }.
27721 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27722 CopyRBP.getValue(1));
27723 } else {
27724 // No special extended frame, create or reuse an existing stack slot.
27725 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27726 if (!X86FI->getSwiftAsyncContextFrameIdx())
27727 X86FI->setSwiftAsyncContextFrameIdx(
27728 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27729 false));
27730 SDValue Result =
27731 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27732 PtrSize == 8 ? MVT::i64 : MVT::i32);
27733 // Return { result, chain }.
27734 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27735 Op->getOperand(0));
27736 }
27737 }
27738
27739 case llvm::Intrinsic::x86_seh_ehregnode:
27740 return MarkEHRegistrationNode(Op, DAG);
27741 case llvm::Intrinsic::x86_seh_ehguard:
27742 return MarkEHGuard(Op, DAG);
27743 case llvm::Intrinsic::x86_rdpkru: {
27744 SDLoc dl(Op);
27745 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27746 // Create a RDPKRU node and pass 0 to the ECX parameter.
27747 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27748 DAG.getConstant(0, dl, MVT::i32));
27749 }
27750 case llvm::Intrinsic::x86_wrpkru: {
27751 SDLoc dl(Op);
27752 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27753 // to the EDX and ECX parameters.
27754 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27755 Op.getOperand(0), Op.getOperand(2),
27756 DAG.getConstant(0, dl, MVT::i32),
27757 DAG.getConstant(0, dl, MVT::i32));
27758 }
27759 case llvm::Intrinsic::asan_check_memaccess: {
27760 // Mark this as adjustsStack because it will be lowered to a call.
27761 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27762 // Don't do anything here, we will expand these intrinsics out later.
27763 return Op;
27764 }
27765 case llvm::Intrinsic::x86_flags_read_u32:
27766 case llvm::Intrinsic::x86_flags_read_u64:
27767 case llvm::Intrinsic::x86_flags_write_u32:
27768 case llvm::Intrinsic::x86_flags_write_u64: {
27769 // We need a frame pointer because this will get lowered to a PUSH/POP
27770 // sequence.
27771 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27772 MFI.setHasCopyImplyingStackAdjustment(true);
27773 // Don't do anything here, we will expand these intrinsics out later
27774 // during FinalizeISel in EmitInstrWithCustomInserter.
27775 return Op;
27776 }
27777 case Intrinsic::x86_lwpins32:
27778 case Intrinsic::x86_lwpins64:
27779 case Intrinsic::x86_umwait:
27780 case Intrinsic::x86_tpause: {
27781 SDLoc dl(Op);
27782 SDValue Chain = Op->getOperand(0);
27783 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27784 unsigned Opcode;
27785
27786 switch (IntNo) {
27787 default: llvm_unreachable("Impossible intrinsic");
27788 case Intrinsic::x86_umwait:
27789 Opcode = X86ISD::UMWAIT;
27790 break;
27791 case Intrinsic::x86_tpause:
27792 Opcode = X86ISD::TPAUSE;
27793 break;
27794 case Intrinsic::x86_lwpins32:
27795 case Intrinsic::x86_lwpins64:
27796 Opcode = X86ISD::LWPINS;
27797 break;
27798 }
27799
27800 SDValue Operation =
27801 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27802 Op->getOperand(3), Op->getOperand(4));
27803 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27804 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27805 Operation.getValue(1));
27806 }
27807 case Intrinsic::x86_enqcmd:
27808 case Intrinsic::x86_enqcmds: {
27809 SDLoc dl(Op);
27810 SDValue Chain = Op.getOperand(0);
27811 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27812 unsigned Opcode;
27813 switch (IntNo) {
27814 default: llvm_unreachable("Impossible intrinsic!");
27815 case Intrinsic::x86_enqcmd:
27816 Opcode = X86ISD::ENQCMD;
27817 break;
27818 case Intrinsic::x86_enqcmds:
27819 Opcode = X86ISD::ENQCMDS;
27820 break;
27821 }
27822 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27823 Op.getOperand(3));
27824 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27825 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27826 Operation.getValue(1));
27827 }
27828 case Intrinsic::x86_aesenc128kl:
27829 case Intrinsic::x86_aesdec128kl:
27830 case Intrinsic::x86_aesenc256kl:
27831 case Intrinsic::x86_aesdec256kl: {
27832 SDLoc DL(Op);
27833 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27834 SDValue Chain = Op.getOperand(0);
27835 unsigned Opcode;
27836
27837 switch (IntNo) {
27838 default: llvm_unreachable("Impossible intrinsic");
27839 case Intrinsic::x86_aesenc128kl:
27840 Opcode = X86ISD::AESENC128KL;
27841 break;
27842 case Intrinsic::x86_aesdec128kl:
27843 Opcode = X86ISD::AESDEC128KL;
27844 break;
27845 case Intrinsic::x86_aesenc256kl:
27846 Opcode = X86ISD::AESENC256KL;
27847 break;
27848 case Intrinsic::x86_aesdec256kl:
27849 Opcode = X86ISD::AESDEC256KL;
27850 break;
27851 }
27852
27853 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27854 MachineMemOperand *MMO = MemIntr->getMemOperand();
27855 EVT MemVT = MemIntr->getMemoryVT();
27856 SDValue Operation = DAG.getMemIntrinsicNode(
27857 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27858 MMO);
27859 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27860
27861 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27862 {ZF, Operation.getValue(0), Operation.getValue(2)});
27863 }
27864 case Intrinsic::x86_aesencwide128kl:
27865 case Intrinsic::x86_aesdecwide128kl:
27866 case Intrinsic::x86_aesencwide256kl:
27867 case Intrinsic::x86_aesdecwide256kl: {
27868 SDLoc DL(Op);
27869 SDVTList VTs = DAG.getVTList(
27870 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27871 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27872 SDValue Chain = Op.getOperand(0);
27873 unsigned Opcode;
27874
27875 switch (IntNo) {
27876 default: llvm_unreachable("Impossible intrinsic");
27877 case Intrinsic::x86_aesencwide128kl:
27878 Opcode = X86ISD::AESENCWIDE128KL;
27879 break;
27880 case Intrinsic::x86_aesdecwide128kl:
27881 Opcode = X86ISD::AESDECWIDE128KL;
27882 break;
27883 case Intrinsic::x86_aesencwide256kl:
27884 Opcode = X86ISD::AESENCWIDE256KL;
27885 break;
27886 case Intrinsic::x86_aesdecwide256kl:
27887 Opcode = X86ISD::AESDECWIDE256KL;
27888 break;
27889 }
27890
27891 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27892 MachineMemOperand *MMO = MemIntr->getMemOperand();
27893 EVT MemVT = MemIntr->getMemoryVT();
27894 SDValue Operation = DAG.getMemIntrinsicNode(
27895 Opcode, DL, VTs,
27896 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27897 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27898 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27899 MemVT, MMO);
27900 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27901
27902 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27903 {ZF, Operation.getValue(1), Operation.getValue(2),
27904 Operation.getValue(3), Operation.getValue(4),
27905 Operation.getValue(5), Operation.getValue(6),
27906 Operation.getValue(7), Operation.getValue(8),
27907 Operation.getValue(9)});
27908 }
27909 case Intrinsic::x86_testui: {
27910 SDLoc dl(Op);
27911 SDValue Chain = Op.getOperand(0);
27912 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27913 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27914 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27915 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27916 Operation.getValue(1));
27917 }
27918 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27919 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27920 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27921 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27922 case Intrinsic::x86_t2rpntlvwz0_internal:
27923 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27924 case Intrinsic::x86_t2rpntlvwz1_internal:
27925 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27926 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27927 X86MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
27928 unsigned IntNo = Op.getConstantOperandVal(1);
27929 unsigned Opc = 0;
27930 switch (IntNo) {
27931 default:
27932 llvm_unreachable("Unexpected intrinsic!");
27933 case Intrinsic::x86_t2rpntlvwz0_internal:
27934 Opc = X86::PT2RPNTLVWZ0V;
27935 break;
27936 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27937 Opc = X86::PT2RPNTLVWZ0T1V;
27938 break;
27939 case Intrinsic::x86_t2rpntlvwz1_internal:
27940 Opc = X86::PT2RPNTLVWZ1V;
27941 break;
27942 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27943 Opc = X86::PT2RPNTLVWZ1T1V;
27944 break;
27945 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27946 Opc = X86::PT2RPNTLVWZ0RSV;
27947 break;
27948 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27949 Opc = X86::PT2RPNTLVWZ0RST1V;
27950 break;
27951 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27952 Opc = X86::PT2RPNTLVWZ1RSV;
27953 break;
27954 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27955 Opc = X86::PT2RPNTLVWZ1RST1V;
27956 break;
27957 }
27958
27959 SDLoc DL(Op);
27960 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27961
27962 SDValue Ops[] = {Op.getOperand(2), // Row
27963 Op.getOperand(3), // Col0
27964 Op.getOperand(4), // Col1
27965 Op.getOperand(5), // Base
27966 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27967 Op.getOperand(6), // Index
27968 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27969 DAG.getRegister(0, MVT::i16), // Segment
27970 Op.getOperand(0)}; // Chain
27971
27972 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27973 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27974 SDValue(Res, 0));
27975 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27976 SDValue(Res, 0));
27977 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27978 }
27979 case Intrinsic::x86_atomic_bts_rm:
27980 case Intrinsic::x86_atomic_btc_rm:
27981 case Intrinsic::x86_atomic_btr_rm: {
27982 SDLoc DL(Op);
27983 MVT VT = Op.getSimpleValueType();
27984 SDValue Chain = Op.getOperand(0);
27985 SDValue Op1 = Op.getOperand(2);
27986 SDValue Op2 = Op.getOperand(3);
27987 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27988 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27989 : X86ISD::LBTR_RM;
27990 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27991 SDValue Res =
27992 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27993 {Chain, Op1, Op2}, VT, MMO);
27994 Chain = Res.getValue(1);
27995 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27996 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27997 }
27998 case Intrinsic::x86_atomic_bts:
27999 case Intrinsic::x86_atomic_btc:
28000 case Intrinsic::x86_atomic_btr: {
28001 SDLoc DL(Op);
28002 MVT VT = Op.getSimpleValueType();
28003 SDValue Chain = Op.getOperand(0);
28004 SDValue Op1 = Op.getOperand(2);
28005 SDValue Op2 = Op.getOperand(3);
28006 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28007 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28008 : X86ISD::LBTR;
28009 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28010 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28011 SDValue Res =
28012 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28013 {Chain, Op1, Op2, Size}, VT, MMO);
28014 Chain = Res.getValue(1);
28015 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28016 unsigned Imm = Op2->getAsZExtVal();
28017 if (Imm)
28018 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28019 DAG.getShiftAmountConstant(Imm, VT, DL));
28020 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28021 }
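 // Worked example (illustrative): for a bit-test-and-set on bit 3, the CF
 // result of LBTS is materialized with SETB, zero-extended to VT, and then
 // shifted left by 3, so the returned value reports the original state of
 // bit 3 in bit position 3 rather than in bit 0.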
28022 case Intrinsic::x86_cmpccxadd32:
28023 case Intrinsic::x86_cmpccxadd64: {
28024 SDLoc DL(Op);
28025 SDValue Chain = Op.getOperand(0);
28026 SDValue Addr = Op.getOperand(2);
28027 SDValue Src1 = Op.getOperand(3);
28028 SDValue Src2 = Op.getOperand(4);
28029 SDValue CC = Op.getOperand(5);
28030 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28031 SDValue Operation = DAG.getMemIntrinsicNode(
28032 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28033 MVT::i32, MMO);
28034 return Operation;
28035 }
28036 case Intrinsic::x86_aadd32:
28037 case Intrinsic::x86_aadd64:
28038 case Intrinsic::x86_aand32:
28039 case Intrinsic::x86_aand64:
28040 case Intrinsic::x86_aor32:
28041 case Intrinsic::x86_aor64:
28042 case Intrinsic::x86_axor32:
28043 case Intrinsic::x86_axor64: {
28044 SDLoc DL(Op);
28045 SDValue Chain = Op.getOperand(0);
28046 SDValue Op1 = Op.getOperand(2);
28047 SDValue Op2 = Op.getOperand(3);
28048 MVT VT = Op2.getSimpleValueType();
28049 unsigned Opc = 0;
28050 switch (IntNo) {
28051 default:
28052 llvm_unreachable("Unknown Intrinsic");
28053 case Intrinsic::x86_aadd32:
28054 case Intrinsic::x86_aadd64:
28055 Opc = X86ISD::AADD;
28056 break;
28057 case Intrinsic::x86_aand32:
28058 case Intrinsic::x86_aand64:
28059 Opc = X86ISD::AAND;
28060 break;
28061 case Intrinsic::x86_aor32:
28062 case Intrinsic::x86_aor64:
28063 Opc = X86ISD::AOR;
28064 break;
28065 case Intrinsic::x86_axor32:
28066 case Intrinsic::x86_axor64:
28067 Opc = X86ISD::AXOR;
28068 break;
28069 }
28070 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28071 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28072 {Chain, Op1, Op2}, VT, MMO);
28073 }
28074 case Intrinsic::x86_atomic_add_cc:
28075 case Intrinsic::x86_atomic_sub_cc:
28076 case Intrinsic::x86_atomic_or_cc:
28077 case Intrinsic::x86_atomic_and_cc:
28078 case Intrinsic::x86_atomic_xor_cc: {
28079 SDLoc DL(Op);
28080 SDValue Chain = Op.getOperand(0);
28081 SDValue Op1 = Op.getOperand(2);
28082 SDValue Op2 = Op.getOperand(3);
28083 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28084 MVT VT = Op2.getSimpleValueType();
28085 unsigned Opc = 0;
28086 switch (IntNo) {
28087 default:
28088 llvm_unreachable("Unknown Intrinsic");
28089 case Intrinsic::x86_atomic_add_cc:
28090 Opc = X86ISD::LADD;
28091 break;
28092 case Intrinsic::x86_atomic_sub_cc:
28093 Opc = X86ISD::LSUB;
28094 break;
28095 case Intrinsic::x86_atomic_or_cc:
28096 Opc = X86ISD::LOR;
28097 break;
28098 case Intrinsic::x86_atomic_and_cc:
28099 Opc = X86ISD::LAND;
28100 break;
28101 case Intrinsic::x86_atomic_xor_cc:
28102 Opc = X86ISD::LXOR;
28103 break;
28104 }
28105 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28106 SDValue LockArith =
28107 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28108 {Chain, Op1, Op2}, VT, MMO);
28109 Chain = LockArith.getValue(1);
28110 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28111 }
28112 }
28113 return SDValue();
28114 }
28115
28116 SDLoc dl(Op);
28117 switch(IntrData->Type) {
28118 default: llvm_unreachable("Unknown Intrinsic Type");
28119 case RDSEED:
28120 case RDRAND: {
28121 // Emit the node with the right value type.
28122 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28123 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28124
28125 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28126 // Otherwise return the value from Rand, which is always 0, casted to i32.
28127 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28128 DAG.getConstant(1, dl, Op->getValueType(1)),
28129 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28130 SDValue(Result.getNode(), 1)};
28131 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28132
28133 // Return { result, isValid, chain }.
28134 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28135 SDValue(Result.getNode(), 2));
28136 }
28137 case GATHER_AVX2: {
28138 SDValue Chain = Op.getOperand(0);
28139 SDValue Src = Op.getOperand(2);
28140 SDValue Base = Op.getOperand(3);
28141 SDValue Index = Op.getOperand(4);
28142 SDValue Mask = Op.getOperand(5);
28143 SDValue Scale = Op.getOperand(6);
28144 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28145 Scale, Chain, Subtarget);
28146 }
28147 case GATHER: {
28148 //gather(v1, mask, index, base, scale);
28149 SDValue Chain = Op.getOperand(0);
28150 SDValue Src = Op.getOperand(2);
28151 SDValue Base = Op.getOperand(3);
28152 SDValue Index = Op.getOperand(4);
28153 SDValue Mask = Op.getOperand(5);
28154 SDValue Scale = Op.getOperand(6);
28155 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28156 Chain, Subtarget);
28157 }
28158 case SCATTER: {
28159 //scatter(base, mask, index, v1, scale);
28160 SDValue Chain = Op.getOperand(0);
28161 SDValue Base = Op.getOperand(2);
28162 SDValue Mask = Op.getOperand(3);
28163 SDValue Index = Op.getOperand(4);
28164 SDValue Src = Op.getOperand(5);
28165 SDValue Scale = Op.getOperand(6);
28166 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28167 Scale, Chain, Subtarget);
28168 }
28169 case PREFETCH: {
28170 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28171 assert((HintVal == 2 || HintVal == 3) &&
28172 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28173 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28174 SDValue Chain = Op.getOperand(0);
28175 SDValue Mask = Op.getOperand(2);
28176 SDValue Index = Op.getOperand(3);
28177 SDValue Base = Op.getOperand(4);
28178 SDValue Scale = Op.getOperand(5);
28179 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28180 Subtarget);
28181 }
28182 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28183 case RDTSC: {
28184 SmallVector<SDValue, 2> Results;
28185 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28186 Results);
28187 return DAG.getMergeValues(Results, dl);
28188 }
28189 // Read Performance Monitoring Counters.
28190 case RDPMC:
28191 // Read Processor Register.
28192 case RDPRU:
28193 // GetExtended Control Register.
28194 case XGETBV: {
28195 SmallVector<SDValue, 2> Results;
28196
28197 // RDPMC uses ECX to select the index of the performance counter to read.
28198 // RDPRU uses ECX to select the processor register to read.
28199 // XGETBV uses ECX to select the index of the XCR register to return.
28200 // The result is stored into registers EDX:EAX.
28201 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28202 Subtarget, Results);
28203 return DAG.getMergeValues(Results, dl);
28204 }
28205 // XTEST intrinsics.
28206 case XTEST: {
28207 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28208 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28209
28210 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28211 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28212 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28213 Ret, SDValue(InTrans.getNode(), 1));
28214 }
28215 case TRUNCATE_TO_MEM_VI8:
28216 case TRUNCATE_TO_MEM_VI16:
28217 case TRUNCATE_TO_MEM_VI32: {
28218 SDValue Mask = Op.getOperand(4);
28219 SDValue DataToTruncate = Op.getOperand(3);
28220 SDValue Addr = Op.getOperand(2);
28221 SDValue Chain = Op.getOperand(0);
28222
28223 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28224 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28225
28226 EVT MemVT = MemIntr->getMemoryVT();
28227
28228 uint16_t TruncationOp = IntrData->Opc0;
28229 switch (TruncationOp) {
28230 case X86ISD::VTRUNC: {
28231 if (isAllOnesConstant(Mask)) // return just a truncate store
28232 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28233 MemIntr->getMemOperand());
28234
28235 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28236 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28237 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28238
28239 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28240 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28241 true /* truncating */);
28242 }
28243 case X86ISD::VTRUNCUS:
28244 case X86ISD::VTRUNCS: {
28245 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28246 if (isAllOnesConstant(Mask))
28247 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28248 MemIntr->getMemOperand(), DAG);
28249
28250 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28251 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28252
28253 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28254 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28255 }
28256 default:
28257 llvm_unreachable("Unsupported truncstore intrinsic");
28258 }
28259 }
28260 case INTR_TYPE_CAST_MMX:
28261 return SDValue(); // handled in combineINTRINSIC_*
28262 }
28263}
28264
28265SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28266 SelectionDAG &DAG) const {
28267 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28268 MFI.setReturnAddressIsTaken(true);
28269
28270 unsigned Depth = Op.getConstantOperandVal(0);
28271 SDLoc dl(Op);
28272 EVT PtrVT = Op.getValueType();
28273
28274 if (Depth > 0) {
28275 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28276 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28277 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28278 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28279 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28280 MachinePointerInfo());
28281 }
28282
28283 // Just load the return address.
28284 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28285 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28286 MachinePointerInfo());
28287}
28288
28289SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28290 SelectionDAG &DAG) const {
28291 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28292 return getReturnAddressFrameIndex(DAG);
28293}
28294
28295SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28296 MachineFunction &MF = DAG.getMachineFunction();
28297 MachineFrameInfo &MFI = MF.getFrameInfo();
28298 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28299 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28300 EVT VT = Op.getValueType();
28301
28302 MFI.setFrameAddressIsTaken(true);
28303
28304 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28305 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28306 // is not possible to crawl up the stack without looking at the unwind codes
28307 // simultaneously.
28308 int FrameAddrIndex = FuncInfo->getFAIndex();
28309 if (!FrameAddrIndex) {
28310 // Set up a frame object for the return address.
28311 unsigned SlotSize = RegInfo->getSlotSize();
28312 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28313 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28314 FuncInfo->setFAIndex(FrameAddrIndex);
28315 }
28316 return DAG.getFrameIndex(FrameAddrIndex, VT);
28317 }
28318
28319 Register FrameReg =
28320 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28321 SDLoc dl(Op); // FIXME probably not meaningful
28322 unsigned Depth = Op.getConstantOperandVal(0);
28323 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28324 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28325 "Invalid Frame Register!");
28326 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28327 while (Depth--)
28328 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28329 MachinePointerInfo());
28330 return FrameAddr;
28331}
28332
28333// FIXME? Maybe this could be a TableGen attribute on some registers and
28334// this table could be generated automatically from RegInfo.
28335Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28336 const MachineFunction &MF) const {
28337 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28338
28340 .Case("esp", X86::ESP)
28341 .Case("rsp", X86::RSP)
28342 .Case("ebp", X86::EBP)
28343 .Case("rbp", X86::RBP)
28344 .Case("r14", X86::R14)
28345 .Case("r15", X86::R15)
28346 .Default(0);
28347
28348 if (Reg == X86::EBP || Reg == X86::RBP) {
28349 if (!TFI.hasFP(MF))
28350 report_fatal_error("register " + StringRef(RegName) +
28351 " is allocatable: function has no frame pointer");
28352#ifndef NDEBUG
28353 else {
28354 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28355 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28356 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28357 "Invalid Frame Register!");
28358 }
28359#endif
28360 }
28361
28362 return Reg;
28363}
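// Usage sketch (illustrative): a global register variable such as
//   register unsigned long StackPtr __asm__("rsp");
// is read through @llvm.read_register, which resolves the name via this hook
// to X86::RSP.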
28364
28365SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28366 SelectionDAG &DAG) const {
28367 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28368 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28369}
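// Note (illustrative): with the default 8-byte slot size on x86-64 this is
// 16 bytes, i.e. the saved frame pointer plus the return address that sit
// between the frame pointer and the first incoming stack argument.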
28370
28371Register X86TargetLowering::getExceptionPointerRegister(
28372 const Constant *PersonalityFn) const {
28373 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28374 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28375
28376 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28377}
28378
28379Register X86TargetLowering::getExceptionSelectorRegister(
28380 const Constant *PersonalityFn) const {
28381 // Funclet personalities don't use selectors (the runtime does the selection).
28382 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28383 return X86::NoRegister;
28384 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28385}
28386
28387bool X86TargetLowering::needsFixedCatchObjects() const {
28388 return Subtarget.isTargetWin64();
28389}
28390
28391SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28392 SDValue Chain = Op.getOperand(0);
28393 SDValue Offset = Op.getOperand(1);
28394 SDValue Handler = Op.getOperand(2);
28395 SDLoc dl (Op);
28396
28397 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28398 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28399 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28400 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28401 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28402 "Invalid Frame Register!");
28403 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28404 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28405
28406 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28407 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28408 dl));
28409 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28410 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28411 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28412
28413 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28414 DAG.getRegister(StoreAddrReg, PtrVT));
28415}
28416
28417SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28418 SelectionDAG &DAG) const {
28419 SDLoc DL(Op);
28420 // If the subtarget is not 64bit, we may need the global base reg
28421 // after isel expand pseudo, i.e., after CGBR pass ran.
28422 // Therefore, ask for the GlobalBaseReg now, so that the pass
28423 // inserts the code for us in case we need it.
28424 // Otherwise, we will end up in a situation where we will
28425 // reference a virtual register that is not defined!
28426 if (!Subtarget.is64Bit()) {
28427 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28428 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28429 }
28430 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28431 DAG.getVTList(MVT::i32, MVT::Other),
28432 Op.getOperand(0), Op.getOperand(1));
28433}
28434
28435SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28436 SelectionDAG &DAG) const {
28437 SDLoc DL(Op);
28438 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28439 Op.getOperand(0), Op.getOperand(1));
28440}
28441
28442SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28443 SelectionDAG &DAG) const {
28444 SDLoc DL(Op);
28445 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28446 Op.getOperand(0));
28447}
28448
28449static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28450 return Op.getOperand(0);
28451}
28452
28453SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28454 SelectionDAG &DAG) const {
28455 SDValue Root = Op.getOperand(0);
28456 SDValue Trmp = Op.getOperand(1); // trampoline
28457 SDValue FPtr = Op.getOperand(2); // nested function
28458 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28459 SDLoc dl (Op);
28460
28461 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28462 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28463
28464 if (Subtarget.is64Bit()) {
28465 SDValue OutChains[6];
28466
28467 // Large code-model.
28468 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28469 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28470
28471 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28472 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28473
28474 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28475
28476 // Load the pointer to the nested function into R11.
28477 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28478 SDValue Addr = Trmp;
28479 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28480 Addr, MachinePointerInfo(TrmpAddr));
28481
28482 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28483 DAG.getConstant(2, dl, MVT::i64));
28484 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28485 MachinePointerInfo(TrmpAddr, 2), Align(2));
28486
28487 // Load the 'nest' parameter value into R10.
28488 // R10 is specified in X86CallingConv.td
28489 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28490 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28491 DAG.getConstant(10, dl, MVT::i64));
28492 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28493 Addr, MachinePointerInfo(TrmpAddr, 10));
28494
28495 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28496 DAG.getConstant(12, dl, MVT::i64));
28497 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28498 MachinePointerInfo(TrmpAddr, 12), Align(2));
28499
28500 // Jump to the nested function.
28501 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28502 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28503 DAG.getConstant(20, dl, MVT::i64));
28504 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28505 Addr, MachinePointerInfo(TrmpAddr, 20));
28506
28507 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28508 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28509 DAG.getConstant(22, dl, MVT::i64));
28510 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28511 Addr, MachinePointerInfo(TrmpAddr, 22));
28512
28513 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28514 } else {
28515 const Function *Func =
28516 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28517 CallingConv::ID CC = Func->getCallingConv();
28518 unsigned NestReg;
28519
28520 switch (CC) {
28521 default:
28522 llvm_unreachable("Unsupported calling convention");
28523 case CallingConv::C:
28524 case CallingConv::X86_StdCall: {
28525 // Pass 'nest' parameter in ECX.
28526 // Must be kept in sync with X86CallingConv.td
28527 NestReg = X86::ECX;
28528
28529 // Check that ECX wasn't needed by an 'inreg' parameter.
28530 FunctionType *FTy = Func->getFunctionType();
28531 const AttributeList &Attrs = Func->getAttributes();
28532
28533 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28534 unsigned InRegCount = 0;
28535 unsigned Idx = 0;
28536
28537 for (FunctionType::param_iterator I = FTy->param_begin(),
28538 E = FTy->param_end(); I != E; ++I, ++Idx)
28539 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28540 const DataLayout &DL = DAG.getDataLayout();
28541 // FIXME: should only count parameters that are lowered to integers.
28542 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28543 }
28544
28545 if (InRegCount > 2) {
28546 report_fatal_error("Nest register in use - reduce number of inreg"
28547 " parameters!");
28548 }
28549 }
28550 break;
28551 }
28552 case CallingConv::X86_FastCall:
28553 case CallingConv::X86_ThisCall:
28554 case CallingConv::Fast:
28555 case CallingConv::Tail:
28556 case CallingConv::SwiftTail:
28557 // Pass 'nest' parameter in EAX.
28558 // Must be kept in sync with X86CallingConv.td
28559 NestReg = X86::EAX;
28560 break;
28561 }
28562
28563 SDValue OutChains[4];
28564 SDValue Addr, Disp;
28565
28566 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28567 DAG.getConstant(10, dl, MVT::i32));
28568 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28569
28570 // This is storing the opcode for MOV32ri.
28571 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28572 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28573 OutChains[0] =
28574 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28575 Trmp, MachinePointerInfo(TrmpAddr));
28576
28577 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28578 DAG.getConstant(1, dl, MVT::i32));
28579 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28580 MachinePointerInfo(TrmpAddr, 1), Align(1));
28581
28582 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28583 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28584 DAG.getConstant(5, dl, MVT::i32));
28585 OutChains[2] =
28586 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28587 MachinePointerInfo(TrmpAddr, 5), Align(1));
28588
28589 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28590 DAG.getConstant(6, dl, MVT::i32));
28591 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28592 MachinePointerInfo(TrmpAddr, 6), Align(1));
28593
28594 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28595 }
28596}
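// Byte-level sketch (illustrative) of the 64-bit trampoline emitted above,
// with <fptr> and <nest> the little-endian 8-byte immediates:
//    0: 49 BB <fptr>   movabsq $fptr, %r11
//   10: 49 BA <nest>   movabsq $nest, %r10
//   20: 49 FF E3       jmpq    *%r11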
28597
28598SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28599 SelectionDAG &DAG) const {
28600 /*
28601 The rounding mode is in bits 11:10 of FPSR, and has the following
28602 settings:
28603 00 Round to nearest
28604 01 Round to -inf
28605 10 Round to +inf
28606 11 Round to 0
28607
28608 GET_ROUNDING, on the other hand, expects the following:
28609 -1 Undefined
28610 0 Round to 0
28611 1 Round to nearest
28612 2 Round to +inf
28613 3 Round to -inf
28614
28615 To perform the conversion, we use a packed lookup table of the four 2-bit
28616 values that we can index by FPSR[11:10]
28617 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28618
28619 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28620 */
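  // Worked example (illustrative): FPSR[11:10] == 0b10 (round to +inf) gives
  // (0x800 & 0xc00) >> 9 == 4, and (0x2d >> 4) & 3 == 2, which is the
  // GET_ROUNDING encoding for "round to +inf".
  static_assert(((0x2d >> ((0x800 & 0xc00) >> 9)) & 3) == 2,
                "LUT maps x87 RC=0b10 to GET_ROUNDING value 2");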
28621
28622 MachineFunction &MF = DAG.getMachineFunction();
28623 MVT VT = Op.getSimpleValueType();
28624 SDLoc DL(Op);
28625
28626 // Save FP Control Word to stack slot
28627 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28628 SDValue StackSlot =
28629 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28630
28631 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28632
28633 SDValue Chain = Op.getOperand(0);
28634 SDValue Ops[] = {Chain, StackSlot};
28635 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28636 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28637 MachineMemOperand::MOStore);
28638
28639 // Load FP Control Word from stack slot
28640 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28641 Chain = CWD.getValue(1);
28642
28643 // Mask and turn the control bits into a shift for the lookup table.
28644 SDValue Shift =
28645 DAG.getNode(ISD::SRL, DL, MVT::i16,
28646 DAG.getNode(ISD::AND, DL, MVT::i16,
28647 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28648 DAG.getConstant(9, DL, MVT::i8));
28649 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28650
28651 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28652 SDValue RetVal =
28653 DAG.getNode(ISD::AND, DL, MVT::i32,
28654 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28655 DAG.getConstant(3, DL, MVT::i32));
28656
28657 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28658
28659 return DAG.getMergeValues({RetVal, Chain}, DL);
28660}
28661
28662SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28663 SelectionDAG &DAG) const {
28664 MachineFunction &MF = DAG.getMachineFunction();
28665 SDLoc DL(Op);
28666 SDValue Chain = Op.getNode()->getOperand(0);
28667
28668 // FP control word may be set only from data in memory. So we need to allocate
28669 // stack space to save/load FP control word.
28670 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28671 SDValue StackSlot =
28672 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28673 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28674 MachineMemOperand *MMO =
28675 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28676
28677 // Store FP control word into memory.
28678 SDValue Ops[] = {Chain, StackSlot};
28679 Chain = DAG.getMemIntrinsicNode(
28680 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28681
28682 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28683 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28684 Chain = CWD.getValue(1);
28685 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28686 DAG.getConstant(0xf3ff, DL, MVT::i16));
28687
28688 // Calculate new rounding mode.
28689 SDValue NewRM = Op.getNode()->getOperand(1);
28690 SDValue RMBits;
28691 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28692 uint64_t RM = CVal->getZExtValue();
28693 int FieldVal;
28694 switch (static_cast<RoundingMode>(RM)) {
28695 // clang-format off
28696 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28697 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28698 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28699 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28700 default:
28701 llvm_unreachable("rounding mode is not supported by X86 hardware");
28702 // clang-format on
28703 }
28704 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28705 } else {
28706 // Need to convert argument into bits of control word:
28707 // 0 Round to 0 -> 11
28708 // 1 Round to nearest -> 00
28709 // 2 Round to +inf -> 10
28710 // 3 Round to -inf -> 01
28711 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
28712 // To make the conversion, put all these values into a value 0xc9 and shift
28713 // it left depending on the rounding mode:
28714 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28715 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28716 // ...
28717 // (0xc9 << (2 * NewRM + 4)) & 0xc00
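    // Worked example (illustrative): NewRM == 2 (round to +inf) gives a shift
    // of 2*2+4 == 8, and (0xc9 << 8) & 0xc00 == 0x800, i.e. FPCW RM == 0b10,
    // the x87 encoding for rounding toward +inf.
    static_assert(((0xc9 << (2 * 2 + 4)) & 0xc00) == 0x800,
                  "0xc9 trick maps rounding-mode value 2 to x87 RC=0b10");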
28718 SDValue ShiftValue =
28719 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28720 DAG.getNode(ISD::ADD, DL, MVT::i32,
28721 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28722 DAG.getConstant(1, DL, MVT::i8)),
28723 DAG.getConstant(4, DL, MVT::i32)));
28724 SDValue Shifted =
28725 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28726 ShiftValue);
28727 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28728 DAG.getConstant(0xc00, DL, MVT::i16));
28729 }
28730
28731 // Update rounding mode bits and store the new FP Control Word into stack.
28732 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28733 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28734
28735 // Load FP control word from the slot.
28736 SDValue OpsLD[] = {Chain, StackSlot};
28737 MachineMemOperand *MMOL =
28738 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28739 Chain = DAG.getMemIntrinsicNode(
28740 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28741
28742 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28743 // same way but in bits 14:13.
28744 if (Subtarget.hasSSE1()) {
28745 // Store MXCSR into memory.
28746 Chain = DAG.getNode(
28747 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28748 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28749 StackSlot);
28750
28751 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28752 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28753 Chain = CWD.getValue(1);
28754 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28755 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28756
28757 // Shift X87 RM bits from 11:10 to 14:13.
28758 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28759 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28760 DAG.getConstant(3, DL, MVT::i8));
28761
28762 // Update rounding mode bits and store the new FP Control Word into stack.
28763 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28764 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28765
28766 // Load MXCSR from the slot.
28767 Chain = DAG.getNode(
28768 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28769 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28770 StackSlot);
28771 }
28772
28773 return Chain;
28774}
28775
28776const unsigned X87StateSize = 28;
28777const unsigned FPStateSize = 32;
28778[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
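// Layout note (illustrative): FNSTENV/FLDENV use the 28-byte protected-mode
// x87 environment image; the 4-byte MXCSR image is kept immediately after it
// at offset X87StateSize, which is where the 32-byte FPStateSize used by the
// FPENV lowerings below comes from.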
28779
28780SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28781 SelectionDAG &DAG) const {
28782 MachineFunction &MF = DAG.getMachineFunction();
28783 SDLoc DL(Op);
28784 SDValue Chain = Op->getOperand(0);
28785 SDValue Ptr = Op->getOperand(1);
28786 auto *Node = cast<FPStateAccessSDNode>(Op);
28787 EVT MemVT = Node->getMemoryVT();
28788 assert(MemVT.getSizeInBits() == FPStateSizeInBits && "Unexpected state size");
28789 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28790
28791 // Get the x87 state, if present.
28792 if (Subtarget.hasX87()) {
28793 Chain =
28794 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28795 {Chain, Ptr}, MemVT, MMO);
28796
28797 // FNSTENV changes the exception mask, so load back the stored environment.
28798 MachineMemOperand::Flags NewFlags =
28799 MachineMemOperand::MOLoad |
28800 (MMO->getFlags() & ~MachineMemOperand::MOStore);
28801 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28802 Chain =
28803 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28804 {Chain, Ptr}, MemVT, MMO);
28805 }
28806
28807 // If target supports SSE, get MXCSR as well.
28808 if (Subtarget.hasSSE1()) {
28809 // Get pointer to the MXCSR location in memory.
28810 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28811 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28812 DAG.getConstant(X87StateSize, DL, PtrVT));
28813 // Store MXCSR into memory.
28814 Chain = DAG.getNode(
28815 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28816 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28817 MXCSRAddr);
28818 }
28819
28820 return Chain;
28821}
28822
28823static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28824 EVT MemVT, MachineMemOperand *MMO,
28825 SelectionDAG &DAG,
28826 const X86Subtarget &Subtarget) {
28827 // Set the x87 state, if present.
28828 if (Subtarget.hasX87())
28829 Chain =
28830 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28831 {Chain, Ptr}, MemVT, MMO);
28832 // If target supports SSE, set MXCSR as well.
28833 if (Subtarget.hasSSE1()) {
28834 // Get pointer to the MXCSR location in memory.
28836 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28837 DAG.getConstant(X87StateSize, DL, PtrVT));
28838 // Load MXCSR from memory.
28839 Chain = DAG.getNode(
28840 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28841 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28842 MXCSRAddr);
28843 }
28844 return Chain;
28845}
28846
28847SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28848 SelectionDAG &DAG) const {
28849 SDLoc DL(Op);
28850 SDValue Chain = Op->getOperand(0);
28851 SDValue Ptr = Op->getOperand(1);
28852 auto *Node = cast<FPStateAccessSDNode>(Op);
28853 EVT MemVT = Node->getMemoryVT();
28854 assert(MemVT.getSizeInBits() == FPStateSizeInBits && "Unexpected state size");
28855 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28856 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28857}
28858
28859SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28860 SelectionDAG &DAG) const {
28861 MachineFunction &MF = DAG.getMachineFunction();
28862 SDLoc DL(Op);
28863 SDValue Chain = Op.getNode()->getOperand(0);
28864
28865 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28866 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28867 SmallVector<Constant *, 8> FPEnvVals;
28868
28869 // x87 FPU Control Word: mask all floating-point exceptions, set rounding to
28870 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28871 // for compatibility with glibc.
28872 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28873 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28874 Constant *Zero = ConstantInt::get(ItemTy, 0);
28875 for (unsigned I = 0; I < 6; ++I)
28876 FPEnvVals.push_back(Zero);
28877
28878 // MXCSR: mask all floating-point exceptions, set rounding to nearest, clear
28879 // all exceptions, and set DAZ and FTZ to 0.
28880 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28881 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28882 MVT PtrVT = getPointerTy(DAG.getDataLayout());
28883 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28884 MachinePointerInfo MPI =
28885 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
28886 MachineMemOperand *MMO = MF.getMachineMemOperand(
28887 MPI, MachineMemOperand::MOLoad, X87StateSize, Align(4));
28888
28889 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28890}
28891
28892// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28893uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28894 assert((Amt < 8) && "Shift/Rotation amount out of range");
28895 switch (Opcode) {
28896 case ISD::BITREVERSE:
28897 return 0x8040201008040201ULL;
28898 case ISD::SHL:
28899 return ((0x0102040810204080ULL >> (Amt)) &
28900 (0x0101010101010101ULL * (0xFF >> (Amt))));
28901 case ISD::SRL:
28902 return ((0x0102040810204080ULL << (Amt)) &
28903 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28904 case ISD::SRA:
28905 return (getGFNICtrlImm(ISD::SRL, Amt) |
28906 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28907 case ISD::ROTL:
28908 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28909 case ISD::ROTR:
28910 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28911 }
28912 llvm_unreachable("Unsupported GFNI opcode");
28913}
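// Spot check (illustrative), mirroring the ISD::SHL formula above:
// getGFNICtrlImm(ISD::SHL, 1) comes out as 0x0001020408102040.
static_assert(((0x0102040810204080ULL >> 1) &
               (0x0101010101010101ULL * (0xFF >> 1))) == 0x0001020408102040ULL,
              "GFNI control immediate for a vXi8 shift left by 1");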
28914
28915// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28916SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28917 MVT VT, unsigned Amt = 0) {
28918 assert(VT.getVectorElementType() == MVT::i8 &&
28919 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28920 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28921 SmallVector<SDValue> MaskBits;
28922 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28923 uint64_t Bits = (Imm >> (I % 64)) & 255;
28924 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28925 }
28926 return DAG.getBuildVector(VT, DL, MaskBits);
28927}
28928
28929/// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
28930//
28931// i8/i16 vector implemented using dword LZCNT vector instruction
28932// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28933 // split the vector, perform the operation on its Lo and Hi parts, and
28934// concatenate the results.
28935static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28936 const X86Subtarget &Subtarget) {
28937 assert(Op.getOpcode() == ISD::CTLZ);
28938 SDLoc dl(Op);
28939 MVT VT = Op.getSimpleValueType();
28940 MVT EltVT = VT.getVectorElementType();
28941 unsigned NumElems = VT.getVectorNumElements();
28942
28943 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28944 "Unsupported element type");
28945
28946 // Split vector, it's Lo and Hi parts will be handled in next iteration.
28947 if (NumElems > 16 ||
28948 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28949 return splitVectorIntUnary(Op, DAG, dl);
28950
28951 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28952 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28953 "Unsupported value type for operation");
28954
28955 // Use native supported vector instruction vplzcntd.
28956 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28957 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28958 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28959 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28960
28961 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28962}
28963
28964// Lower CTLZ using a PSHUFB lookup table implementation.
28965static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28966 const X86Subtarget &Subtarget,
28967 SelectionDAG &DAG) {
28968 MVT VT = Op.getSimpleValueType();
28969 int NumElts = VT.getVectorNumElements();
28970 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28971 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28972
28973 // Per-nibble leading zero PSHUFB lookup table.
28974 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28975 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28976 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28977 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
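  // Worked example (illustrative): for the byte 0x1a the high nibble is 0x1,
  // so LUT[1] == 3; the high nibble is non-zero, so the low-nibble count is
  // masked away below and the per-byte CTLZ comes out as 3.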
28978
28979 SmallVector<SDValue, 64> LUTVec;
28980 for (int i = 0; i < NumBytes; ++i)
28981 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28982 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28983
28984 // Begin by bitcasting the input to byte vector, then split those bytes
28985 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28986 // If the hi input nibble is zero then we add both results together, otherwise
28987 // we just take the hi result (by masking the lo result to zero before the
28988 // add).
28989 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28990 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28991
28992 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28993 SDValue Lo = Op0;
28994 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28995 SDValue HiZ;
28996 if (CurrVT.is512BitVector()) {
28997 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28998 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28999 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29000 } else {
29001 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29002 }
29003
29004 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29005 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29006 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29007 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29008
29009 // Merge result back from vXi8 back to VT, working on the lo/hi halves
29010 // of the current vector width in the same way we did for the nibbles.
29011 // If the upper half of the input element is zero then add the halves'
29012 // leading zero counts together, otherwise just use the upper half's.
29013 // Double the width of the result until we are at target width.
29014 while (CurrVT != VT) {
29015 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29016 int CurrNumElts = CurrVT.getVectorNumElements();
29017 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29018 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29019 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29020
29021 // Check if the upper half of the input element is zero.
29022 if (CurrVT.is512BitVector()) {
29023 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29024 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29025 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29026 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29027 } else {
29028 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29029 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29030 }
29031 HiZ = DAG.getBitcast(NextVT, HiZ);
29032
29033 // Move the upper/lower halves to the lower bits as we'll be extending to
29034 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29035 // together.
29036 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29037 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29038 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29039 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29040 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29041 CurrVT = NextVT;
29042 }
29043
29044 return Res;
29045}
29046
29047static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29048 const X86Subtarget &Subtarget,
29049 SelectionDAG &DAG) {
29050 MVT VT = Op.getSimpleValueType();
29051
29052 if (Subtarget.hasCDI() &&
29053 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29054 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29055 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29056
29057 // Decompose 256-bit ops into smaller 128-bit ops.
29058 if (VT.is256BitVector() && !Subtarget.hasInt256())
29059 return splitVectorIntUnary(Op, DAG, DL);
29060
29061 // Decompose 512-bit ops into smaller 256-bit ops.
29062 if (VT.is512BitVector() && !Subtarget.hasBWI())
29063 return splitVectorIntUnary(Op, DAG, DL);
29064
29065 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29066 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29067}
29068
29069static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29070 SelectionDAG &DAG,
29071 const X86Subtarget &Subtarget) {
29072 MVT VT = Op.getSimpleValueType();
29073 SDValue Input = Op.getOperand(0);
29074
29075 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29076 "Expected vXi8 input for GFNI-based CTLZ lowering");
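  // ctlz(x) == cttz(bitreverse(x)): reverse the bits, isolate the lowest set
  // bit with (y & -y), and let a single GF2P8AFFINEQB translate that one-hot
  // byte into its bit index (the immediate 8 makes a zero input yield 8).
  // For example, 0x1C reverses to 0x38, isolates to 0x08 and maps to 3.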
29077
29078 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29079
29080 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29081 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29082
29083 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29084 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29085 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29086
29087 SDValue LZCNT =
29088 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29089 DAG.getTargetConstant(8, DL, MVT::i8));
29090 return LZCNT;
29091}
29092
29093static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29094 SelectionDAG &DAG) {
29095 MVT VT = Op.getSimpleValueType();
29096 MVT OpVT = VT;
29097 unsigned NumBits = VT.getSizeInBits();
29098 SDLoc dl(Op);
29099 unsigned Opc = Op.getOpcode();
29100
29101 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29102 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29103
29104 if (VT.isVector())
29105 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29106
29107 Op = Op.getOperand(0);
29108 if (VT == MVT::i8) {
29109 // Zero extend to i32 since there is not an i8 bsr.
29110 OpVT = MVT::i32;
29111 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29112 }
29113
29114  // Check if we can safely pass a result through BSR for zero sources.
29115 SDValue PassThru = DAG.getUNDEF(OpVT);
29116 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29117 !DAG.isKnownNeverZero(Op))
29118 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29119
29120 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29121 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29122 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29123
29124 // Skip CMOV if we're using a pass through value.
29125 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29126 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29127 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29128 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29129 Op.getValue(1)};
29130 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29131 }
29132
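  // BSR returns the index of the highest set bit, and for a power-of-two bit
  // width CTLZ == (NumBits - 1) - index == (NumBits - 1) ^ index; the
  // zero-source value 2*NumBits - 1 likewise becomes NumBits after the xor.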
29133 // Finally xor with NumBits-1.
29134 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29135 DAG.getConstant(NumBits - 1, dl, OpVT));
29136
29137 if (VT == MVT::i8)
29138 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29139 return Op;
29140}
29141
29142static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29143 SelectionDAG &DAG) {
29144 MVT VT = Op.getSimpleValueType();
29145 unsigned NumBits = VT.getScalarSizeInBits();
29146 SDValue N0 = Op.getOperand(0);
29147 SDLoc dl(Op);
29148 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29149
29150 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29151 "Only scalar CTTZ requires custom lowering");
29152
29153  // Check if we can safely pass a result through BSF for zero sources.
29154 SDValue PassThru = DAG.getUNDEF(VT);
29155 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29156 PassThru = DAG.getConstant(NumBits, dl, VT);
29157
29158 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29159 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29160 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29161
29162 // Skip CMOV if src is never zero or we're using a pass through value.
29163 if (NonZeroSrc || !PassThru.isUndef())
29164 return Op;
29165
29166 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29167 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29168 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29169 Op.getValue(1)};
29170 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29171}
29172
29173static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29174 const X86Subtarget &Subtarget) {
29175 MVT VT = Op.getSimpleValueType();
29176 SDLoc DL(Op);
29177
29178 if (VT == MVT::i16 || VT == MVT::i32)
29179 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29180
29181 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29182 return splitVectorIntBinary(Op, DAG, DL);
29183
29184 assert(Op.getSimpleValueType().is256BitVector() &&
29185 Op.getSimpleValueType().isInteger() &&
29186 "Only handle AVX 256-bit vector integer operation");
29187 return splitVectorIntBinary(Op, DAG, DL);
29188}
29189
29190static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29191 const X86Subtarget &Subtarget) {
29192 MVT VT = Op.getSimpleValueType();
29193 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29194 unsigned Opcode = Op.getOpcode();
29195 SDLoc DL(Op);
29196
29197 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29198 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29199 assert(Op.getSimpleValueType().isInteger() &&
29200 "Only handle AVX vector integer operation");
29201 return splitVectorIntBinary(Op, DAG, DL);
29202 }
29203
29204 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29205 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29206 EVT SetCCResultType =
29207 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29208
29209 unsigned BitWidth = VT.getScalarSizeInBits();
29210 if (Opcode == ISD::USUBSAT) {
29211 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29212 // Handle a special-case with a bit-hack instead of cmp+select:
29213 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29214 // If the target can use VPTERNLOG, DAGToDAG will match this as
29215 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29216 // "broadcast" constant load.
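      // For example, for i8 and SMIN == 0x80: if X has its sign bit set then
      // X ^ 0x80 == X - 0x80 and X s>> 7 == 0xFF, so the AND yields X - 0x80;
      // otherwise X s>> 7 == 0 and the result is 0, matching usubsat(X, 0x80).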
29217      ConstantSDNode *C = isConstOrConstSplat(Y, true);
29218      if (C && C->getAPIntValue().isSignMask()) {
29219 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29220 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29221 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29222 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29223 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29224 }
29225 }
29226 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29227 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29228 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29229 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29230 // TODO: Move this to DAGCombiner?
29231 if (SetCCResultType == VT &&
29232 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29233 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29234 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29235 }
29236 }
29237
29238 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29239 (!VT.isVector() || VT == MVT::v2i64)) {
29240    APInt MinVal = APInt::getSignedMinValue(BitWidth);
29241    APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29242 SDValue Zero = DAG.getConstant(0, DL, VT);
29243 SDValue Result =
29244 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29245 DAG.getVTList(VT, SetCCResultType), X, Y);
29246 SDValue SumDiff = Result.getValue(0);
29247 SDValue Overflow = Result.getValue(1);
29248 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29249 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29250 SDValue SumNeg =
29251 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29252 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29253 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29254 }
29255
29256 // Use default expansion.
29257 return SDValue();
29258}
29259
29260static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29261 SelectionDAG &DAG) {
29262 MVT VT = Op.getSimpleValueType();
29263 SDLoc DL(Op);
29264
29265 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29266 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29267 // 8-bit integer abs to NEG and CMOV.
29268 SDValue N0 = Op.getOperand(0);
29269 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29270 DAG.getConstant(0, DL, VT), N0);
29271 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29272 SDValue(Neg.getNode(), 1)};
29273 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29274 }
29275
29276 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29277 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29278 SDValue Src = Op.getOperand(0);
29279 SDValue Neg = DAG.getNegative(Src, DL, VT);
29280 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29281 }
29282
29283 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29284 assert(VT.isInteger() &&
29285 "Only handle AVX 256-bit vector integer operation");
29286 return splitVectorIntUnary(Op, DAG, DL);
29287 }
29288
29289 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29290 return splitVectorIntUnary(Op, DAG, DL);
29291
29292 // Default to expand.
29293 return SDValue();
29294}
29295
29296static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29297 SelectionDAG &DAG) {
29298 MVT VT = Op.getSimpleValueType();
29299 SDLoc DL(Op);
29300
29301 // For AVX1 cases, split to use legal ops.
29302 if (VT.is256BitVector() && !Subtarget.hasInt256())
29303 return splitVectorIntBinary(Op, DAG, DL);
29304
29305 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29306 return splitVectorIntBinary(Op, DAG, DL);
29307
29308 // Default to expand.
29309 return SDValue();
29310}
29311
29312static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29313 SelectionDAG &DAG) {
29314 MVT VT = Op.getSimpleValueType();
29315 SDLoc DL(Op);
29316
29317 // For AVX1 cases, split to use legal ops.
29318 if (VT.is256BitVector() && !Subtarget.hasInt256())
29319 return splitVectorIntBinary(Op, DAG, DL);
29320
29321 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29322 return splitVectorIntBinary(Op, DAG, DL);
29323
29324 // Default to expand.
29325 return SDValue();
29326}
29327
29328static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
29329 SelectionDAG &DAG) {
29330 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29331 EVT VT = Op.getValueType();
29332 SDValue X = Op.getOperand(0);
29333 SDValue Y = Op.getOperand(1);
29334 SDLoc DL(Op);
29335 bool IsMaxOp =
29336 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29337 bool IsNum =
29338 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29339 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29340 unsigned Opc = 0;
29341 if (VT.isVector())
29343 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29345
29346 if (Opc) {
29347 SDValue Imm =
29348 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29349 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29350 }
29351 }
29352
29353 uint64_t SizeInBits = VT.getScalarSizeInBits();
29354 APInt PreferredZero = APInt::getZero(SizeInBits);
29355 APInt OppositeZero = PreferredZero;
29356 EVT IVT = VT.changeTypeToInteger();
29357 X86ISD::NodeType MinMaxOp;
29358 if (IsMaxOp) {
29359 MinMaxOp = X86ISD::FMAX;
29360 OppositeZero.setSignBit();
29361 } else {
29362 PreferredZero.setSignBit();
29363 MinMaxOp = X86ISD::FMIN;
29364 }
29365 EVT SetCCType =
29366 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29367
29368 // The tables below show the expected result of Max in cases of NaN and
29369 // signed zeros.
29370 //
29371  //                  Y                           Y
29372  //             Num    xNaN                 +0      -0
29373  //            ---------------             ---------------
29374  //     Num    |  Max |   Y  |        +0   |  +0  |  +0  |
29375  // X          ---------------    X        ---------------
29376  //    xNaN    |   X  |  X/Y |        -0   |  +0  |  -0  |
29377  //            ---------------             ---------------
29378 //
29379 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29380 // reordering.
29381 //
29382 // We check if any of operands is NaN and return NaN. Then we check if any of
29383 // operands is zero or negative zero (for fmaximum and fminimum respectively)
29384 // to ensure the correct zero is returned.
29385 auto MatchesZero = [](SDValue Op, APInt Zero) {
29386    Op = peekThroughBitcasts(Op);
29387 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29388 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29389 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29390 return CstOp->getAPIntValue() == Zero;
29391 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29392 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29393 for (const SDValue &OpVal : Op->op_values()) {
29394 if (OpVal.isUndef())
29395 continue;
29396 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29397 if (!CstOp)
29398 return false;
29399 if (!CstOp->getValueAPF().isZero())
29400 continue;
29401 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29402 return false;
29403 }
29404 return true;
29405 }
29406 return false;
29407 };
29408
29409 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29410 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29411 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29412 Op->getFlags().hasNoSignedZeros() ||
29413 DAG.isKnownNeverZeroFloat(X) ||
29414                          DAG.isKnownNeverZeroFloat(Y);
29415 SDValue NewX, NewY;
29416 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29417 MatchesZero(X, OppositeZero)) {
29418 // Operands are already in right order or order does not matter.
29419 NewX = X;
29420 NewY = Y;
29421 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29422 NewX = Y;
29423 NewY = X;
29424 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29425 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29426 if (IsXNeverNaN)
29427 std::swap(X, Y);
29428    // VFPCLASSS consumes a vector type, so provide a minimal one that
29429    // corresponds to an xmm register.
29430 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29431    SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29432 // Bits of classes:
29433 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29434 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29435 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29436 DL, MVT::i32);
29437 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29438 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29439 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29440 DAG.getVectorIdxConstant(0, DL));
29441 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29442 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29443 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29444 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29445 } else {
29446 SDValue IsXSigned;
29447 if (Subtarget.is64Bit() || VT != MVT::f64) {
29448 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29449 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29450 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29451 } else {
29452 assert(VT == MVT::f64);
29453 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29454 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29455 DAG.getVectorIdxConstant(0, DL));
29456 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29457 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29458 DAG.getVectorIdxConstant(1, DL));
29459 Hi = DAG.getBitcast(MVT::i32, Hi);
29460 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29461 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29462 *DAG.getContext(), MVT::i32);
29463 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29464 }
29465 if (MinMaxOp == X86ISD::FMAX) {
29466 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29467 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29468 } else {
29469 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29470 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29471 }
29472 }
29473
29474 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29475 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29476
29477  // If we did not have to reorder the operands for signed zero handling, we
29478  // need to handle NaN, and we know that one of the operands is not NaN, then:
29479  //  - for minimum/maximum, put the non-NaN operand in the first position,
29480  //  - for minimumnum/maximumnum, put it in the second position,
29481  // and we will not need to post-process NaN after the max/min.
29482 if (IgnoreSignedZero && !IgnoreNaN &&
29483 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29484 std::swap(NewX, NewY);
29485
29486 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29487
29488 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29489 return MinMax;
29490
29491 if (DAG.isKnownNeverNaN(NewX))
29492 NewX = NewY;
29493
29494 SDValue IsNaN =
29495 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29496
29497 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29498}
29499
29500static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29501 SelectionDAG &DAG) {
29502 MVT VT = Op.getSimpleValueType();
29503 SDLoc dl(Op);
29504
29505 // For AVX1 cases, split to use legal ops.
29506 if (VT.is256BitVector() && !Subtarget.hasInt256())
29507 return splitVectorIntBinary(Op, DAG, dl);
29508
29509 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29510 return splitVectorIntBinary(Op, DAG, dl);
29511
29512 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29513 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29514
29515 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29516 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29517 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29518
29519 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29520 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
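    // For example, abds(3, 7) == select(3 s< 7, 7 - 3, 3 - 7) == 4.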
29521 if (VT.bitsGE(MVT::i32)) {
29522 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29523 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29524 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29525 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29526 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29527 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29528 DAG.getTargetConstant(CC, dl, MVT::i8),
29529 Diff1.getValue(1));
29530 }
29531
29532 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29533 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29534 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29535 MVT WideVT = MVT::getIntegerVT(WideBits);
29536 if (TLI.isTypeLegal(WideVT)) {
29537 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29538 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29539 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29540 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29541 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29542 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29543 DAG.getTargetConstant(CC, dl, MVT::i8),
29544 Diff1.getValue(1));
29545 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29546 }
29547 }
29548
29549 // Default to expand.
29550 return SDValue();
29551}
29552
29553static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29554 SelectionDAG &DAG) {
29555 SDLoc dl(Op);
29556 MVT VT = Op.getSimpleValueType();
29557
29558 // Decompose 256-bit ops into 128-bit ops.
29559 if (VT.is256BitVector() && !Subtarget.hasInt256())
29560 return splitVectorIntBinary(Op, DAG, dl);
29561
29562 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29563 return splitVectorIntBinary(Op, DAG, dl);
29564
29565 SDValue A = Op.getOperand(0);
29566 SDValue B = Op.getOperand(1);
29567
29568 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29569 // vector pairs, multiply and truncate.
29570 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29571 unsigned NumElts = VT.getVectorNumElements();
29572 unsigned NumLanes = VT.getSizeInBits() / 128;
29573 unsigned NumEltsPerLane = NumElts / NumLanes;
29574
29575 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29576 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29577 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29578 return DAG.getNode(
29579 ISD::TRUNCATE, dl, VT,
29580 DAG.getNode(ISD::MUL, dl, ExVT,
29581 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29582 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29583 }
29584
29585 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29586
29587 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29588 // Don't do this if we only need to unpack one half.
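    // PMADDUBSW multiplies unsigned bytes of A with signed bytes of B and adds
    // adjacent pairs into i16 lanes. Zeroing every other byte of B makes each
    // pair-sum a single 8x8 product, so RLo/RHi hold the even/odd byte
    // products, whose low bytes are then recombined with a mask, shift and or.
    // The low 8 bits of each product are the same for signed and unsigned B,
    // and no saturation can occur since one term of every pair is zero.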
29589 if (Subtarget.hasSSSE3()) {
29590 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29591 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29592 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29593 if (BIsBuildVector) {
29594 for (auto [Idx, Val] : enumerate(B->ops())) {
29595 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29596 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29597 else
29598 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29599 }
29600 }
29601 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29602 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29603 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29604 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29605 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29606 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29607 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29608 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29609 DAG.getTargetConstant(8, dl, MVT::i8));
29610 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29611 }
29612 }
29613
29614 // Extract the lo/hi parts to any extend to i16.
29615 // We're going to mask off the low byte of each result element of the
29616 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29617 // element.
29618 SDValue Undef = DAG.getUNDEF(VT);
29619 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29620 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29621
29622 SDValue BLo, BHi;
29623 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29624 // If the RHS is a constant, manually unpackl/unpackh.
29625 SmallVector<SDValue, 16> LoOps, HiOps;
29626 for (unsigned i = 0; i != NumElts; i += 16) {
29627 for (unsigned j = 0; j != 8; ++j) {
29628 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29629 MVT::i16));
29630 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29631 MVT::i16));
29632 }
29633 }
29634
29635 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29636 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29637 } else {
29638 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29639 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29640 }
29641
29642 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29643 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29644 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29645 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29646 }
29647
29648 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29649 if (VT == MVT::v4i32) {
29650 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29651 "Should not custom lower when pmulld is available!");
29652
29653 // Extract the odd parts.
29654 static const int UnpackMask[] = {1, 1, 3, 3};
29655 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29656 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29657
29658 // Multiply the even parts.
29659 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29660 DAG.getBitcast(MVT::v2i64, A),
29661 DAG.getBitcast(MVT::v2i64, B));
29662 // Now multiply odd parts.
29663 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29664 DAG.getBitcast(MVT::v2i64, Aodds),
29665 DAG.getBitcast(MVT::v2i64, Bodds));
29666
29667 Evens = DAG.getBitcast(VT, Evens);
29668 Odds = DAG.getBitcast(VT, Odds);
29669
29670 // Merge the two vectors back together with a shuffle. This expands into 2
29671 // shuffles.
29672 static const int ShufMask[] = { 0, 4, 2, 6 };
29673 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29674 }
29675
29676 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29677 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29678 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29679
29680 // Ahi = psrlqi(a, 32);
29681 // Bhi = psrlqi(b, 32);
29682 //
29683 // AloBlo = pmuludq(a, b);
29684 // AloBhi = pmuludq(a, Bhi);
29685 // AhiBlo = pmuludq(Ahi, b);
29686 //
29687 // Hi = psllqi(AloBhi + AhiBlo, 32);
29688 // return AloBlo + Hi;
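  // This works because, with a = Alo + 2^32*Ahi and b = Blo + 2^32*Bhi,
  //   a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*Ahi*Bhi,
  // and the last term vanishes modulo 2^64, so three 32x32->64 multiplies
  // suffice.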
29689 KnownBits AKnown = DAG.computeKnownBits(A);
29690 KnownBits BKnown = DAG.computeKnownBits(B);
29691
29692 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29693 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29694 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29695
29696 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29697 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29698 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29699
29700 SDValue Zero = DAG.getConstant(0, dl, VT);
29701
29702 // Only multiply lo/hi halves that aren't known to be zero.
29703 SDValue AloBlo = Zero;
29704 if (!ALoIsZero && !BLoIsZero)
29705 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29706
29707 SDValue AloBhi = Zero;
29708 if (!ALoIsZero && !BHiIsZero) {
29709 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29710 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29711 }
29712
29713 SDValue AhiBlo = Zero;
29714 if (!AHiIsZero && !BLoIsZero) {
29715 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29716 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29717 }
29718
29719 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29720 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29721
29722 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29723}
29724
29725static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29726 MVT VT, bool IsSigned,
29727 const X86Subtarget &Subtarget,
29728 SelectionDAG &DAG,
29729 SDValue *Low = nullptr) {
29730 unsigned NumElts = VT.getVectorNumElements();
29731
29732 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29733 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29734 // lane results back together.
29735
29736 // We'll take different approaches for signed and unsigned.
29737  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29738 // and use pmullw to calculate the full 16-bit product.
29739  // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29740 // shift them left into the upper byte of each word. This allows us to use
29741 // pmulhw to calculate the full 16-bit product. This trick means we don't
29742 // need to sign extend the bytes to use pmullw.
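  // The signed trick works because pmulhw on the sign-correct 16-bit values
  // computes ((A[i] << 8) * (B[i] << 8)) >> 16, which is exactly the full
  // signed 8x8 product A[i] * B[i], so no explicit sign extension is needed.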
29743
29744 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29745 SDValue Zero = DAG.getConstant(0, dl, VT);
29746
29747 SDValue ALo, AHi;
29748 if (IsSigned) {
29749 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29750 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29751 } else {
29752 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29753 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29754 }
29755
29756 SDValue BLo, BHi;
29757 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29758 // If the RHS is a constant, manually unpackl/unpackh and extend.
29759 SmallVector<SDValue, 16> LoOps, HiOps;
29760 for (unsigned i = 0; i != NumElts; i += 16) {
29761 for (unsigned j = 0; j != 8; ++j) {
29762 SDValue LoOp = B.getOperand(i + j);
29763 SDValue HiOp = B.getOperand(i + j + 8);
29764
29765 if (IsSigned) {
29766 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29767 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29768 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29769 DAG.getConstant(8, dl, MVT::i16));
29770 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29771 DAG.getConstant(8, dl, MVT::i16));
29772 } else {
29773 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29774 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29775 }
29776
29777 LoOps.push_back(LoOp);
29778 HiOps.push_back(HiOp);
29779 }
29780 }
29781
29782 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29783 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29784 } else if (IsSigned) {
29785 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29786 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29787 } else {
29788 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29789 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29790 }
29791
29792 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29793 // pack back to vXi8.
29794 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29795 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29796 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29797
29798 if (Low)
29799 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29800
29801 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29802}
29803
29804static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29805 SelectionDAG &DAG) {
29806 SDLoc dl(Op);
29807 MVT VT = Op.getSimpleValueType();
29808 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29809 unsigned NumElts = VT.getVectorNumElements();
29810 SDValue A = Op.getOperand(0);
29811 SDValue B = Op.getOperand(1);
29812
29813 // Decompose 256-bit ops into 128-bit ops.
29814 if (VT.is256BitVector() && !Subtarget.hasInt256())
29815 return splitVectorIntBinary(Op, DAG, dl);
29816
29817 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29818 return splitVectorIntBinary(Op, DAG, dl);
29819
29820 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29821 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29822 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29823 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29824
29825 // PMULxD operations multiply each even value (starting at 0) of LHS with
29826    // the related value of RHS and produce a widened result.
29827 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29828 // => <2 x i64> <ae|cg>
29829 //
29830    // In other words, to have all the results, we need to perform two PMULxD:
29831 // 1. one with the even values.
29832 // 2. one with the odd values.
29833    // To achieve #2, we need to place the odd values at an even position.
29834 //
29835 // Place the odd value at an even position (basically, shift all values 1
29836 // step to the left):
29837 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29838 9, -1, 11, -1, 13, -1, 15, -1};
29839 // <a|b|c|d> => <b|undef|d|undef>
29840 SDValue Odd0 =
29841 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29842 // <e|f|g|h> => <f|undef|h|undef>
29843 SDValue Odd1 =
29844 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29845
29846 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29847 // ints.
29848 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29849 unsigned Opcode =
29850 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29851 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29852 // => <2 x i64> <ae|cg>
29853 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29854 DAG.getBitcast(MulVT, A),
29855 DAG.getBitcast(MulVT, B)));
29856 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29857 // => <2 x i64> <bf|dh>
29858 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29859 DAG.getBitcast(MulVT, Odd0),
29860 DAG.getBitcast(MulVT, Odd1)));
29861
29862 // Shuffle it back into the right order.
29863 SmallVector<int, 16> ShufMask(NumElts);
29864 for (int i = 0; i != (int)NumElts; ++i)
29865 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29866
29867 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29868
29869 // If we have a signed multiply but no PMULDQ fix up the result of an
29870 // unsigned multiply.
29871 if (IsSigned && !Subtarget.hasSSE41()) {
29872 SDValue Zero = DAG.getConstant(0, dl, VT);
29873 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29874 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29875 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29876 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29877
29878 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29879 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29880 }
29881
29882 return Res;
29883 }
29884
29885 // Only i8 vectors should need custom lowering after this.
29886 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29887 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29888 "Unsupported vector type");
29889
29890 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29891 // logical shift down the upper half and pack back to i8.
29892
29893 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29894 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29895
29896 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29897 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29898 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29899 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29900 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29901 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29902 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29903 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29904 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29905 }
29906
29907 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29908}
29909
29910// Custom lowering for SMULO/UMULO.
29911static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29912 SelectionDAG &DAG) {
29913 MVT VT = Op.getSimpleValueType();
29914
29915 // Scalars defer to LowerXALUO.
29916 if (!VT.isVector())
29917 return LowerXALUO(Op, DAG);
29918
29919 SDLoc dl(Op);
29920 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29921 SDValue A = Op.getOperand(0);
29922 SDValue B = Op.getOperand(1);
29923 EVT OvfVT = Op->getValueType(1);
29924
29925 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29926 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29927 // Extract the LHS Lo/Hi vectors
29928 SDValue LHSLo, LHSHi;
29929 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29930
29931 // Extract the RHS Lo/Hi vectors
29932 SDValue RHSLo, RHSHi;
29933 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29934
29935 EVT LoOvfVT, HiOvfVT;
29936 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29937 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29938 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29939
29940 // Issue the split operations.
29941 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29942 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29943
29944 // Join the separate data results and the overflow results.
29945 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29946 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29947 Hi.getValue(1));
29948
29949 return DAG.getMergeValues({Res, Ovf}, dl);
29950 }
29951
29952 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29953 EVT SetccVT =
29954 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29955
29956 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29957 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29958 unsigned NumElts = VT.getVectorNumElements();
29959 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29960 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29961 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29962 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29963 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29964
29965 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29966
29967 SDValue Ovf;
29968 if (IsSigned) {
29969 SDValue High, LowSign;
29970 if (OvfVT.getVectorElementType() == MVT::i1 &&
29971 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29972        // Rather than truncating, try to do the compare on vXi16 or vXi32.
29973 // Shift the high down filling with sign bits.
29974 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29975 // Fill all 16 bits with the sign bit from the low.
29976 LowSign =
29977 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29978 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29979 15, DAG);
29980 SetccVT = OvfVT;
29981 if (!Subtarget.hasBWI()) {
29982 // We can't do a vXi16 compare so sign extend to v16i32.
29983 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29984 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29985 }
29986 } else {
29987 // Otherwise do the compare at vXi8.
29988 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29989 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29990 LowSign =
29991 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29992 }
29993
29994 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29995 } else {
29996 SDValue High =
29997 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29998 if (OvfVT.getVectorElementType() == MVT::i1 &&
29999 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30000        // Rather than truncating, try to do the compare on vXi16 or vXi32.
30001 SetccVT = OvfVT;
30002 if (!Subtarget.hasBWI()) {
30003 // We can't do a vXi16 compare so sign extend to v16i32.
30004 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30005 }
30006 } else {
30007 // Otherwise do the compare at vXi8.
30008 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30009 }
30010
30011 Ovf =
30012 DAG.getSetCC(dl, SetccVT, High,
30013 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30014 }
30015
30016 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30017
30018 return DAG.getMergeValues({Low, Ovf}, dl);
30019 }
30020
30021 SDValue Low;
30022 SDValue High =
30023 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30024
30025 SDValue Ovf;
30026 if (IsSigned) {
30027 // SMULO overflows if the high bits don't match the sign of the low.
30028 SDValue LowSign =
30029 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30030 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30031 } else {
30032 // UMULO overflows if the high bits are non-zero.
30033 Ovf =
30034 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30035 }
30036
30037 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30038
30039 return DAG.getMergeValues({Low, Ovf}, dl);
30040}
30041
30042SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30043 assert(Subtarget.isTargetWin64() && "Unexpected target");
30044 EVT VT = Op.getValueType();
30045 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30046 "Unexpected return type for lowering");
30047
30048 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30049    SmallVector<SDValue> Result;
30050 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30051 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30052 }
30053
30054 RTLIB::Libcall LC;
30055 bool isSigned;
30056 switch (Op->getOpcode()) {
30057 // clang-format off
30058 default: llvm_unreachable("Unexpected request for libcall!");
30059 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30060 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30061 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30062 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30063 // clang-format on
30064 }
30065
30066 SDLoc dl(Op);
30067 SDValue InChain = DAG.getEntryNode();
30068  TargetLowering::ArgListTy Args;
30068
30070 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30071 EVT ArgVT = Op->getOperand(i).getValueType();
30072 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30073 "Unexpected argument type for lowering");
30074 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30075 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30076 MachinePointerInfo MPI =
30077        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30078 InChain =
30079 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30080 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30081 }
30082
30083  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30084                                         getPointerTy(DAG.getDataLayout()));
30085
30086  TargetLowering::CallLoweringInfo CLI(DAG);
30087 CLI.setDebugLoc(dl)
30088 .setChain(InChain)
30089 .setLibCallee(
30090          getLibcallCallingConv(LC),
30091 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30092 std::move(Args))
30093 .setInRegister()
30094 .setSExtResult(isSigned)
30095 .setZExtResult(!isSigned);
30096
30097 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30098 return DAG.getBitcast(VT, CallInfo.first);
30099}
30100
30101SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30102 SelectionDAG &DAG,
30103 SDValue &Chain) const {
30104 assert(Subtarget.isTargetWin64() && "Unexpected target");
30105 EVT VT = Op.getValueType();
30106 bool IsStrict = Op->isStrictFPOpcode();
30107
30108 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30109 EVT ArgVT = Arg.getValueType();
30110
30111 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30112 "Unexpected return type for lowering");
30113
30114 RTLIB::Libcall LC;
30115 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30116 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30117 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30118 else
30119 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30120 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30121
30122 SDLoc dl(Op);
30123 MakeLibCallOptions CallOptions;
30124 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30125  SDValue Result;
30125
30127  // Expect the i128 result to be returned as a v2i64 in xmm0; cast it back
30128  // to the expected VT (i128).
30129 std::tie(Result, Chain) =
30130 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30131 Result = DAG.getBitcast(VT, Result);
30132 return Result;
30133}
30134
30135SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30136 SelectionDAG &DAG) const {
30137 assert(Subtarget.isTargetWin64() && "Unexpected target");
30138 EVT VT = Op.getValueType();
30139 bool IsStrict = Op->isStrictFPOpcode();
30140
30141 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30142 EVT ArgVT = Arg.getValueType();
30143
30144 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30145 "Unexpected argument type for lowering");
30146
30147 RTLIB::Libcall LC;
30148 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30149 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30150 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30151 else
30152 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30153 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30154
30155 SDLoc dl(Op);
30156 MakeLibCallOptions CallOptions;
30157 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30158
30159 // Pass the i128 argument as an indirect argument on the stack.
30160 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30161 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30162 MachinePointerInfo MPI =
30163      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30164 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30165
30166  SDValue Result;
30167 std::tie(Result, Chain) =
30168 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30169 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30170}
30171
30172// Return true if the required (according to Opcode) shift-imm form is natively
30173// supported by the Subtarget
30174static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30175 unsigned Opcode) {
30176 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30177 "Unexpected shift opcode");
30178
30179 if (!VT.isSimple())
30180 return false;
30181
30182 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30183 return false;
30184
30185 if (VT.getScalarSizeInBits() < 16)
30186 return false;
30187
30188 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30189 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30190 return true;
30191
30192 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30193 (VT.is256BitVector() && Subtarget.hasInt256());
30194
30195 bool AShift = LShift && (Subtarget.hasAVX512() ||
30196 (VT != MVT::v2i64 && VT != MVT::v4i64));
30197 return (Opcode == ISD::SRA) ? AShift : LShift;
30198}
30199
30200// The shift amount is a variable, but it is the same for all vector lanes.
30201// These instructions are defined together with shift-immediate.
30202static
30203bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30204 unsigned Opcode) {
30205 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30206}
30207
30208// Return true if the required (according to Opcode) variable-shift form is
30209// natively supported by the Subtarget
30210static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30211 unsigned Opcode) {
30212 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30213 "Unexpected shift opcode");
30214
30215 if (!VT.isSimple())
30216 return false;
30217
30218 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30219 return false;
30220
30221 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30222 return false;
30223
30224 // vXi16 supported only on AVX-512, BWI
30225 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30226 return false;
30227
30228 if (Subtarget.hasAVX512() &&
30229 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30230 return true;
30231
30232 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30233 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30234 return (Opcode == ISD::SRA) ? AShift : LShift;
30235}
30236
30237static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30238 const X86Subtarget &Subtarget) {
30239 MVT VT = Op.getSimpleValueType();
30240 SDLoc dl(Op);
30241 SDValue R = Op.getOperand(0);
30242 SDValue Amt = Op.getOperand(1);
30243 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30244 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30245
30246 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30247 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30248 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30249 SDValue Ex = DAG.getBitcast(ExVT, R);
30250
30251 // ashr(R, 63) === cmp_slt(R, 0)
30252 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30253 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30254 "Unsupported PCMPGT op");
30255 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30256 }
30257
30258 if (ShiftAmt >= 32) {
30259 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30260 SDValue Upper =
30261 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30262      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30263 ShiftAmt - 32, DAG);
30264 if (VT == MVT::v2i64)
30265 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30266 if (VT == MVT::v4i64)
30267 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30268 {9, 1, 11, 3, 13, 5, 15, 7});
30269 } else {
30270 // SRA upper i32, SRL whole i64 and select lower i32.
30271      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30272 ShiftAmt, DAG);
30273 SDValue Lower =
30274 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30275 Lower = DAG.getBitcast(ExVT, Lower);
30276 if (VT == MVT::v2i64)
30277 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30278 if (VT == MVT::v4i64)
30279 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30280 {8, 1, 10, 3, 12, 5, 14, 7});
30281 }
30282 return DAG.getBitcast(VT, Ex);
30283 };
30284
30285 // Optimize shl/srl/sra with constant shift amount.
30286 APInt APIntShiftAmt;
30287 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30288 return SDValue();
30289
30290 // If the shift amount is out of range, return undef.
30291 if (APIntShiftAmt.uge(EltSizeInBits))
30292 return DAG.getUNDEF(VT);
30293
30294 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30295
30296 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30297    // Hardware support for vector shifts is sparse, which makes us scalarize the
30298 // vector operations in many cases. Also, on sandybridge ADD is faster than
30299 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30300 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30301 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30302 // must be 0). (add undef, undef) however can be any value. To make this
30303 // safe, we must freeze R to ensure that register allocation uses the same
30304 // register for an undefined value. This ensures that the result will
30305 // still be even and preserves the original semantics.
30306 R = DAG.getFreeze(R);
30307 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30308 }
30309
30310 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30311 }
30312
30313 // i64 SRA needs to be performed as partial shifts.
30314 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30315 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30316 Op.getOpcode() == ISD::SRA)
30317 return ArithmeticShiftRight64(ShiftAmt);
30318
30319 // If we're logical shifting an all-signbits value then we can just perform as
30320 // a mask.
30321 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30322 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30323 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30324 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30325 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30326 }
30327
30328 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30329 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30330 unsigned NumElts = VT.getVectorNumElements();
30331 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30332
30333 // Simple i8 add case
30334 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30335 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30336 // must be 0). (add undef, undef) however can be any value. To make this
30337 // safe, we must freeze R to ensure that register allocation uses the same
30338 // register for an undefined value. This ensures that the result will
30339 // still be even and preserves the original semantics.
30340 R = DAG.getFreeze(R);
30341 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30342 }
30343
30344 // ashr(R, 7) === cmp_slt(R, 0)
30345 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30346 SDValue Zeros = DAG.getConstant(0, dl, VT);
30347 if (VT.is512BitVector()) {
30348 assert(VT == MVT::v64i8 && "Unexpected element type!");
30349 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30350 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30351 }
30352 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30353 }
30354
30355 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30356 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30357 return SDValue();
30358
30359 if (Subtarget.hasGFNI()) {
30360 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30361 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30362 DAG.getTargetConstant(0, dl, MVT::i8));
30363 }
30364
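    // The SHL/SRL cases below emulate the vXi8 shift as a wider vXi16 shift
    // plus a mask that clears the bits pulled in from the neighbouring byte
    // (e.g. a per-byte shl by 3 is a vXi16 shl by 3 followed by an AND with
    // 0xF8 in every byte); SRA is then built from SRL with an xor/sub fixup.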
30365 if (Op.getOpcode() == ISD::SHL) {
30366 // Make a large shift.
30367 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30368 ShiftAmt, DAG);
30369 SHL = DAG.getBitcast(VT, SHL);
30370 // Zero out the rightmost bits.
30371 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30372 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30373 }
30374 if (Op.getOpcode() == ISD::SRL) {
30375 // Make a large shift.
30376 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30377 ShiftAmt, DAG);
30378 SRL = DAG.getBitcast(VT, SRL);
30379 // Zero out the leftmost bits.
30380 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30381 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30382 }
30383 if (Op.getOpcode() == ISD::SRA) {
30384 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
30385 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30386
30387 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30388 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30389 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30390 return Res;
30391 }
30392 llvm_unreachable("Unknown shift opcode.");
30393 }
30394
30395 return SDValue();
30396}
30397
30398static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30399 const X86Subtarget &Subtarget) {
30400 MVT VT = Op.getSimpleValueType();
30401 SDLoc dl(Op);
30402 SDValue R = Op.getOperand(0);
30403 SDValue Amt = Op.getOperand(1);
30404 unsigned Opcode = Op.getOpcode();
30405 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30406
30407 int BaseShAmtIdx = -1;
30408 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30409 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30410 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30411 Subtarget, DAG);
30412
30413 // vXi8 shifts - shift as v8i16 + mask result.
30414 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30415 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30416 VT == MVT::v64i8) &&
30417 !Subtarget.hasXOP()) {
30418 unsigned NumElts = VT.getVectorNumElements();
30419 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30420 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30421 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30422 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30423
30424 // Create the mask using vXi16 shifts. For shift-rights we need to move
30425 // the upper byte down before splatting the vXi8 mask.
30426 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30427 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30428 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30429 if (Opcode != ISD::SHL)
30430 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30431 8, DAG);
30432 BitMask = DAG.getBitcast(VT, BitMask);
30433 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30434 SmallVector<int, 64>(NumElts, 0));
30435
30436 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30437 DAG.getBitcast(ExtVT, R), BaseShAmt,
30438 BaseShAmtIdx, Subtarget, DAG);
30439 Res = DAG.getBitcast(VT, Res);
30440 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30441
30442 if (Opcode == ISD::SRA) {
30443 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30444 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30445 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30446 SignMask =
30447 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30448 BaseShAmtIdx, Subtarget, DAG);
30449 SignMask = DAG.getBitcast(VT, SignMask);
30450 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30451 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30452 }
30453 return Res;
30454 }
30455 }
30456 }
30457
30458 return SDValue();
30459}
30460
30461// Convert a shift/rotate left amount to a multiplication scale factor.
30462static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30463 const X86Subtarget &Subtarget,
30464 SelectionDAG &DAG) {
30465 MVT VT = Amt.getSimpleValueType();
30466 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30467 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30468 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30469 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30470 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30471 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30472 return SDValue();
30473
30474 MVT SVT = VT.getVectorElementType();
30475 unsigned SVTBits = SVT.getSizeInBits();
30476 unsigned NumElems = VT.getVectorNumElements();
30477
30478 APInt UndefElts;
30479 SmallVector<APInt> EltBits;
30480 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30481 APInt One(SVTBits, 1);
30482 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30483 for (unsigned I = 0; I != NumElems; ++I) {
30484 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30485 continue;
30486 uint64_t ShAmt = EltBits[I].getZExtValue();
30487 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30488 }
30489 return DAG.getBuildVector(VT, dl, Elts);
30490 }
30491
30492 // If the target doesn't support variable shifts, use either FP conversion
30493 // or integer multiplication to avoid shifting each element individually.
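// As a sanity check with an arbitrary element value: for Amt = 5, (5 << 23) +
// 0x3f800000 = 0x42000000, which is the IEEE-754 encoding of 32.0f = 2^5, so
// FP_TO_SINT recovers the scale 32 and the caller's MUL by 32 equals SHL by 5
// (Amt = 0 similarly yields 1.0f and a scale of 1).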
30494 if (VT == MVT::v4i32) {
30495 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30496 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30497 DAG.getConstant(0x3f800000U, dl, VT));
30498 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30499 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30500 }
30501
30502 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30503 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30504 SDValue Z = DAG.getConstant(0, dl, VT);
30505 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30506 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30507 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30508 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30509 if (Subtarget.hasSSE41())
30510 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30511 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30512 }
30513
30514 return SDValue();
30515}
30516
30517static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30518 SelectionDAG &DAG) {
30519 MVT VT = Op.getSimpleValueType();
30520 SDLoc dl(Op);
30521 SDValue R = Op.getOperand(0);
30522 SDValue Amt = Op.getOperand(1);
30523 unsigned NumElts = VT.getVectorNumElements();
30524 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30525 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30526
30527 unsigned Opc = Op.getOpcode();
30528 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30529 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30530
30531 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30532 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30533
30534 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30535 return V;
30536
30537 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30538 return V;
30539
30540 if (supportedVectorVarShift(VT, Subtarget, Opc))
30541 return Op;
30542
30543 // i64 vector arithmetic shift can be emulated with the transform:
30544 // M = lshr(SIGN_MASK, Amt)
30545 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30546 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30547 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30548 Opc == ISD::SRA) {
30549 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30550 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30551 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30552 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30553 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30554 return R;
30555 }
30556
30557 // XOP has 128-bit variable logical/arithmetic shifts.
30558 // +ve/-ve Amt = shift left/right.
30559 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30560 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30561 if (Opc == ISD::SRL || Opc == ISD::SRA)
30562 Amt = DAG.getNegative(Amt, dl, VT);
30563 if (Opc == ISD::SHL || Opc == ISD::SRL)
30564 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30565 if (Opc == ISD::SRA)
30566 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30567 }
30568
30569 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
30570 // shifts per-lane and then shuffle the partial results back together.
30571 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30572 // Splat the shift amounts so the scalar shifts above will catch it.
30573 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30574 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30575 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30576 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30577 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30578 }
30579
30580 // Build a map of in-range constant amounts with the element mask where they occur.
30581 SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30582 if (ConstantAmt) {
30583 for (unsigned I = 0; I != NumElts; ++I) {
30584 SDValue A = Amt.getOperand(I);
30585 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30586 continue;
30587 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30588 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30589 if (!Inserted) {
30590 It->second.setBit(I);
30591 continue;
30592 }
30593 It->second = APInt::getOneBitSet(NumElts, I);
30594 }
30595 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30596 }
30597
30598 // If possible, lower this shift as a sequence of two shifts by
30599 // constant plus a BLENDing shuffle instead of scalarizing it.
30600 // Example:
30601 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30602 //
30603 // Could be rewritten as:
30604 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30605 //
30606 // The advantage is that the two shifts from the example would be
30607 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30608 if (UniqueCstAmt.size() == 2 &&
30609 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30610 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30611 unsigned AmtA = UniqueCstAmt.begin()->first;
30612 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30613 const APInt &MaskA = UniqueCstAmt.begin()->second;
30614 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30615 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30616 for (unsigned I = 0; I != NumElts; ++I) {
30617 if (MaskA[I])
30618 ShuffleMask[I] = I;
30619 if (MaskB[I])
30620 ShuffleMask[I] = I + NumElts;
30621 }
30622
30623 // Only perform this blend if we can perform it without loading a mask.
30624 if ((VT != MVT::v16i16 ||
30625 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30626 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30627 canWidenShuffleElements(ShuffleMask))) {
30628 SDValue Shift1 =
30629 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30630 SDValue Shift2 =
30631 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30632 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30633 }
30634 }
30635
30636 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30637 // using vYiM vector operations where X*N == Y*M and M > N.
30638 if (ConstantAmt &&
30639 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30640 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30641 !Subtarget.hasXOP()) {
30642 MVT NarrowScalarVT = VT.getScalarType();
30643 // We can do this extra fast if each pair of narrow elements is shifted by
30644 // the same amount by doing this SWAR style: use a shift to move the valid
30645 // bits to the right position, mask out any bits which crossed from one
30646 // element to the other.
30647 // This optimized lowering is only valid if the elements in a pair can
30648 // be treated identically.
30649 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30650 SmallVector<SDValue, 32> TmpAmtWideElts;
30651 int WideEltSizeInBits = EltSizeInBits;
30652 while (WideEltSizeInBits < 32) {
30653 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30654 // unprofitable.
30655 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30656 break;
30657 }
30658 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30659 bool SameShifts = true;
30660 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30661 unsigned DstI = SrcI / 2;
30662 // Both elements are undef? Make a note and keep going.
30663 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30664 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30665 continue;
30666 }
30667 // Even element is undef? We will shift it by the same shift amount as
30668 // the odd element.
30669 if (AmtWideElts[SrcI].isUndef()) {
30670 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30671 continue;
30672 }
30673 // Odd element is undef? We will shift it by the same shift amount as
30674 // the even element.
30675 if (AmtWideElts[SrcI + 1].isUndef()) {
30676 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30677 continue;
30678 }
30679 // Both elements are equal.
30680 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30681 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30682 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30683 continue;
30684 }
30685 // One of the provisional wide elements will not have the same shift
30686 // amount. Let's bail.
30687 SameShifts = false;
30688 break;
30689 }
30690 if (!SameShifts) {
30691 break;
30692 }
30693 WideEltSizeInBits *= 2;
30694 std::swap(TmpAmtWideElts, AmtWideElts);
30695 }
30696 APInt APIntShiftAmt;
30697 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30698 bool Profitable = WidenShift;
30699 // AVX512BW brings support for vpsllvw.
30700 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30701 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30702 Profitable = false;
30703 }
30704 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30705 // fairly cheaply in other ways.
30706 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30707 Profitable = false;
30708 }
30709 // Leave it up to GFNI if we have it around.
30710 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30711 // is probably a win to use other strategies in some cases.
30712 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30713 Profitable = false;
30714 }
30715
30716 // AVX1 does not have vpand which makes our masking impractical. It does
30717 // have vandps but that is an FP instruction and crossing FP<->int typically
30718 // has some cost.
30719 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30720 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30721 Profitable = false;
30722 }
30723 unsigned WideNumElts = AmtWideElts.size();
30724 // We are only dealing with identical pairs.
30725 if (Profitable && WideNumElts != NumElts) {
30726 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30727 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30728 // Cast the operand to vXiM.
30729 SDValue RWide = DAG.getBitcast(WideVT, R);
30730 // Create our new vector of shift amounts.
30731 SDValue AmtWide = DAG.getBuildVector(
30732 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30733 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30734 // Perform the actual shift.
30735 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30736 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30737 // Now we need to construct a mask which will "drop" bits that get
30738 // shifted past the LSB/MSB. For a logical shift left, it will look
30739 // like:
30740 // FullMask = (1 << EltSizeInBits) - 1
30741 // Mask = FullMask << Amt
30742 //
30743 // This masking ensures that bits cannot migrate from one narrow lane to
30744 // another. The construction of this mask will be constant folded.
30745 // The mask for a logical right shift is nearly identical, the only
30746 // difference is that the all ones mask is shifted right instead of left.
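// Small example (values picked for exposition): for vXi8 with every pair
// shifted right by 2, the wide logical shift drags 2 bits across each byte
// boundary; the constant-folded narrow shift of the all-ones vector gives a
// per-byte mask of 0xFF >> 2 = 0x3F, and the AND below discards exactly the
// migrated bits (for SHL by 2 the mask would be 0xFF << 2 = 0xFC instead).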
30747 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30748 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30749 Mask = DAG.getBitcast(WideVT, Mask);
30750 // Finally, we mask the shifted vector with the SWAR mask.
30751 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30752 Masked = DAG.getBitcast(VT, Masked);
30753 if (Opc != ISD::SRA) {
30754 // Logical shifts are complete at this point.
30755 return Masked;
30756 }
30757 // At this point, we have done a *logical* shift right. We now need to
30758 // sign extend the result so that we get behavior equivalent to an
30759 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30760 // are `EltSizeInBits-AmtWide` bits wide.
30761 //
30762 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30763 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30764 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30765 // can use the following trick to accomplish this:
30766 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30767 // (Masked ^ SignBitMask) - SignBitMask
30768 //
30769 // When the sign bit is already clear, this will compute:
30770 // Masked + SignBitMask - SignBitMask
30771 //
30772 // This is equal to Masked which is what we want: the sign bit was clear
30773 // so sign extending should be a no-op.
30774 //
30775 // When the sign bit is set, this will compute:
30776 // Masked - SignBitmask - SignBitMask
30777 //
30778 // This is equal to Masked - 2*SignBitMask which will correctly sign
30779 // extend our result.
30780 SDValue SplatHighBit =
30781 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30782 // This does not induce recursion, all operands are constants.
30783 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30784 SDValue FlippedSignBit =
30785 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30786 SDValue Subtraction =
30787 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30788 return Subtraction;
30789 }
30790 }
30791
30792 // If possible, lower this packed shift into a vector multiply instead of
30793 // expanding it into a sequence of scalar shifts.
30794 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30795 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30796 Subtarget.canExtendTo512BW())))
30797 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30798 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30799
30800 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30801 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
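// For instance (illustrative values only): x >> 5 on i16 becomes
// MULHU(x, 1 << 11), since (x * 0x0800) >> 16 == x >> 5; e.g. 0xFFFF gives
// (0xFFFF * 0x0800) >> 16 = 0x07FF. A zero amount would need a scale of
// 1 << 16, which doesn't fit in i16, hence the SETEQ/select guard below.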
30802 if (Opc == ISD::SRL && ConstantAmt &&
30803 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30804 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30805 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30806 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30807 SDValue Zero = DAG.getConstant(0, dl, VT);
30808 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30809 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30810 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30811 }
30812 }
30813
30814 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30815 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30816 // TODO: Special case handling for shift by 0/1, really we can afford either
30817 // of these cases in pre-SSE41/XOP/AVX512 but not both.
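// As a numeric check (values chosen for illustration): x >> 2 (arithmetic) on
// i16 becomes MULHS(x, 1 << 14); for x = -100, (-100 * 16384) >> 16 = -25 =
// -100 >> 2. A shift by 0 would need 1 << 16 and a shift by 1 would use
// 1 << 15, which is negative as an i16, so both cases are routed through the
// Amt0/Amt1 selects below.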
30818 if (Opc == ISD::SRA && ConstantAmt &&
30819 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30820 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30821 !Subtarget.hasAVX512()) ||
30822 DAG.isKnownNeverZero(Amt))) {
30823 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30824 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30825 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30826 SDValue Amt0 =
30827 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30828 SDValue Amt1 =
30829 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30830 SDValue Sra1 =
30831 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30832 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30833 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30834 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30835 }
30836 }
30837
30838 // v4i32 Non Uniform Shifts.
30839 // If the shift amount is constant we can shift each lane using the SSE2
30840 // immediate shifts, else we need to zero-extend each lane to the lower i64
30841 // and shift using the SSE2 variable shifts.
30842 // The separate results can then be blended together.
30843 if (VT == MVT::v4i32) {
30844 SDValue Amt0, Amt1, Amt2, Amt3;
30845 if (ConstantAmt) {
30846 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30847 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30848 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30849 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30850 } else {
30851 // The SSE2 shifts use the lower i64 as the same shift amount for
30852 // all lanes and the upper i64 is ignored. On AVX we're better off
30853 // just zero-extending, but for SSE just duplicating the top 16-bits is
30854 // cheaper and has the same effect for out of range values.
30855 if (Subtarget.hasAVX()) {
30856 SDValue Z = DAG.getConstant(0, dl, VT);
30857 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30858 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30859 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30860 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30861 } else {
30862 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30863 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30864 {4, 5, 6, 7, -1, -1, -1, -1});
30865 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30866 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30867 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30868 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30869 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30870 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30871 }
30872 }
30873
30874 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30875 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30876 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30877 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30878 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30879
30880 // Merge the shifted lane results optimally with/without PBLENDW.
30881 // TODO - ideally shuffle combining would handle this.
30882 if (Subtarget.hasSSE41()) {
30883 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30884 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30885 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30886 }
30887 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30888 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30889 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30890 }
30891
30892 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30893 // look up the pre-computed shift values.
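// For example (a hypothetical lane, not derived from real input): if every
// byte of a 128-bit lane of R is the constant 0x80 and Opc is ISD::SRL, the
// lane's 16-entry table built below holds 0x80 >> 0 .. 0x80 >> 7 followed by
// eight zeros, so a per-byte shift amount of 3 makes PSHUFB pick 0x10, and
// amounts of 8 or more land in the zero entries.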
30894 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30895 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30896 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30897 unsigned NumLanes = VT.getSizeInBits() / 128u;
30898 unsigned NumEltsPerLane = NumElts / NumLanes;
30899 SmallVector<APInt, 64> LUT;
30900 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30901 unsigned LoElt = Lane * NumEltsPerLane;
30902 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30903 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30904 if (!KnownLane.isConstant())
30905 break;
30906 const APInt &LaneSplat = KnownLane.getConstant();
30907 for (unsigned I = 0; I != 8; ++I) {
30908 if (Opc == ISD::SHL)
30909 LUT.push_back(LaneSplat.shl(I));
30910 else if (Opc == ISD::SRL)
30911 LUT.push_back(LaneSplat.lshr(I));
30912 else if (Opc == ISD::SRA)
30913 LUT.push_back(LaneSplat.ashr(I));
30914 }
30915 LUT.append(8, APInt::getZero(8));
30916 }
30917 if (LUT.size() == NumElts) {
30918 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30919 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30920 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30921 }
30922 }
30923
30924 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30925 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30926 // make the existing SSE solution better.
30927 // NOTE: We honor preferred vector width before promoting to 512-bits.
30928 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30929 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30930 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30931 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30932 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30933 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30934 "Unexpected vector type");
30935 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30936 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30937 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30938 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30939 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30940 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30941 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30942 }
30943
30944 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30945 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30946 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30947 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30948 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30949 !Subtarget.hasXOP()) {
30950 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30951 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30952
30953 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30954 // isn't legal).
30955 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30956 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30957 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30958 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30960 "Constant build vector expected");
30961
30962 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30963 bool IsSigned = Opc == ISD::SRA;
30964 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30965 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30966 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30967 return DAG.getZExtOrTrunc(R, dl, VT);
30968 }
30969
30970 SmallVector<SDValue, 16> LoAmt, HiAmt;
30971 for (unsigned i = 0; i != NumElts; i += 16) {
30972 for (int j = 0; j != 8; ++j) {
30973 LoAmt.push_back(Amt.getOperand(i + j));
30974 HiAmt.push_back(Amt.getOperand(i + j + 8));
30975 }
30976 }
30977
30978 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30979 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30980
30981 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30982 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30983 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30984 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30985 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30986 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30987 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30988 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30989 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30990 }
30991
30992 if (VT == MVT::v16i8 ||
30993 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30994 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30995 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30996
30997 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30998 if (VT.is512BitVector()) {
30999 // On AVX512BW targets we make use of the fact that VSELECT lowers
31000 // to a masked blend which selects bytes based just on the sign bit
31001 // extracted to a mask.
31002 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31003 V0 = DAG.getBitcast(VT, V0);
31004 V1 = DAG.getBitcast(VT, V1);
31005 Sel = DAG.getBitcast(VT, Sel);
31006 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31007 ISD::SETGT);
31008 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31009 } else if (Subtarget.hasSSE41()) {
31010 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31011 // on the sign bit.
31012 V0 = DAG.getBitcast(VT, V0);
31013 V1 = DAG.getBitcast(VT, V1);
31014 Sel = DAG.getBitcast(VT, Sel);
31015 return DAG.getBitcast(SelVT,
31016 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31017 }
31018 // On pre-SSE41 targets we test for the sign bit by comparing to
31019 // zero - a negative value will set all bits of the lanes to true
31020 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31021 SDValue Z = DAG.getConstant(0, dl, SelVT);
31022 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31023 return DAG.getSelect(dl, SelVT, C, V0, V1);
31024 };
31025
31026 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31027 // We can safely do this using i16 shifts as we're only interested in
31028 // the 3 lower bits of each byte.
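// To see the staging with a concrete amount (illustrative only): a byte
// amount of 5 (0b101) shifted left by 5 puts amount bit 2 in the sign bit,
// so the first blend applies the shift-by-4 step; after a += a the sign bit
// holds amount bit 1 (clear, so the shift-by-2 step is skipped), and after
// another a += a it holds bit 0 (set, shift-by-1 applied), totalling 5.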
31029 Amt = DAG.getBitcast(ExtVT, Amt);
31030 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31031 Amt = DAG.getBitcast(VT, Amt);
31032
31033 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31034 // r = VSELECT(r, shift(r, 4), a);
31035 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31036 R = SignBitSelect(VT, Amt, M, R);
31037
31038 // a += a
31039 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31040
31041 // r = VSELECT(r, shift(r, 2), a);
31042 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31043 R = SignBitSelect(VT, Amt, M, R);
31044
31045 // a += a
31046 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31047
31048 // return VSELECT(r, shift(r, 1), a);
31049 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31050 R = SignBitSelect(VT, Amt, M, R);
31051 return R;
31052 }
31053
31054 if (Opc == ISD::SRA) {
31055 // For SRA we need to unpack each byte to the higher byte of an i16 vector
31056 // so we can correctly sign extend. We don't care what happens to the
31057 // lower byte.
31058 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31059 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31060 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31061 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31062 ALo = DAG.getBitcast(ExtVT, ALo);
31063 AHi = DAG.getBitcast(ExtVT, AHi);
31064 RLo = DAG.getBitcast(ExtVT, RLo);
31065 RHi = DAG.getBitcast(ExtVT, RHi);
31066
31067 // r = VSELECT(r, shift(r, 4), a);
31068 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31069 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31070 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31071 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31072
31073 // a += a
31074 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31075 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31076
31077 // r = VSELECT(r, shift(r, 2), a);
31078 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31079 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31080 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31081 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31082
31083 // a += a
31084 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31085 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31086
31087 // r = VSELECT(r, shift(r, 1), a);
31088 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31089 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31090 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31091 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31092
31093 // Logical shift the result back to the lower byte, leaving a zero upper
31094 // byte meaning that we can safely pack with PACKUSWB.
31095 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31096 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31097 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31098 }
31099 }
31100
31101 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31102 MVT ExtVT = MVT::v8i32;
31103 SDValue Z = DAG.getConstant(0, dl, VT);
31104 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31105 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31106 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31107 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31108 ALo = DAG.getBitcast(ExtVT, ALo);
31109 AHi = DAG.getBitcast(ExtVT, AHi);
31110 RLo = DAG.getBitcast(ExtVT, RLo);
31111 RHi = DAG.getBitcast(ExtVT, RHi);
31112 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31113 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31114 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31115 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31116 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31117 }
31118
31119 if (VT == MVT::v8i16) {
31120 // If we have a constant shift amount, the non-SSE41 path is best as
31121 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31122 bool UseSSE41 = Subtarget.hasSSE41() &&
31123 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31124
31125 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31126 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31127 // the sign bit.
31128 if (UseSSE41) {
31129 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31130 V0 = DAG.getBitcast(ExtVT, V0);
31131 V1 = DAG.getBitcast(ExtVT, V1);
31132 Sel = DAG.getBitcast(ExtVT, Sel);
31133 return DAG.getBitcast(
31134 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31135 }
31136 // On pre-SSE41 targets we splat the sign bit - a negative value will
31137 // set all bits of the lanes to true and VSELECT uses that in
31138 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31139 SDValue C =
31140 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31141 return DAG.getSelect(dl, VT, C, V0, V1);
31142 };
31143
31144 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31145 if (UseSSE41) {
31146 // On SSE41 targets we need to replicate the shift mask in both
31147 // bytes for PBLENDVB.
31148 Amt = DAG.getNode(
31149 ISD::OR, dl, VT,
31150 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31151 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31152 } else {
31153 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31154 }
31155
31156 // r = VSELECT(r, shift(r, 8), a);
31157 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31158 R = SignBitSelect(Amt, M, R);
31159
31160 // a += a
31161 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31162
31163 // r = VSELECT(r, shift(r, 4), a);
31164 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31165 R = SignBitSelect(Amt, M, R);
31166
31167 // a += a
31168 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31169
31170 // r = VSELECT(r, shift(r, 2), a);
31171 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31172 R = SignBitSelect(Amt, M, R);
31173
31174 // a += a
31175 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31176
31177 // return VSELECT(r, shift(r, 1), a);
31178 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31179 R = SignBitSelect(Amt, M, R);
31180 return R;
31181 }
31182
31183 // Decompose 256-bit shifts into 128-bit shifts.
31184 if (VT.is256BitVector())
31185 return splitVectorIntBinary(Op, DAG, dl);
31186
31187 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31188 return splitVectorIntBinary(Op, DAG, dl);
31189
31190 return SDValue();
31191}
31192
31193static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31194 SelectionDAG &DAG) {
31195 MVT VT = Op.getSimpleValueType();
31196 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31197 "Unexpected funnel shift opcode!");
31198
31199 SDLoc DL(Op);
31200 SDValue Op0 = Op.getOperand(0);
31201 SDValue Op1 = Op.getOperand(1);
31202 SDValue Amt = Op.getOperand(2);
31203 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31204 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31205
31206 if (VT.isVector()) {
31207 APInt APIntShiftAmt;
31208 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31209 unsigned NumElts = VT.getVectorNumElements();
31210
31211 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31212 if (IsFSHR)
31213 std::swap(Op0, Op1);
31214
31215 if (IsCstSplat) {
31216 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31217 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31218 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31219 {Op0, Op1, Imm}, DAG, Subtarget);
31220 }
31221 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31222 {Op0, Op1, Amt}, DAG, Subtarget);
31223 }
31224 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31225 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31226 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31227 "Unexpected funnel shift type!");
31228
31229 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31230 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
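// Per-element sketch with arbitrary i8 values: for x = 0xAB, y = 0xCD and
// z = 4, the concatenated 16-bit value 0xABCD shifted left by 4 is 0xBCD0,
// whose high byte 0xBC equals fshl(x, y, 4) = (x << 4) | (y >> 4); the
// lowering below applies the same idea vector-wide.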
31231 if (IsCstSplat) {
31232 // TODO: Can't use generic expansion as UNDEF amt elements can be
31233 // converted to other values when folded to shift amounts, losing the
31234 // splat.
31235 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31236 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31237 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31238 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31239 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31240
31241 if (EltSizeInBits == 8 &&
31242 (Subtarget.hasXOP() ||
31243 (useVPTERNLOG(Subtarget, VT) &&
31244 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31245 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31246 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31247 // the original vector width to handle cases where we split.
31248 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31249 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31250 SDValue ShX =
31251 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31252 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31253 SDValue ShY =
31254 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31255 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31256 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31257 DAG.getConstant(MaskX, DL, VT));
31258 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31259 DAG.getConstant(MaskY, DL, VT));
31260 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31261 }
31262
31263 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31264 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31265 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31266 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31267 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31268 }
31269
31270 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31271 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31272 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31273
31274 // Constant vXi16 funnel shifts can be efficiently handled by default.
31275 if (IsCst && EltSizeInBits == 16)
31276 return SDValue();
31277
31278 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31279 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31280 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31281
31282 // Split 256-bit integers on XOP/pre-AVX2 targets.
31283 // Split 512-bit integers on non 512-bit BWI targets.
31284 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31285 !Subtarget.hasAVX2())) ||
31286 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31287 EltSizeInBits < 32)) {
31288 // Pre-mask the amount modulo using the wider vector.
31289 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31290 return splitVectorOp(Op, DAG, DL);
31291 }
31292
31293 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31294 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31295 int ScalarAmtIdx = -1;
31296 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31297 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31298 if (EltSizeInBits == 16)
31299 return SDValue();
31300
31301 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31302 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31303 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31304 ScalarAmtIdx, Subtarget, DAG);
31305 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31306 ScalarAmtIdx, Subtarget, DAG);
31307 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31308 }
31309 }
31310
31311 MVT WideSVT = MVT::getIntegerVT(
31312 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31313 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31314
31315 // If per-element shifts are legal, fallback to generic expansion.
31316 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31317 return SDValue();
31318
31319 // Attempt to fold as:
31320 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31321 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31322 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31323 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31324 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31325 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31326 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31327 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31328 EltSizeInBits, DAG);
31329 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31330 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31331 if (!IsFSHR)
31332 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31333 EltSizeInBits, DAG);
31334 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31335 }
31336
31337 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31338 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31339 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31340 SDValue Z = DAG.getConstant(0, DL, VT);
31341 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31342 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31343 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31344 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31345 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31346 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31347 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31348 }
31349
31350 // Fallback to generic expansion.
31351 return SDValue();
31352 }
31353 assert(
31354 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31355 "Unexpected funnel shift type!");
31356
31357 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31358 bool OptForSize = DAG.shouldOptForSize();
31359 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31360
31361 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31362 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31363 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31364 !isa<ConstantSDNode>(Amt)) {
31365 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31366 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31367 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31368 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31369 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31370 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31371 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31372 if (IsFSHR) {
31373 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31374 } else {
31375 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31376 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31377 }
31378 return DAG.getZExtOrTrunc(Res, DL, VT);
31379 }
31380
31381 if (VT == MVT::i8 || ExpandFunnel)
31382 return SDValue();
31383
31384 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31385 if (VT == MVT::i16) {
31386 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31387 DAG.getConstant(15, DL, Amt.getValueType()));
31388 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31389 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31390 }
31391
31392 return Op;
31393}
31394
31395static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31396 SelectionDAG &DAG) {
31397 MVT VT = Op.getSimpleValueType();
31398 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31399
31400 SDLoc DL(Op);
31401 SDValue R = Op.getOperand(0);
31402 SDValue Amt = Op.getOperand(1);
31403 unsigned Opcode = Op.getOpcode();
31404 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31405 int NumElts = VT.getVectorNumElements();
31406 bool IsROTL = Opcode == ISD::ROTL;
31407
31408 // Check for constant splat rotation amount.
31409 APInt CstSplatValue;
31410 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31411
31412 // Check for splat rotate by zero.
31413 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31414 return R;
31415
31416 // AVX512 implicitly uses modulo rotation amounts.
31417 if ((Subtarget.hasVLX() ||
31418 (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
31419 32 <= EltSizeInBits) {
31420 // Attempt to rotate by immediate.
31421 if (IsCstSplat) {
31422 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31423 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31424 return DAG.getNode(RotOpc, DL, VT, R,
31425 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31426 }
31427
31428 // Else, fall-back on VPROLV/VPRORV.
31429 return Op;
31430 }
31431
31432 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31433 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31434 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31435 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31436 }
31437
31438 SDValue Z = DAG.getConstant(0, DL, VT);
31439
31440 if (!IsROTL) {
31441 // If the ISD::ROTR amount is constant, we're always better converting to
31442 // ISD::ROTL.
31443 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31444 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31445
31446 // XOP targets always prefer ISD::ROTL.
31447 if (Subtarget.hasXOP())
31448 return DAG.getNode(ISD::ROTL, DL, VT, R,
31449 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31450 }
31451
31452 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31453 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31454 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31455 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31456 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31457 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31458 DAG.getTargetConstant(0, DL, MVT::i8));
31459 }
31460
31461 // Split 256-bit integers on XOP/pre-AVX2 targets.
31462 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31463 return splitVectorIntBinary(Op, DAG, DL);
31464
31465 // XOP has 128-bit vector variable + immediate rotates.
31466 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31467 // XOP implicitly uses modulo rotation amounts.
31468 if (Subtarget.hasXOP()) {
31469 assert(IsROTL && "Only ROTL expected");
31470 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31471
31472 // Attempt to rotate by immediate.
31473 if (IsCstSplat) {
31474 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31475 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31476 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31477 }
31478
31479 // Use general rotate by variable (per-element).
31480 return Op;
31481 }
31482
31483 // Rotate by a uniform constant - expand back to shifts.
31484 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31485 // to other values when folded to shift amounts, losing the splat.
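// Concretely (an arbitrary i8 example): rotl(x, 3) = (x << 3) | (x >> 5), so
// x = 0xC1 becomes 0x08 | 0x06 = 0x0E; the expansion below just emits the two
// shifts with complementary amounts and ORs them.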
31486 if (IsCstSplat) {
31487 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31488 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31489 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31490 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31491 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31492 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31493 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31494 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31495 }
31496
31497 // Split 512-bit integers on non 512-bit BWI targets.
31498 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31499 return splitVectorIntBinary(Op, DAG, DL);
31500
31501 assert(
31502 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31503 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31504 Subtarget.hasAVX2()) ||
31505 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31506 "Only vXi32/vXi16/vXi8 vector rotates supported");
31507
31508 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31509 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31510
31511 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31512 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31513
31514 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31515 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31516 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31517 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31518 int BaseRotAmtIdx = -1;
31519 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31520 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31521 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31522 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31523 }
31524 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31525 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31526 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31527 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31528 BaseRotAmtIdx, Subtarget, DAG);
31529 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31530 BaseRotAmtIdx, Subtarget, DAG);
31531 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31532 }
31533 }
31534
31535 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31536 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31537
31538 // Attempt to fold as unpack(x,x) << zext(y):
31539 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31540 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31541 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31542 if (!(ConstantAmt && EltSizeInBits != 8) &&
31543 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31544 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31545 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31546 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31547 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31548 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31549 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31550 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31551 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31552 }
31553
31554 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31555 // the amount bit.
31556 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31557 if (EltSizeInBits == 8) {
31558 MVT WideVT =
31559 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31560
31561 // Attempt to fold as:
31562 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31563 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31564 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31565 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31566 // If we're rotating by constant, just use default promotion.
31567 if (ConstantAmt)
31568 return SDValue();
31569 // See if we can perform this by widening to vXi16 or vXi32.
31570 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31571 R = DAG.getNode(
31572 ISD::OR, DL, WideVT, R,
31573 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31574 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31575 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31576 if (IsROTL)
31577 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31578 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31579 }
31580
31581 // We don't need ModuloAmt here as we just peek at individual bits.
31582 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31583 if (Subtarget.hasSSE41()) {
31584 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31585 // on the sign bit.
31586 V0 = DAG.getBitcast(VT, V0);
31587 V1 = DAG.getBitcast(VT, V1);
31588 Sel = DAG.getBitcast(VT, Sel);
31589 return DAG.getBitcast(SelVT,
31590 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31591 }
31592 // On pre-SSE41 targets we test for the sign bit by comparing to
31593 // zero - a negative value will set all bits of the lanes to true
31594 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31595 SDValue Z = DAG.getConstant(0, DL, SelVT);
31596 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31597 return DAG.getSelect(DL, SelVT, C, V0, V1);
31598 };
31599
31600 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31601 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31602 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31603 IsROTL = true;
31604 }
31605
31606 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31607 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31608
31609 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31610 // We can safely do this using i16 shifts as we're only interested in
31611 // the 3 lower bits of each byte.
31612 Amt = DAG.getBitcast(ExtVT, Amt);
31613 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31614 Amt = DAG.getBitcast(VT, Amt);
31615
31616 // r = VSELECT(r, rot(r, 4), a);
31617 SDValue M;
31618 M = DAG.getNode(
31619 ISD::OR, DL, VT,
31620 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31621 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31622 R = SignBitSelect(VT, Amt, M, R);
31623
31624 // a += a
31625 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31626
31627 // r = VSELECT(r, rot(r, 2), a);
31628 M = DAG.getNode(
31629 ISD::OR, DL, VT,
31630 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31631 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31632 R = SignBitSelect(VT, Amt, M, R);
31633
31634 // a += a
31635 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31636
31637 // return VSELECT(r, rot(r, 1), a);
31638 M = DAG.getNode(
31639 ISD::OR, DL, VT,
31640 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31641 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31642 return SignBitSelect(VT, Amt, M, R);
31643 }
31644
31645 bool IsSplatAmt = DAG.isSplatValue(Amt);
31646 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31647 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31648
31649 // Fallback for splats + all supported variable shifts.
31650 // Fallback for non-constants AVX2 vXi16 as well.
31651 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31652 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31653 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31654 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31655 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31656 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31657 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31658 }
31659
31660 // Everything below assumes ISD::ROTL.
31661 if (!IsROTL) {
31662 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31663 IsROTL = true;
31664 }
31665
31666 // ISD::ROT* uses modulo rotate amounts.
31667 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31668
31669 assert(IsROTL && "Only ROTL supported");
31670
31671 // As with shifts, attempt to convert the rotation amount to a multiplication
31672 // factor, fallback to general expansion.
31673 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31674 if (!Scale)
31675 return SDValue();
31676
31677 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31678 if (EltSizeInBits == 16) {
31679 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31680 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31681 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31682 }
31683
31684 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31685 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31686 // that can then be OR'd with the lower 32-bits.
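// Numerically (values for illustration only): rotl32(x, r) is the OR of the
// low and high halves of the 64-bit product x * (1 << r); e.g. for
// x = 0x80000001 and r = 1 the product is 0x1'00000002, so lo | hi =
// 0x00000002 | 0x00000001 = 0x00000003 = rotl(0x80000001, 1).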
31687 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31688 static const int OddMask[] = {1, 1, 3, 3};
31689 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31690 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31691
31692 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31693 DAG.getBitcast(MVT::v2i64, R),
31694 DAG.getBitcast(MVT::v2i64, Scale));
31695 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31696 DAG.getBitcast(MVT::v2i64, R13),
31697 DAG.getBitcast(MVT::v2i64, Scale13));
31698 Res02 = DAG.getBitcast(VT, Res02);
31699 Res13 = DAG.getBitcast(VT, Res13);
31700
31701 return DAG.getNode(ISD::OR, DL, VT,
31702 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31703 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31704}
31705
31706/// Returns true if the operand type is exactly twice the native width, and
31707/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31708/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31709/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31710bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31711 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31712
31713 if (OpWidth == 64)
31714 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31715 if (OpWidth == 128)
31716 return Subtarget.canUseCMPXCHG16B();
31717
31718 return false;
31719}
31720
31721TargetLoweringBase::AtomicExpansionKind
31722X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31723 Type *MemType = SI->getValueOperand()->getType();
31724
31725 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31726 !Subtarget.useSoftFloat()) {
31727 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31728 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31729 return AtomicExpansionKind::None;
31730
31731 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31732 Subtarget.hasAVX())
31733 return AtomicExpansionKind::None;
31734 }
31735
31736 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31737 : AtomicExpansionKind::None;
31738}
31739
31740// Note: this turns large loads into lock cmpxchg8b/16b.
31741TargetLoweringBase::AtomicExpansionKind
31742X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31743 Type *MemType = LI->getType();
31744
31745 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31746 !Subtarget.useSoftFloat()) {
31747 // If this is a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
31748 // can use movq to do the load. If we have X87 we can load into an 80-bit
31749 // X87 register and store it to a stack temporary.
31750 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31751 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31752 return AtomicExpansionKind::None;
31753
31754 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31755 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31756 Subtarget.hasAVX())
31757 return AtomicExpansionKind::None;
31758 }
31759
31760 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31761 : AtomicExpansionKind::None;
31762}
31763
31764enum BitTestKind : unsigned {
31765 UndefBit,
31766 ConstantBit,
31767 NotConstantBit,
31768 ShiftBit,
31769 NotShiftBit
31770};
31771
31772static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31773 using namespace llvm::PatternMatch;
31774 BitTestKind BTK = UndefBit;
31775 if (auto *C = dyn_cast<ConstantInt>(V)) {
31776 // Check if V is a power of 2 or NOT power of 2.
31777 if (isPowerOf2_64(C->getZExtValue()))
31778 BTK = ConstantBit;
31779 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31780 BTK = NotConstantBit;
31781 return {V, BTK};
31782 }
31783
31784 // Check if V is some power of 2 pattern known to be non-zero
31785 if (auto *I = dyn_cast<Instruction>(V)) {
31786 bool Not = false;
31787 // Check if we have a NOT
31788 Value *PeekI;
31789 if (match(I, m_Not(m_Value(PeekI))) ||
31790 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31791 Not = true;
31792 I = dyn_cast<Instruction>(PeekI);
31793
31794 // If I is constant, it will fold and we can evaluate later. If it's an
31795 // argument or something of that nature, we can't analyze.
31796 if (I == nullptr)
31797 return {nullptr, UndefBit};
31798 }
31799 // We can only use 1 << X without more sophisticated analysis. C << X where
31800 // C is a power of 2 but not 1 can result in zero which cannot be translated
31801 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31802 if (I->getOpcode() == Instruction::Shl) {
31803 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31804 // -X` and some other provable power of 2 patterns that we can use CTZ on
31805 // may be profitable.
31806 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31807 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31808 // be provably a non-zero power of 2.
31809 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31810 // transformable to bittest.
31811 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31812 if (!ShiftVal)
31813 return {nullptr, UndefBit};
31814 if (ShiftVal->equalsInt(1))
31815 BTK = Not ? NotShiftBit : ShiftBit;
31816
31817 if (BTK == UndefBit)
31818 return {nullptr, UndefBit};
31819
31820 Value *BitV = I->getOperand(1);
31821
31822 // Read past a shiftmask instruction to find count
31823 Value *AndOp;
31824 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31825 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31826 BitV = AndOp;
31827
31828 return {BitV, BTK};
31829 }
31830 }
31831 return {nullptr, UndefBit};
31832}
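// Illustrative sketch (not part of the upstream source): the kind of source
// pattern FindSingleBitChange is built to recognize. An atomic OR of a single
// dynamic bit whose result is only tested against that same bit can be
// selected as a locked BTS rather than a CMPXCHG loop, subject to the
// heuristics below. Names are made up for the example.
#include <atomic>
#include <cstdint>
static bool testAndSetBit(std::atomic<uint32_t> &Word, unsigned N) {
  uint32_t Bit = 1u << (N & 31);     // "1 << X" behind a shift mask
  uint32_t Old = Word.fetch_or(Bit); // atomicrmw or
  return (Old & Bit) != 0;           // single-bit AND of the result
}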
31833
31834TargetLowering::AtomicExpansionKind
31835X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31836 using namespace llvm::PatternMatch;
31837 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31838 // prefix to a normal instruction for these operations.
31839 if (AI->use_empty())
31840 return AtomicExpansionKind::None;
31841
31842 if (AI->getOperation() == AtomicRMWInst::Xor) {
31843 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31844 // preferable to both `cmpxchg` and `btc`.
31845 if (match(AI->getOperand(1), m_SignMask()))
31846 return AtomicExpansionKind::None;
31847 }
31848
31849 // If the atomicrmw's result is used by a single bit AND, we may use
31850 // bts/btr/btc instruction for these operations.
31851 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31852 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31853 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31854 // detect it.
31855 Instruction *I = AI->user_back();
31856 auto BitChange = FindSingleBitChange(AI->getValOperand());
31857 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31858 I->getOpcode() != Instruction::And ||
31859 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31860 AI->getParent() != I->getParent())
31861 return AtomicExpansionKind::CmpXChg;
31862
31863 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31864
31865 // This is a redundant AND, it should get cleaned up elsewhere.
31866 if (AI == I->getOperand(OtherIdx))
31867 return AtomicExpansionKind::CmpXChg;
31868
31869 // The following instruction must be an AND of a single bit.
31870 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31871 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31872 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31873 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31874 return AtomicExpansionKind::CmpXChg;
31875 }
31876 if (AI->getOperation() == AtomicRMWInst::And) {
31877 return ~C1->getValue() == C2->getValue()
31878 ? AtomicExpansionKind::BitTestIntrinsic
31879 : AtomicExpansionKind::CmpXChg;
31880 }
31881
31882 return AtomicExpansionKind::BitTestIntrinsic;
31883 }
31884
31885 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31886
31887 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31888 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31889 return AtomicExpansionKind::CmpXChg;
31890
31891 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31892
31893 // If shift amounts are not the same we can't use BitTestIntrinsic.
31894 if (BitChange.first != BitTested.first)
31895 return AtomicExpansionKind::CmpXChg;
31896
31897 // For atomic AND, the mask must have all but one bit set and the test must be
31898 // of the single bit that is unset in the mask.
31899 if (AI->getOperation() == AtomicRMWInst::And)
31900 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31901 ? AtomicExpansionKind::BitTestIntrinsic
31902 : AtomicExpansionKind::CmpXChg;
31903
31904 // For atomic XOR/OR, the operation must be setting and testing the same bit.
31905 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31906 ? AtomicExpansionKind::BitTestIntrinsic
31907 : AtomicExpansionKind::CmpXChg;
31908}
31909
31910void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31911 IRBuilder<> Builder(AI);
31912 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31913 Intrinsic::ID IID_C;
31914 Intrinsic::ID IID_I;
31915 switch (AI->getOperation()) {
31916 default:
31917 llvm_unreachable("Unknown atomic operation");
31918 case AtomicRMWInst::Or:
31919 IID_C = Intrinsic::x86_atomic_bts;
31920 IID_I = Intrinsic::x86_atomic_bts_rm;
31921 break;
31922 case AtomicRMWInst::Xor:
31923 IID_C = Intrinsic::x86_atomic_btc;
31924 IID_I = Intrinsic::x86_atomic_btc_rm;
31925 break;
31926 case AtomicRMWInst::And:
31927 IID_C = Intrinsic::x86_atomic_btr;
31928 IID_I = Intrinsic::x86_atomic_btr_rm;
31929 break;
31930 }
31931 Instruction *I = AI->user_back();
31932 LLVMContext &Ctx = AI->getContext();
31933 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31934 PointerType::getUnqual(Ctx));
31935 Value *Result = nullptr;
31936 auto BitTested = FindSingleBitChange(AI->getValOperand());
31937 assert(BitTested.first != nullptr);
31938
31939 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31940 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31941
31942 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31943 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31944 {Addr, Builder.getInt8(Imm)});
31945 } else {
31946 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31947
31948 Value *SI = BitTested.first;
31949 assert(SI != nullptr);
31950
31951 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
31952 // to mask it.
31953 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31954 Value *BitPos =
31955 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31956 // Todo(1): In many cases it may be provable that SI is less than
31957 // ShiftBits in which case this mask is unnecessary
31958 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31959 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31960 // favor of just a raw BT{S|R|C}.
31961
31962 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31963 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31964
31965 // If the result is only used for zero/non-zero status then we don't need to
31966 // shift the value back. Otherwise do so.
31967 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31968 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31969 if (ICmp->isEquality()) {
31970 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31971 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31972 if (C0 || C1) {
31973 assert(C0 == nullptr || C1 == nullptr);
31974 if ((C0 ? C0 : C1)->isZero())
31975 continue;
31976 }
31977 }
31978 }
31979 Result = Builder.CreateShl(Result, BitPos);
31980 break;
31981 }
31982 }
31983
31984 I->replaceAllUsesWith(Result);
31985 I->eraseFromParent();
31986 AI->eraseFromParent();
31987}
31988
31989static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31990 using namespace llvm::PatternMatch;
31991 if (!AI->hasOneUse())
31992 return false;
31993
31994 Value *Op = AI->getOperand(1);
31995 CmpPredicate Pred;
31996 Instruction *I = AI->user_back();
31997 AtomicRMWInst::BinOp Opc = AI->getOperation();
31998 if (Opc == AtomicRMWInst::Add) {
31999 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32000 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32001 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32002 if (match(I->user_back(),
32003 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32004 return true;
32005 if (match(I->user_back(),
32006 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32007 return true;
32008 }
32009 return false;
32010 }
32011 if (Opc == AtomicRMWInst::Sub) {
32012 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32013 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32014 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32015 if (match(I->user_back(),
32016 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32017 return true;
32018 if (match(I->user_back(),
32019 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32020 return true;
32021 }
32022 return false;
32023 }
32024 if ((Opc == AtomicRMWInst::Or &&
32025 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32026 (Opc == AtomicRMWInst::And &&
32027 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32028 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32029 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32030 Pred == CmpInst::ICMP_SLT;
32031 if (match(I->user_back(),
32032 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32033 return true;
32034 return false;
32035 }
32036 if (Opc == AtomicRMWInst::Xor) {
32037 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32038 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32039 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32040 if (match(I->user_back(),
32041 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32042 return true;
32043 if (match(I->user_back(),
32044 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32045 return true;
32046 }
32047 return false;
32048 }
32049
32050 return false;
32051}
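// Illustrative sketch (not part of the upstream source): a source pattern the
// predicate above accepts, where the atomicrmw result is only compared for
// equality. The comparison can then reuse the flags of a locked arithmetic
// instruction instead of forcing a CMPXCHG expansion. The helper name is made
// up for the example.
#include <atomic>
static bool releaseReference(std::atomic<int> &RefCount) {
  // atomicrmw sub followed by an equality compare against the subtracted value.
  return RefCount.fetch_sub(1) == 1; // true when this was the last reference
}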
32052
32053void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32054 AtomicRMWInst *AI) const {
32055 IRBuilder<> Builder(AI);
32056 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32057 Instruction *TempI = nullptr;
32058 LLVMContext &Ctx = AI->getContext();
32059 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32060 if (!ICI) {
32061 TempI = AI->user_back();
32062 assert(TempI->hasOneUse() && "Must have one use");
32063 ICI = cast<ICmpInst>(TempI->user_back());
32064 }
32065 X86::CondCode CC;
32066 ICmpInst::Predicate Pred = ICI->getPredicate();
32067 switch (Pred) {
32068 default:
32069 llvm_unreachable("Not supported Pred");
32070 case CmpInst::ICMP_EQ:
32071 CC = X86::COND_E;
32072 break;
32073 case CmpInst::ICMP_NE:
32074 CC = X86::COND_NE;
32075 break;
32076 case CmpInst::ICMP_SLT:
32077 CC = X86::COND_S;
32078 break;
32079 case CmpInst::ICMP_SGT:
32080 CC = X86::COND_NS;
32081 break;
32082 }
32083 Intrinsic::ID IID;
32084 switch (AI->getOperation()) {
32085 default:
32086 llvm_unreachable("Unknown atomic operation");
32087 case AtomicRMWInst::Add:
32088 IID = Intrinsic::x86_atomic_add_cc;
32089 break;
32090 case AtomicRMWInst::Sub:
32091 IID = Intrinsic::x86_atomic_sub_cc;
32092 break;
32093 case AtomicRMWInst::Or:
32094 IID = Intrinsic::x86_atomic_or_cc;
32095 break;
32096 case AtomicRMWInst::And:
32097 IID = Intrinsic::x86_atomic_and_cc;
32098 break;
32099 case AtomicRMWInst::Xor:
32100 IID = Intrinsic::x86_atomic_xor_cc;
32101 break;
32102 }
32103 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32104 PointerType::getUnqual(Ctx));
32105 Value *Call = Builder.CreateIntrinsic(
32106 IID, AI->getType(),
32107 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32108 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32109 ICI->replaceAllUsesWith(Result);
32110 ICI->eraseFromParent();
32111 if (TempI)
32112 TempI->eraseFromParent();
32113 AI->eraseFromParent();
32114}
32115
32116TargetLowering::AtomicExpansionKind
32117X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32118 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32119 Type *MemType = AI->getType();
32120
32121 // If the operand is too big, we must see if cmpxchg8/16b is available
32122 // and default to library calls otherwise.
32123 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32124 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32125 : AtomicExpansionKind::None;
32126 }
32127
32128 AtomicRMWInst::BinOp Op = AI->getOperation();
32129 switch (Op) {
32130 case AtomicRMWInst::Xchg:
32131 return AtomicExpansionKind::None;
32132 case AtomicRMWInst::Add:
32133 case AtomicRMWInst::Sub:
32134 if (shouldExpandCmpArithRMWInIR(AI))
32135 return AtomicExpansionKind::CmpArithIntrinsic;
32136 // It's better to use xadd, xsub or xchg for these in other cases.
32137 return AtomicExpansionKind::None;
32138 case AtomicRMWInst::Or:
32139 case AtomicRMWInst::And:
32140 case AtomicRMWInst::Xor:
32141 if (shouldExpandCmpArithRMWInIR(AI))
32142 return AtomicExpansionKind::CmpArithIntrinsic;
32143 return shouldExpandLogicAtomicRMWInIR(AI);
32144 case AtomicRMWInst::Nand:
32145 case AtomicRMWInst::Max:
32146 case AtomicRMWInst::Min:
32157 default:
32158 // These always require a non-trivial set of data operations on x86. We must
32159 // use a cmpxchg loop.
32160 return AtomicExpansionKind::CmpXChg;
32161 }
32162}
32163
32164LoadInst *
32165X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32166 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32167 Type *MemType = AI->getType();
32168 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32169 // there is no benefit in turning such RMWs into loads, and it is actually
32170 // harmful as it introduces a mfence.
32171 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32172 return nullptr;
32173
32174 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32175 // lowering available in lowerAtomicArith.
32176 // TODO: push more cases through this path.
32177 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32178 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32179 AI->use_empty())
32180 return nullptr;
32181
32182 IRBuilder<> Builder(AI);
32183 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32184 auto SSID = AI->getSyncScopeID();
32185 // We must restrict the ordering to avoid generating loads with Release or
32186 // ReleaseAcquire orderings.
32187 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32188
32189 // Before the load we need a fence. Here is an example lifted from
32190 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32191 // is required:
32192 // Thread 0:
32193 // x.store(1, relaxed);
32194 // r1 = y.fetch_add(0, release);
32195 // Thread 1:
32196 // y.fetch_add(42, acquire);
32197 // r2 = x.load(relaxed);
32198 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32199 // lowered to just a load without a fence. A mfence flushes the store buffer,
32200 // making the optimization clearly correct.
32201 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32202 // otherwise, we might be able to be more aggressive on relaxed idempotent
32203 // rmw. In practice, they do not look useful, so we don't try to be
32204 // especially clever.
32205
32206 // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
32207 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32208 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32209
32210 // Finally we can emit the atomic load.
32211 LoadInst *Loaded = Builder.CreateAlignedLoad(
32212 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32213 Loaded->setAtomic(Order, SSID);
32214 AI->replaceAllUsesWith(Loaded);
32215 AI->eraseFromParent();
32216 return Loaded;
32217}
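// Illustrative sketch (not part of the upstream source): the two-thread
// scenario from the comment above, written with std::atomic. If the idempotent
// fetch_add(0, release) were lowered to a plain load with no preceding fence,
// the forbidden outcome r1 == 0 && r2 == 0 would become observable.
#include <atomic>
namespace idempotent_rmw_example {
std::atomic<int> x{0}, y{0};
int r1, r2;
void thread0() {
  x.store(1, std::memory_order_relaxed);
  r1 = y.fetch_add(0, std::memory_order_release);
}
void thread1() {
  y.fetch_add(42, std::memory_order_acquire);
  r2 = x.load(std::memory_order_relaxed);
}
} // namespace idempotent_rmw_example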
32218
32219/// Emit a locked operation on a stack location which does not change any
32220/// memory location, but does involve a lock prefix. Location is chosen to be
32221/// a) very likely accessed only by a single thread to minimize cache traffic,
32222/// and b) definitely dereferenceable. Returns the new Chain result.
32223static SDValue emitLockedStackOp(SelectionDAG &DAG,
32224 const X86Subtarget &Subtarget, SDValue Chain,
32225 const SDLoc &DL) {
32226 // Implementation notes:
32227 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32228 // operations issued by the current processor. As such, the location
32229 // referenced is not relevant for the ordering properties of the instruction.
32230 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32231 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32232 // 2) Using an immediate operand appears to be the best encoding choice
32233 // here since it doesn't require an extra register.
32234 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32235 // is small enough it might just be measurement noise.)
32236 // 4) When choosing offsets, there are several contributing factors:
32237 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32238 // line aligned stack object to improve this case.)
32239 // b) To minimize our chances of introducing a false dependence, we prefer
32240 // to offset the stack usage from TOS slightly.
32241 // c) To minimize concerns about cross thread stack usage - in particular,
32242 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32243 // captures state in the TOS frame and accesses it from many threads -
32244 // we want to use an offset such that the offset is in a distinct cache
32245 // line from the TOS frame.
32246 //
32247 // For a general discussion of the tradeoffs and benchmark results, see:
32248 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32249
32250 auto &MF = DAG.getMachineFunction();
32251 auto &TFL = *Subtarget.getFrameLowering();
32252 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32253
32254 if (Subtarget.is64Bit()) {
32255 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32256 SDValue Ops[] = {
32257 DAG.getRegister(X86::RSP, MVT::i64), // Base
32258 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32259 DAG.getRegister(0, MVT::i64), // Index
32260 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32261 DAG.getRegister(0, MVT::i16), // Segment.
32262 Zero,
32263 Chain};
32264 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32265 MVT::Other, Ops);
32266 return SDValue(Res, 1);
32267 }
32268
32269 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32270 SDValue Ops[] = {
32271 DAG.getRegister(X86::ESP, MVT::i32), // Base
32272 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32273 DAG.getRegister(0, MVT::i32), // Index
32274 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32275 DAG.getRegister(0, MVT::i16), // Segment.
32276 Zero,
32277 Chain
32278 };
32279 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32280 MVT::Other, Ops);
32281 return SDValue(Res, 1);
32282}
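// Illustrative sketch (not part of the upstream source): roughly the sequence
// the code above materializes for the 64-bit case with a red zone, written as
// GCC-style inline assembly. Any LOCK-prefixed read-modify-write of a
// dereferenceable location is a full barrier, so it can stand in for MFENCE.
static inline void lockedStackFence() {
#if defined(__x86_64__)
  __asm__ __volatile__("lock orl $0, -64(%%rsp)" ::: "memory", "cc");
#endif
}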
32283
32284static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32285 SelectionDAG &DAG) {
32286 SDLoc dl(Op);
32287 AtomicOrdering FenceOrdering =
32288 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32289 SyncScope::ID FenceSSID =
32290 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32291
32292 // The only fence that needs an instruction is a sequentially-consistent
32293 // cross-thread fence.
32294 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32295 FenceSSID == SyncScope::System) {
32296 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32297 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32298
32299 SDValue Chain = Op.getOperand(0);
32300 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32301 }
32302
32303 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32304 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32305}
32306
32307static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32308 SelectionDAG &DAG) {
32309 MVT T = Op.getSimpleValueType();
32310 SDLoc DL(Op);
32311 unsigned Reg = 0;
32312 unsigned size = 0;
32313 switch(T.SimpleTy) {
32314 default: llvm_unreachable("Invalid value type!");
32315 case MVT::i8: Reg = X86::AL; size = 1; break;
32316 case MVT::i16: Reg = X86::AX; size = 2; break;
32317 case MVT::i32: Reg = X86::EAX; size = 4; break;
32318 case MVT::i64:
32319 assert(Subtarget.is64Bit() && "Node not type legal!");
32320 Reg = X86::RAX; size = 8;
32321 break;
32322 }
32323 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32324 Op.getOperand(2), SDValue());
32325 SDValue Ops[] = { cpIn.getValue(0),
32326 Op.getOperand(1),
32327 Op.getOperand(3),
32328 DAG.getTargetConstant(size, DL, MVT::i8),
32329 cpIn.getValue(1) };
32330 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32331 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32332 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32333 Ops, T, MMO);
32334
32335 SDValue cpOut =
32336 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32337 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32338 MVT::i32, cpOut.getValue(2));
32339 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32340
32341 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32342 cpOut, Success, EFLAGS.getValue(1));
32343}
32344
32345// Create MOVMSKB, taking into account whether we need to split for AVX1.
32346static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32347 const X86Subtarget &Subtarget) {
32348 MVT InVT = V.getSimpleValueType();
32349
32350 if (InVT == MVT::v64i8) {
32351 SDValue Lo, Hi;
32352 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32353 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32354 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32355 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32356 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32357 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32358 DAG.getConstant(32, DL, MVT::i8));
32359 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32360 }
32361 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32362 SDValue Lo, Hi;
32363 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32364 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32365 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32366 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32367 DAG.getConstant(16, DL, MVT::i8));
32368 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32369 }
32370
32371 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32372}
32373
32374static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32375 SelectionDAG &DAG) {
32376 SDValue Src = Op.getOperand(0);
32377 MVT SrcVT = Src.getSimpleValueType();
32378 MVT DstVT = Op.getSimpleValueType();
32379
32380 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32381 // half to v32i1 and concatenating the result.
32382 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32383 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32384 assert(Subtarget.hasBWI() && "Expected BWI target");
32385 SDLoc dl(Op);
32386 SDValue Lo, Hi;
32387 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32388 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32389 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32390 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32391 }
32392
32393 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32394 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32395 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32396 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32397 SDLoc DL(Op);
32398 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32399 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32400 return DAG.getZExtOrTrunc(V, DL, DstVT);
32401 }
32402
32403 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32404 SrcVT == MVT::i64) && "Unexpected VT!");
32405
32406 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32407 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32408 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32409 // This conversion needs to be expanded.
32410 return SDValue();
32411
32412 SDLoc dl(Op);
32413 if (SrcVT.isVector()) {
32414 // Widen the input vector in the case of MVT::v2i32.
32415 // Example: from MVT::v2i32 to MVT::v4i32.
32416 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32417 SrcVT.getVectorNumElements() * 2);
32418 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32419 DAG.getUNDEF(SrcVT));
32420 } else {
32421 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32422 "Unexpected source type in LowerBITCAST");
32423 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32424 }
32425
32426 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32427 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32428
32429 if (DstVT == MVT::x86mmx)
32430 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32431
32432 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32433 DAG.getVectorIdxConstant(0, dl));
32434}
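// Illustrative sketch (not part of the upstream source): scalar behaviour of
// the MOVMSK path used above for v16i1/v32i1 sources. One bit per byte (the
// sign bit after the sign extension) is packed into a scalar mask, avoiding
// an element-by-element bitcast. The helper name is made up for the example.
#include <cstdint>
static uint32_t moveByteMask(const uint8_t *Bytes, unsigned NumBytes) {
  uint32_t Mask = 0;
  for (unsigned I = 0; I != NumBytes; ++I)
    Mask |= uint32_t(Bytes[I] >> 7) << I; // MSB of each byte element
  return Mask;
}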
32435
32436/// Compute the horizontal sum of bytes in V for the elements of VT.
32437///
32438/// Requires V to be a byte vector and VT to be an integer vector type with
32439/// wider elements than V's type. The width of the elements of VT determines
32440/// how many bytes of V are summed horizontally to produce each element of the
32441/// result.
32442static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32443 const X86Subtarget &Subtarget,
32444 SelectionDAG &DAG) {
32445 SDLoc DL(V);
32446 MVT ByteVecVT = V.getSimpleValueType();
32447 MVT EltVT = VT.getVectorElementType();
32448 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32449 "Expected value to have byte element type.");
32450 assert(EltVT != MVT::i8 &&
32451 "Horizontal byte sum only makes sense for wider elements!");
32452 unsigned VecSize = VT.getSizeInBits();
32453 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32454
32455 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32456 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32457 if (EltVT == MVT::i64) {
32458 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32459 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32460 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32461 return DAG.getBitcast(VT, V);
32462 }
32463
32464 if (EltVT == MVT::i32) {
32465 // We unpack the low half and high half into i32s interleaved with zeros so
32466 // that we can use PSADBW to horizontally sum them. The most useful part of
32467 // this is that it lines up the results of two PSADBW instructions to be
32468 // two v2i64 vectors which concatenated are the 4 population counts. We can
32469 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32470 SDValue Zeros = DAG.getConstant(0, DL, VT);
32471 SDValue V32 = DAG.getBitcast(VT, V);
32472 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32473 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32474
32475 // Do the horizontal sums into two v2i64s.
32476 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32477 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32478 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32479 DAG.getBitcast(ByteVecVT, Low), Zeros);
32480 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32481 DAG.getBitcast(ByteVecVT, High), Zeros);
32482
32483 // Merge them together.
32484 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32485 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32486 DAG.getBitcast(ShortVecVT, Low),
32487 DAG.getBitcast(ShortVecVT, High));
32488
32489 return DAG.getBitcast(VT, V);
32490 }
32491
32492 // The only element type left is i16.
32493 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32494
32495 // To obtain pop count for each i16 element starting from the pop count for
32496 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32497 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32498 // directly supported.
32499 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32500 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32501 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32502 DAG.getBitcast(ByteVecVT, V));
32503 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32504}
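// Illustrative sketch (not part of the upstream source): scalar reference for
// the i16 element case handled last above. With per-byte pop counts packed in
// one 16-bit lane, the SHL-by-8 / byte-wise ADD / SRL-by-8 sequence leaves the
// sum of the two byte counts in that lane. A 16-bit add is enough here because
// each count is at most 8, so no carry crosses the byte boundary.
#include <cstdint>
static uint16_t sumBytePopCounts16(uint16_t PerByteCounts) {
  uint16_t Shifted = uint16_t(PerByteCounts << 8);    // ISD::SHL by 8
  uint16_t Added = uint16_t(Shifted + PerByteCounts); // byte-wise ISD::ADD
  return uint16_t(Added >> 8);                        // ISD::SRL by 8
}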
32505
32506static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32507 const X86Subtarget &Subtarget,
32508 SelectionDAG &DAG) {
32509 MVT VT = Op.getSimpleValueType();
32510 MVT EltVT = VT.getVectorElementType();
32511 int NumElts = VT.getVectorNumElements();
32512 (void)EltVT;
32513 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32514
32515 // Implement a lookup table in register by using an algorithm based on:
32516 // http://wm.ite.pl/articles/sse-popcount.html
32517 //
32518 // The general idea is that every lower byte nibble in the input vector is an
32519 // index into an in-register pre-computed pop count table. We then split up the
32520 // input vector in two new ones: (1) a vector with only the shifted-right
32521 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32522 // masked out higher ones) for each byte. PSHUFB is used separately with both
32523 // to index the in-register table. Next, both are added and the result is an
32524 // i8 vector where each element contains the pop count for the input byte.
32525 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32526 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32527 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32528 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32529
32530 SmallVector<SDValue, 64> LUTVec;
32531 for (int i = 0; i < NumElts; ++i)
32532 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32533 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32534 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32535
32536 // High nibbles
32537 SDValue FourV = DAG.getConstant(4, DL, VT);
32538 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32539
32540 // Low nibbles
32541 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32542
32543 // The input vector is used as the shuffle mask that index elements into the
32544 // LUT. After counting low and high nibbles, add the vector to obtain the
32545 // final pop count per i8 element.
32546 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32547 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32548 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32549}
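// Illustrative sketch (not part of the upstream source): scalar reference for
// the in-register LUT algorithm above. PSHUFB performs sixteen (or thirty-two)
// of these 4-bit table lookups at once, one per byte lane.
#include <cstdint>
static unsigned popCount8ViaNibbleLUT(uint8_t X) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return LUT[X >> 4] + LUT[X & 0xF]; // high-nibble count + low-nibble count
}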
32550
32551// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32552// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32553static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32554 const X86Subtarget &Subtarget,
32555 SelectionDAG &DAG) {
32556 MVT VT = Op.getSimpleValueType();
32557 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32558 "Unknown CTPOP type to handle");
32559 SDValue Op0 = Op.getOperand(0);
32560
32561 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32562 if (Subtarget.hasVPOPCNTDQ()) {
32563 unsigned NumElems = VT.getVectorNumElements();
32564 assert((VT.getVectorElementType() == MVT::i8 ||
32565 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32566 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32567 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32568 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32569 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32570 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32571 }
32572 }
32573
32574 // Decompose 256-bit ops into smaller 128-bit ops.
32575 if (VT.is256BitVector() && !Subtarget.hasInt256())
32576 return splitVectorIntUnary(Op, DAG, DL);
32577
32578 // Decompose 512-bit ops into smaller 256-bit ops.
32579 if (VT.is512BitVector() && !Subtarget.hasBWI())
32580 return splitVectorIntUnary(Op, DAG, DL);
32581
32582 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32583 if (VT.getScalarType() != MVT::i8) {
32584 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32585 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32586 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32587 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32588 }
32589
32590 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32591 if (!Subtarget.hasSSSE3())
32592 return SDValue();
32593
32594 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32595}
32596
32597static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32598 SelectionDAG &DAG) {
32599 MVT VT = N.getSimpleValueType();
32600 SDValue Op = N.getOperand(0);
32601 SDLoc DL(N);
32602
32603 if (VT.isScalarInteger()) {
32604 // Compute the lower/upper bounds of the active bits of the value,
32605 // allowing us to shift the active bits down if necessary to fit into the
32606 // special cases below.
32607 KnownBits Known = DAG.computeKnownBits(Op);
32608 if (Known.isConstant())
32609 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32610 unsigned LZ = Known.countMinLeadingZeros();
32611 unsigned TZ = Known.countMinTrailingZeros();
32612 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32613 unsigned ActiveBits = Known.getBitWidth() - LZ;
32614 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32615
32616 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32617 if (ShiftedActiveBits <= 2) {
32618 if (ActiveBits > 2)
32619 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32620 DAG.getShiftAmountConstant(TZ, VT, DL));
32621 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32622 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32623 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32624 DAG.getShiftAmountConstant(1, VT, DL)));
32625 return DAG.getZExtOrTrunc(Op, DL, VT);
32626 }
32627
32628 // i3 CTPOP - perform LUT into i32 integer.
32629 if (ShiftedActiveBits <= 3) {
32630 if (ActiveBits > 3)
32631 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32632 DAG.getShiftAmountConstant(TZ, VT, DL));
32633 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32634 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32635 DAG.getShiftAmountConstant(1, VT, DL));
32636 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32637 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32638 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32639 DAG.getConstant(0x3, DL, MVT::i32));
32640 return DAG.getZExtOrTrunc(Op, DL, VT);
32641 }
32642
32643 // i4 CTPOP - perform LUT into i64 integer.
32644 if (ShiftedActiveBits <= 4 &&
32645 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32646 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32647 if (ActiveBits > 4)
32648 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32649 DAG.getShiftAmountConstant(TZ, VT, DL));
32650 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32651 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32652 DAG.getConstant(4, DL, MVT::i32));
32653 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32654 DAG.getShiftAmountOperand(MVT::i64, Op));
32655 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32656 DAG.getConstant(0x7, DL, MVT::i64));
32657 return DAG.getZExtOrTrunc(Op, DL, VT);
32658 }
32659
32660 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
32661 if (ShiftedActiveBits <= 8) {
32662 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32663 if (ActiveBits > 8)
32664 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32665 DAG.getShiftAmountConstant(TZ, VT, DL));
32666 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32667 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32668 DAG.getConstant(0x08040201U, DL, MVT::i32));
32669 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32670 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32671 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32672 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32673 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32674 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32675 return DAG.getZExtOrTrunc(Op, DL, VT);
32676 }
32677
32678 return SDValue(); // fallback to generic expansion.
32679 }
32680
32681 assert(VT.isVector() &&
32682 "We only do custom lowering for vector population count.");
32683 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32684}
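// Illustrative sketch (not part of the upstream source): the scalar LUT tricks
// used above for values with at most 3 or 4 active bits. The per-input results
// are packed into an immediate that is shifted right by a multiple of the
// input and then masked.
#include <cstdint>
static unsigned popCountUpTo3Bits(unsigned X) { // X in [0, 7]
  return (0b1110100110010100u >> (2 * X)) & 0x3;
}
static unsigned popCountUpTo4Bits(unsigned X) { // X in [0, 15]
  return unsigned(0x4332322132212110ULL >> (4 * X)) & 0x7;
}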
32685
32686static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32687 MVT VT = Op.getSimpleValueType();
32688 SDValue In = Op.getOperand(0);
32689 SDLoc DL(Op);
32690
32691 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32692 // perform the BITREVERSE.
32693 if (!VT.isVector()) {
32694 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32695 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32696 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32697 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32698 DAG.getVectorIdxConstant(0, DL));
32699 }
32700
32701 int NumElts = VT.getVectorNumElements();
32702 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32703
32704 // Decompose 256-bit ops into smaller 128-bit ops.
32705 if (VT.is256BitVector())
32706 return splitVectorIntUnary(Op, DAG, DL);
32707
32708 assert(VT.is128BitVector() &&
32709 "Only 128-bit vector bitreverse lowering supported.");
32710
32711 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32712 // perform the BSWAP in the shuffle.
32713 // It's best to shuffle using the second operand as this will implicitly allow
32714 // memory folding for multiple vectors.
32715 SmallVector<SDValue, 16> MaskElts;
32716 for (int i = 0; i != NumElts; ++i) {
32717 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32718 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32719 int PermuteByte = SourceByte | (2 << 5);
32720 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32721 }
32722 }
32723
32724 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32725 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32726 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32727 Res, Mask);
32728 return DAG.getBitcast(VT, Res);
32729}
32730
32731static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32732 SelectionDAG &DAG) {
32733 MVT VT = Op.getSimpleValueType();
32734
32735 if (Subtarget.hasXOP() && !VT.is512BitVector())
32736 return LowerBITREVERSE_XOP(Op, DAG);
32737
32738 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32739 "SSSE3 or GFNI required for BITREVERSE");
32740
32741 SDValue In = Op.getOperand(0);
32742 SDLoc DL(Op);
32743
32744 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32745 if (VT.is512BitVector() && !Subtarget.hasBWI())
32746 return splitVectorIntUnary(Op, DAG, DL);
32747
32748 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32749 if (VT.is256BitVector() && !Subtarget.hasInt256())
32750 return splitVectorIntUnary(Op, DAG, DL);
32751
32752 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32753 if (!VT.isVector()) {
32754 assert(
32755 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32756 "Only tested for i8/i16/i32/i64");
32757 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32758 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32759 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32760 DAG.getBitcast(MVT::v16i8, Res));
32761 Res =
32762 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32763 DAG.getVectorIdxConstant(0, DL));
32764 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32765 }
32766
32767 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32768
32769 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32770 if (VT.getScalarType() != MVT::i8) {
32771 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32772 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32773 Res = DAG.getBitcast(ByteVT, Res);
32774 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32775 return DAG.getBitcast(VT, Res);
32776 }
32777 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32778 "Only byte vector BITREVERSE supported");
32779
32780 unsigned NumElts = VT.getVectorNumElements();
32781
32782 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32783 if (Subtarget.hasGFNI()) {
32784 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32785 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32786 DAG.getTargetConstant(0, DL, MVT::i8));
32787 }
32788
32789 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32790 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32791 // 0-15 value (moved to the other nibble).
32792 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32793 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32794 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32795
32796 const int LoLUT[16] = {
32797 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32798 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32799 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32800 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32801 const int HiLUT[16] = {
32802 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32803 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32804 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32805 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32806
32807 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32808 for (unsigned i = 0; i < NumElts; ++i) {
32809 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32810 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32811 }
32812
32813 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32814 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32815 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32816 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32817 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32818}
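// Illustrative sketch (not part of the upstream source): scalar reference for
// the PSHUFB-based byte reversal above. The low-nibble table already places
// the reversed bits in the high nibble (and vice versa), so one OR combines
// the two lookups into the bit-reversed byte.
#include <cstdint>
static uint8_t reverseByteViaNibbleLUT(uint8_t X) {
  static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0,
                                    0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0,
                                    0x30, 0xB0, 0x70, 0xF0};
  static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A,
                                    0x06, 0x0E, 0x01, 0x09, 0x05, 0x0D,
                                    0x03, 0x0B, 0x07, 0x0F};
  return LoLUT[X & 0xF] | HiLUT[X >> 4];
}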
32819
32820static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32821 SelectionDAG &DAG) {
32822 SDLoc DL(Op);
32823 SDValue X = Op.getOperand(0);
32824 MVT VT = Op.getSimpleValueType();
32825
32826 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32827 if (VT == MVT::i8 ||
32828 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32829 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32830 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32831 DAG.getConstant(0, DL, MVT::i8));
32832 // Copy the inverse of the parity flag into a register with setcc.
32833 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32834 // Extend to the original type.
32835 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32836 }
32837
32838 // If we have POPCNT, use the default expansion.
32839 if (Subtarget.hasPOPCNT())
32840 return SDValue();
32841
32842 if (VT == MVT::i64) {
32843 // Xor the high and low 32-bits together using a 32-bit operation.
32844 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32845 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32846 DAG.getConstant(32, DL, MVT::i8)));
32847 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32848 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32849 }
32850
32851 if (VT != MVT::i16) {
32852 // Xor the high and low 16-bits together using a 32-bit operation.
32853 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32854 DAG.getConstant(16, DL, MVT::i8));
32855 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32856 } else {
32857 // If the input is 16-bits, we need to extend to use an i32 shift below.
32858 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32859 }
32860
32861 // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
32862 // This should allow an h-reg to be used to save a shift.
32863 SDValue Hi = DAG.getNode(
32864 ISD::TRUNCATE, DL, MVT::i8,
32865 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32866 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32867 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32868 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32869
32870 // Copy the inverse of the parity flag into a register with setcc.
32871 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32872 // Extend to the original type.
32873 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32874}
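// Illustrative sketch (not part of the upstream source): the scalar folding
// performed above. XOR preserves parity, so a wide parity can be folded down
// to a single byte, after which one byte-sized test (the PF flag in the
// lowering above) decides the result.
#include <cstdint>
static unsigned parity32ViaFolding(uint32_t X) {
  X ^= X >> 16;                            // fold the high half into the low half
  X ^= X >> 8;                             // fold the remaining two bytes
  return __builtin_popcount(X & 0xFF) & 1; // parity of the final byte
}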
32875
32876static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32877 const X86Subtarget &Subtarget) {
32878 unsigned NewOpc = 0;
32879 switch (N->getOpcode()) {
32880 case ISD::ATOMIC_LOAD_ADD:
32881 NewOpc = X86ISD::LADD;
32882 break;
32883 case ISD::ATOMIC_LOAD_SUB:
32884 NewOpc = X86ISD::LSUB;
32885 break;
32886 case ISD::ATOMIC_LOAD_OR:
32887 NewOpc = X86ISD::LOR;
32888 break;
32889 case ISD::ATOMIC_LOAD_XOR:
32890 NewOpc = X86ISD::LXOR;
32891 break;
32892 case ISD::ATOMIC_LOAD_AND:
32893 NewOpc = X86ISD::LAND;
32894 break;
32895 default:
32896 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32897 }
32898
32899 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32900
32901 return DAG.getMemIntrinsicNode(
32902 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32903 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32904 /*MemVT=*/N->getSimpleValueType(0), MMO);
32905}
32906
32907/// Lower atomic_load_ops into LOCK-prefixed operations.
32908static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32909 const X86Subtarget &Subtarget) {
32910 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32911 SDValue Chain = N->getOperand(0);
32912 SDValue LHS = N->getOperand(1);
32913 SDValue RHS = N->getOperand(2);
32914 unsigned Opc = N->getOpcode();
32915 MVT VT = N->getSimpleValueType(0);
32916 SDLoc DL(N);
32917
32918 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32919 // can only be lowered when the result is unused. They should have already
32920 // been transformed into a cmpxchg loop in AtomicExpand.
32921 if (N->hasAnyUseOfValue(0)) {
32922 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32923 // select LXADD if LOCK_SUB can't be selected.
32924 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32925 // can use LXADD as opposed to cmpxchg.
32926 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32927 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32928 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32929 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32930
32931 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32932 "Used AtomicRMW ops other than Add should have been expanded!");
32933 return N;
32934 }
32935
32936 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32937 // The core idea here is that since the memory location isn't actually
32938 // changing, all we need is a lowering for the *ordering* impacts of the
32939 // atomicrmw. As such, we can choose a different operation and memory
32940 // location to minimize impact on other code.
32941 // The above holds unless the node is marked volatile in which
32942 // case it needs to be preserved according to the langref.
32943 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32944 // On X86, the only ordering which actually requires an instruction is
32945 // seq_cst which isn't SingleThread, everything just needs to be preserved
32946 // during codegen and then dropped. Note that we expect (but don't assume),
32947 // that orderings other than seq_cst and acq_rel have been canonicalized to
32948 // a store or load.
32949 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32950 AN->getSyncScopeID() == SyncScope::System) {
32951 // Prefer a locked operation against a stack location to minimize cache
32952 // traffic. This assumes that stack locations are very likely to be
32953 // accessed only by the owning thread.
32954 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32955 assert(!N->hasAnyUseOfValue(0));
32956 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32957 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32958 DAG.getUNDEF(VT), NewChain);
32959 }
32960 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32961 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32962 assert(!N->hasAnyUseOfValue(0));
32963 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32964 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32965 DAG.getUNDEF(VT), NewChain);
32966 }
32967
32968 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32969 // RAUW the chain, but don't worry about the result, as it's unused.
32970 assert(!N->hasAnyUseOfValue(0));
32971 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32972 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32973 DAG.getUNDEF(VT), LockOp.getValue(1));
32974}
32975
32976static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32977 const X86Subtarget &Subtarget) {
32978 auto *Node = cast<AtomicSDNode>(Op.getNode());
32979 SDLoc dl(Node);
32980 EVT VT = Node->getMemoryVT();
32981
32982 bool IsSeqCst =
32983 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32984 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32985
32986 // If this store is not sequentially consistent and the type is legal
32987 // we can just keep it.
32988 if (!IsSeqCst && IsTypeLegal)
32989 return Op;
32990
32991 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32992 !DAG.getMachineFunction().getFunction().hasFnAttribute(
32993 Attribute::NoImplicitFloat)) {
32994 SDValue Chain;
32995 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32996 // vector store.
32997 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
32998 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32999 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33000 Node->getMemOperand());
33001 }
33002
33003 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33004 // is enabled.
33005 if (VT == MVT::i64) {
33006 if (Subtarget.hasSSE1()) {
33007 SDValue SclToVec =
33008 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33009 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33010 SclToVec = DAG.getBitcast(StVT, SclToVec);
33011 SDVTList Tys = DAG.getVTList(MVT::Other);
33012 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33013 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33014 MVT::i64, Node->getMemOperand());
33015 } else if (Subtarget.hasX87()) {
33016 // First load this into an 80-bit X87 register using a stack temporary.
33017 // This will put the whole integer into the significand.
33018 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33019 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33020 MachinePointerInfo MPI =
33021 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33022 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33023 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33024 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33025 SDValue LdOps[] = {Chain, StackPtr};
33026 SDValue Value = DAG.getMemIntrinsicNode(
33027 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33028 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33029 Chain = Value.getValue(1);
33030
33031 // Now use an FIST to do the atomic store.
33032 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33033 Chain =
33034 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33035 StoreOps, MVT::i64, Node->getMemOperand());
33036 }
33037 }
33038
33039 if (Chain) {
33040 // If this is a sequentially consistent store, also emit an appropriate
33041 // barrier.
33042 if (IsSeqCst)
33043 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33044
33045 return Chain;
33046 }
33047 }
33048
33049 // Convert seq_cst store -> xchg
33050 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33051 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33052 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33053 Node->getOperand(0), Node->getOperand(2),
33054 Node->getOperand(1), Node->getMemOperand());
33055 return Swap.getValue(1);
33056}
33057
33058static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33059 SDNode *N = Op.getNode();
33060 MVT VT = N->getSimpleValueType(0);
33061 unsigned Opc = Op.getOpcode();
33062
33063 // Let legalize expand this if it isn't a legal type yet.
33064 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33065 return SDValue();
33066
33067 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33068 SDLoc DL(N);
33069
33070 // Set the carry flag.
33071 SDValue Carry = Op.getOperand(2);
33072 EVT CarryVT = Carry.getValueType();
33073 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33074 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33075
33076 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33077 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33078 Op.getOperand(0), Op.getOperand(1),
33079 Carry.getValue(1));
33080
33081 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33082 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33083 Sum.getValue(1), DL, DAG);
33084 if (N->getValueType(1) == MVT::i1)
33085 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33086
33087 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33088}
33089
33090static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33091 SelectionDAG &DAG) {
33092 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33093
33094 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33095 // which returns the values as { float, float } (in XMM0) or
33096 // { double, double } (which is returned in XMM0, XMM1).
33097 SDLoc dl(Op);
33098 SDValue Arg = Op.getOperand(0);
33099 EVT ArgVT = Arg.getValueType();
33100 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33101
33103 Args.emplace_back(Arg, ArgTy);
33104
33105 bool isF64 = ArgVT == MVT::f64;
33106 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33107 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33108 // the results are returned via SRet in memory.
33109 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33110 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33111 const char *LibcallName = TLI.getLibcallName(LC);
33112 SDValue Callee =
33113 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33114
33115 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33116 : (Type *)FixedVectorType::get(ArgTy, 4);
33117
33118 TargetLowering::CallLoweringInfo CLI(DAG);
33119 CLI.setDebugLoc(dl)
33120 .setChain(DAG.getEntryNode())
33121 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33122
33123 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33124
33125 if (isF64)
33126 // Returned in xmm0 and xmm1.
33127 return CallResult.first;
33128
33129 // Returned in bits 0:31 and 32:64 xmm0.
33130 SDValue SinVal =
33131 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33132 DAG.getVectorIdxConstant(0, dl));
33133 SDValue CosVal =
33134 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33135 DAG.getVectorIdxConstant(1, dl));
33136 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33137 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33138}
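// Illustrative sketch (not part of the upstream source): the source-level
// computation this lowering serves. When sine and cosine of the same argument
// are both needed, the pair of libm calls can be folded into one
// __sincos_stret call on Darwin x86-64, with both results coming back in
// vector registers as described above. The helper name is made up for the
// example.
#include <cmath>
static void sinAndCos(double X, double &Sin, double &Cos) {
  Sin = std::sin(X);
  Cos = std::cos(X);
}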
33139
33140/// Widen a vector input to a vector of NVT. The
33141/// input vector must have the same element type as NVT.
33142static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33143 bool FillWithZeroes = false) {
33144 // Check if InOp already has the right width.
33145 MVT InVT = InOp.getSimpleValueType();
33146 if (InVT == NVT)
33147 return InOp;
33148
33149 if (InOp.isUndef())
33150 return DAG.getUNDEF(NVT);
33151
33152 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33153 "input and widen element type must match");
33154
33155 unsigned InNumElts = InVT.getVectorNumElements();
33156 unsigned WidenNumElts = NVT.getVectorNumElements();
33157 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33158 "Unexpected request for vector widening");
33159
33160 SDLoc dl(InOp);
33161 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33162 SDValue N1 = InOp.getOperand(1);
33163 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33164 N1.isUndef()) {
33165 InOp = InOp.getOperand(0);
33166 InVT = InOp.getSimpleValueType();
33167 InNumElts = InVT.getVectorNumElements();
33168 }
33169 }
33170 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33171 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33172 EVT EltVT = InOp.getOperand(0).getValueType();
33173 SDValue FillVal =
33174 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33175 SmallVector<SDValue, 16> Ops(InOp->ops());
33176 Ops.append(WidenNumElts - InNumElts, FillVal);
33177 return DAG.getBuildVector(NVT, dl, Ops);
33178 }
33179 SDValue FillVal =
33180 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33181 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33182 DAG.getVectorIdxConstant(0, dl));
33183}
33184
33185static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33186 SelectionDAG &DAG) {
33187 assert(Subtarget.hasAVX512() &&
33188 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33189
33190 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33191 SDValue Src = N->getValue();
33192 MVT VT = Src.getSimpleValueType();
33193 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33194 SDLoc dl(Op);
33195
33196 SDValue Scale = N->getScale();
33197 SDValue Index = N->getIndex();
33198 SDValue Mask = N->getMask();
33199 SDValue Chain = N->getChain();
33200 SDValue BasePtr = N->getBasePtr();
33201
33202 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33203 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33204 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33205 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33206 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33207 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33208 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33209 SDVTList VTs = DAG.getVTList(MVT::Other);
33210 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33211 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33212 N->getMemoryVT(), N->getMemOperand());
33213 }
33214 return SDValue();
33215 }
33216
33217 MVT IndexVT = Index.getSimpleValueType();
33218
33219 // If the index is v2i32, we're being called by type legalization and we
33220 // should just let the default handling take care of it.
33221 if (IndexVT == MVT::v2i32)
33222 return SDValue();
33223
33224 // If we don't have VLX and neither the data nor the index is 512 bits, we
33225 // need to widen until one is.
33226 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33227 !Index.getSimpleValueType().is512BitVector()) {
33228 // Determine how much we need to widen by to get a 512-bit type.
33229 unsigned Factor = std::min(512/VT.getSizeInBits(),
33230 512/IndexVT.getSizeInBits());
33231 unsigned NumElts = VT.getVectorNumElements() * Factor;
33232
33233 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33234 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33235 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33236
33237 Src = ExtendToType(Src, VT, DAG);
33238 Index = ExtendToType(Index, IndexVT, DAG);
33239 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33240 }
33241
33242 SDVTList VTs = DAG.getVTList(MVT::Other);
33243 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33244 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33245 N->getMemoryVT(), N->getMemOperand());
33246}
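// Illustrative sketch (assumed types): on AVX-512 without VLX, a v4f32 scatter
// with a v4i32 index takes the widening path above with
// Factor = min(512/128, 512/128) = 4 and NumElts = 16, so the data becomes
// v16f32 (undef-padded), the index v16i32, and the mask a zero-padded v16i1,
// which keeps the extra lanes from storing anything.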
33247
33248static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33249 SelectionDAG &DAG) {
33250
33251 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33252 MVT VT = Op.getSimpleValueType();
33253 MVT ScalarVT = VT.getScalarType();
33254 SDValue Mask = N->getMask();
33255 MVT MaskVT = Mask.getSimpleValueType();
33256 SDValue PassThru = N->getPassThru();
33257 SDLoc dl(Op);
33258
33259 // Handle AVX masked loads which don't support passthru other than 0.
33260 if (MaskVT.getVectorElementType() != MVT::i1) {
33261 // We also allow undef in the isel pattern.
33262 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33263 return Op;
33264
33265 SDValue NewLoad = DAG.getMaskedLoad(
33266 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33267 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33268 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33269 N->isExpandingLoad());
33270 // Emit a blend.
33271 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33272 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33273 }
33274
33275 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33276 "Expanding masked load is supported on AVX-512 target only!");
33277
33278 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33279 "Expanding masked load is supported for 32 and 64-bit types only!");
33280
33281 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33282 "Cannot lower masked load op.");
33283
33284 assert((ScalarVT.getSizeInBits() >= 32 ||
33285 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33286 ScalarVT == MVT::f16))) &&
33287 "Unsupported masked load op.");
33288
33289 // This operation is legal for targets with VLX, but without
33290 // VLX the vector should be widened to 512 bits.
33291 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33292 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33293 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33294
33295 // Mask element has to be i1.
33296 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33297 "Unexpected mask type");
33298
33299 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33300
33301 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33302 SDValue NewLoad = DAG.getMaskedLoad(
33303 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33304 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33305 N->getExtensionType(), N->isExpandingLoad());
33306
33307 SDValue Extract =
33308 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33309 DAG.getVectorIdxConstant(0, dl));
33310 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33311 return DAG.getMergeValues(RetOps, dl);
33312}
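// Illustrative sketch (assumed types): on AVX2, a masked load of v8f32 with a
// v8i32 mask and a passthru that is neither undef nor zero is re-emitted with
// a zero passthru and then blended, i.e. roughly
//   %load = masked.load(..., zeroinitializer); %res = vselect %mask, %load, %passthru
// which lines up with VMASKMOVPS returning zero in masked-off lanes.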
33313
33314static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33315 SelectionDAG &DAG) {
33316 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33317 SDValue DataToStore = N->getValue();
33318 MVT VT = DataToStore.getSimpleValueType();
33319 MVT ScalarVT = VT.getScalarType();
33320 SDValue Mask = N->getMask();
33321 SDLoc dl(Op);
33322
33323 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33324 "Expanding masked load is supported on AVX-512 target only!");
33325
33326 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33327 "Expanding masked load is supported for 32 and 64-bit types only!");
33328
33329 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33330 "Cannot lower masked store op.");
33331
33332 assert((ScalarVT.getSizeInBits() >= 32 ||
33333 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33334 ScalarVT == MVT::f16))) &&
33335 "Unsupported masked store op.");
33336
33337 // This operation is legal for targets with VLX, but without
33338 // VLX the vector should be widened to 512 bits.
33339 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33340 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33341
33342 // Mask element has to be i1.
33343 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33344 "Unexpected mask type");
33345
33346 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33347
33348 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33349 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33350 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33351 N->getOffset(), Mask, N->getMemoryVT(),
33352 N->getMemOperand(), N->getAddressingMode(),
33353 N->isTruncatingStore(), N->isCompressingStore());
33354}
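// Illustrative sketch (assumed types): with AVX-512 but no VLX, a v8i32 masked
// store is widened to v16i32 data (undef-padded) with a v16i1 mask whose upper
// eight bits are zero, so only the original eight lanes are written.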
33355
33356static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33357 SelectionDAG &DAG) {
33358 assert(Subtarget.hasAVX2() &&
33359 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33360
33361 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33362 SDLoc dl(Op);
33363 MVT VT = Op.getSimpleValueType();
33364 SDValue Index = N->getIndex();
33365 SDValue Mask = N->getMask();
33366 SDValue PassThru = N->getPassThru();
33367 MVT IndexVT = Index.getSimpleValueType();
33368
33369 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33370
33371 // If the index is v2i32, we're being called by type legalization.
33372 if (IndexVT == MVT::v2i32)
33373 return SDValue();
33374
33375 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33376 // need to widen until one is.
33377 MVT OrigVT = VT;
33378 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33379 !IndexVT.is512BitVector()) {
33380 // Determine how much we need to widen by to get a 512-bit type.
33381 unsigned Factor = std::min(512/VT.getSizeInBits(),
33382 512/IndexVT.getSizeInBits());
33383
33384 unsigned NumElts = VT.getVectorNumElements() * Factor;
33385
33386 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33387 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33388 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33389
33390 PassThru = ExtendToType(PassThru, VT, DAG);
33391 Index = ExtendToType(Index, IndexVT, DAG);
33392 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33393 }
33394
33395 // Break dependency on the data register.
33396 if (PassThru.isUndef())
33397 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33398
33399 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33400 N->getScale() };
33401 SDValue NewGather = DAG.getMemIntrinsicNode(
33402 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33403 N->getMemOperand());
33404 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33405 DAG.getVectorIdxConstant(0, dl));
33406 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33407}
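// Illustrative sketch (assumed types): a v4i32 gather with a v4i32 index on
// AVX-512 without VLX is widened to a v16i32 gather with a zero-padded v16i1
// mask, and the original v4i32 result is recovered afterwards with
// (extract_subvector %wide_gather, 0).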
33408
33409static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33410 SDLoc dl(Op);
33411 SDValue Src = Op.getOperand(0);
33412 MVT DstVT = Op.getSimpleValueType();
33413
33414 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33415 unsigned SrcAS = N->getSrcAddressSpace();
33416
33417 assert(SrcAS != N->getDestAddressSpace() &&
33418 "addrspacecast must be between different address spaces");
33419
33420 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33421 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33422 } else if (DstVT == MVT::i64) {
33423 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33424 } else if (DstVT == MVT::i32) {
33425 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33426 } else {
33427 report_fatal_error("Bad address space in addrspacecast");
33428 }
33429 return Op;
33430}
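// Illustrative sketch: a cast from the 32-bit unsigned pointer address space
// (X86AS::PTR32_UPTR, typically an MS-style __ptr32 __uptr pointer) to a
// 64-bit pointer becomes a zero_extend, the signed 32-bit flavour takes the
// sign_extend path, and any cast down to a 32-bit pointer is a truncate.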
33431
33432SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33433 SelectionDAG &DAG) const {
33434 // TODO: Eventually, the lowering of these nodes should be informed by or
33435 // deferred to the GC strategy for the function in which they appear. For
33436 // now, however, they must be lowered to something. Since they are logically
33437 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33438 // require special handling for these nodes), lower them as literal NOOPs for
33439 // the time being.
33440 SmallVector<SDValue, 2> Ops;
33441 Ops.push_back(Op.getOperand(0));
33442 if (Op->getGluedNode())
33443 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33444
33445 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33446 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33447}
33448
33449// Custom split CVTPS2PH with wide types.
33450static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33451 SDLoc dl(Op);
33452 EVT VT = Op.getValueType();
33453 SDValue Lo, Hi;
33454 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33455 EVT LoVT, HiVT;
33456 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33457 SDValue RC = Op.getOperand(1);
33458 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33459 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33460 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33461}
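// Illustrative sketch (assumed types): an X86ISD::CVTPS2PH with a wide (e.g.
// 512-bit) source is split into two halves, each half converted with the same
// rounding-control operand, and the two results concatenated back together.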
33462
33463static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33464 SelectionDAG &DAG) {
33465 unsigned IsData = Op.getConstantOperandVal(4);
33466
33467 // We don't support non-data prefetch without PREFETCHI.
33468 // Just preserve the chain.
33469 if (!IsData && !Subtarget.hasPREFETCHI())
33470 return Op.getOperand(0);
33471
33472 return Op;
33473}
33474
33475static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33476 SDNode *N = Op.getNode();
33477 SDValue Operand = N->getOperand(0);
33478 EVT VT = Operand.getValueType();
33479 SDLoc dl(N);
33480
33481 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33482
33483 // TODO: Fix crash for bf16 when generating strict_fmul, as it
33484 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33485 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33486 // promote this operator's result!
33487 SDValue Chain = DAG.getEntryNode();
33488 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33489 {Chain, Operand, One});
33490 return StrictFmul;
33491}
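// Illustrative sketch: llvm.canonicalize(%x) is lowered as a strict multiply
// by 1.0, i.e. (strict_fmul chain, %x, 1.0), which is a canonicalizing
// operation and so quiets signaling NaNs while leaving other values unchanged
// under the default FP environment.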
33492
33493static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33494 unsigned OpNo) {
33495 const APInt Operand(32, OpNo);
33496 std::string OpNoStr = llvm::toString(Operand, 10, false);
33497 std::string Str(" $");
33498
33499 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33500 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33501
33502 auto I = StringRef::npos;
33503 for (auto &AsmStr : AsmStrs) {
33504 // Match the OpNo string. We must match exactly to avoid matching a
33505 // substring, e.g. "$12" contains "$1".
33506 if (AsmStr.ends_with(OpNoStr1))
33507 I = AsmStr.size() - OpNoStr1.size();
33508
33509 // Get the index of operand in AsmStr.
33510 if (I == StringRef::npos)
33511 I = AsmStr.find(OpNoStr1 + ",");
33512 if (I == StringRef::npos)
33513 I = AsmStr.find(OpNoStr2);
33514
33515 if (I == StringRef::npos)
33516 continue;
33517
33518 assert(I > 0 && "Unexpected inline asm string!");
33519 // Remove the operand string and label (if it exists).
33520 // For example:
33521 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33522 // ==>
33523 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33524 // ==>
33525 // "call dword ptr "
33526 auto TmpStr = AsmStr.substr(0, I);
33527 I = TmpStr.rfind(':');
33528 if (I != StringRef::npos)
33529 TmpStr = TmpStr.substr(I + 1);
33530 return TmpStr.take_while(llvm::isAlpha);
33531 }
33532
33533 return StringRef();
33534}
33535
33536bool X86TargetLowering::isInlineAsmTargetBranch(
33537 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33538 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33539 // changed from indirect TargetLowering::C_Memory to direct
33540 // TargetLowering::C_Address.
33541 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33542 // location.
33543 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33544 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33545}
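// Illustrative sketch (hypothetical MS inline-asm string): given
// "call dword ptr ${0:P}", getInstrStrFromOpNo(AsmStrs, 0) returns "call", so
// isInlineAsmTargetBranch reports true and the operand constraint is treated
// as C_Address (a direct call target) rather than C_Memory.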
33546
33547static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33548 SDValue Mask) {
33549 EVT Ty = MVT::i8;
33550 auto V = DAG.getBitcast(MVT::i1, Mask);
33551 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33552 auto Zero = DAG.getConstant(0, DL, Ty);
33553 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33554 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33555 return SDValue(CmpZero.getNode(), 1);
33556}
33557
33558SDValue X86TargetLowering::visitMaskedLoad(
33559 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33560 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33561 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33562 // ->
33563 // _, flags = SUB 0, mask
33564 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33565 // bit_cast_to_vector<res>
33566 EVT VTy = PassThru.getValueType();
33567 EVT Ty = VTy.getVectorElementType();
33568 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33569 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33570 : DAG.getBitcast(Ty, PassThru);
33571 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33572 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33573 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33574 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33575 return DAG.getBitcast(VTy, NewLoad);
33576}
33577
33578SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33579 SDValue Chain,
33580 MachineMemOperand *MMO, SDValue Ptr,
33581 SDValue Val, SDValue Mask) const {
33582 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33583 // ->
33584 // _, flags = SUB 0, mask
33585 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33586 EVT Ty = Val.getValueType().getVectorElementType();
33587 SDVTList Tys = DAG.getVTList(MVT::Other);
33588 auto ScalarVal = DAG.getBitcast(Ty, Val);
33589 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33590 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33591 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33592 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33593}
33594
33595/// Provide custom lowering hooks for some operations.
33596SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33597 switch (Op.getOpcode()) {
33598 // clang-format off
33599 default: llvm_unreachable("Should not custom lower this!");
33600 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33602 return LowerCMP_SWAP(Op, Subtarget, DAG);
33603 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33604 case ISD::ATOMIC_LOAD_ADD:
33605 case ISD::ATOMIC_LOAD_SUB:
33606 case ISD::ATOMIC_LOAD_OR:
33607 case ISD::ATOMIC_LOAD_XOR:
33608 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33609 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33610 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33611 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33612 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33613 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33614 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33615 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33616 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33617 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33618 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33619 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33620 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33621 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33622 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33623 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33624 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33625 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33626 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33627 case ISD::SHL_PARTS:
33628 case ISD::SRA_PARTS:
33629 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33630 case ISD::FSHL:
33631 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33632 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33633 case ISD::STRICT_SINT_TO_FP:
33634 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33635 case ISD::STRICT_UINT_TO_FP:
33636 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33637 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33638 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33639 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33640 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33643 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33644 case ISD::FP_TO_SINT:
33645 case ISD::STRICT_FP_TO_SINT:
33646 case ISD::FP_TO_UINT:
33647 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33648 case ISD::FP_TO_SINT_SAT:
33649 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33650 case ISD::FP_EXTEND:
33651 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33652 case ISD::FP_ROUND:
33653 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33654 case ISD::FP16_TO_FP:
33655 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33656 case ISD::FP_TO_FP16:
33657 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33658 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33659 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33660 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33661 case ISD::FADD:
33662 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33663 case ISD::FROUND: return LowerFROUND(Op, DAG);
33664 case ISD::FABS:
33665 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33666 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33667 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33668 case ISD::LRINT:
33669 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33670 case ISD::SETCC:
33671 case ISD::STRICT_FSETCC:
33672 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33673 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33674 case ISD::SELECT: return LowerSELECT(Op, DAG);
33675 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33676 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33677 case ISD::VASTART: return LowerVASTART(Op, DAG);
33678 case ISD::VAARG: return LowerVAARG(Op, DAG);
33679 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33680 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33681 case ISD::INTRINSIC_VOID:
33682 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33683 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33684 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33685 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33686 case ISD::FRAME_TO_ARGS_OFFSET:
33687 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33688 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33689 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33690 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33691 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33692 case ISD::EH_SJLJ_SETUP_DISPATCH:
33693 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33694 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33695 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33696 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33697 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33698 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33699 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33700 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33701 case ISD::CTLZ:
33702 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33703 case ISD::CTTZ:
33704 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33705 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33706 case ISD::MULHS:
33707 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33708 case ISD::ROTL:
33709 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33710 case ISD::SRA:
33711 case ISD::SRL:
33712 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33713 case ISD::SADDO:
33714 case ISD::UADDO:
33715 case ISD::SSUBO:
33716 case ISD::USUBO: return LowerXALUO(Op, DAG);
33717 case ISD::SMULO:
33718 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33719 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33720 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33721 case ISD::SADDO_CARRY:
33722 case ISD::SSUBO_CARRY:
33723 case ISD::UADDO_CARRY:
33724 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33725 case ISD::ADD:
33726 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33727 case ISD::UADDSAT:
33728 case ISD::SADDSAT:
33729 case ISD::USUBSAT:
33730 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33731 case ISD::SMAX:
33732 case ISD::SMIN:
33733 case ISD::UMAX:
33734 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33735 case ISD::FMINIMUM:
33736 case ISD::FMAXIMUM:
33737 case ISD::FMINIMUMNUM:
33738 case ISD::FMAXIMUMNUM:
33739 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33740 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33741 case ISD::ABDS:
33742 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33743 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33744 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33745 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33746 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33747 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33748 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33749 case ISD::GC_TRANSITION_START:
33750 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33751 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33752 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33753 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33754 // clang-format on
33755 }
33756}
33757
33758/// Replace a node with an illegal result type with a new node built out of
33759/// custom code.
33760void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33761 SmallVectorImpl<SDValue> &Results,
33762 SelectionDAG &DAG) const {
33763 SDLoc dl(N);
33764 unsigned Opc = N->getOpcode();
33765 switch (Opc) {
33766 default:
33767#ifndef NDEBUG
33768 dbgs() << "ReplaceNodeResults: ";
33769 N->dump(&DAG);
33770#endif
33771 llvm_unreachable("Do not know how to custom type legalize this operation!");
33772 case X86ISD::CVTPH2PS: {
33773 EVT VT = N->getValueType(0);
33774 SDValue Lo, Hi;
33775 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33776 EVT LoVT, HiVT;
33777 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33778 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33779 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33780 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33781 Results.push_back(Res);
33782 return;
33783 }
33784 case X86ISD::STRICT_CVTPH2PS: {
33785 EVT VT = N->getValueType(0);
33786 SDValue Lo, Hi;
33787 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33788 EVT LoVT, HiVT;
33789 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33790 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33791 {N->getOperand(0), Lo});
33792 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33793 {N->getOperand(0), Hi});
33794 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33795 Lo.getValue(1), Hi.getValue(1));
33796 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33797 Results.push_back(Res);
33798 Results.push_back(Chain);
33799 return;
33800 }
33801 case X86ISD::CVTPS2PH:
33802 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33803 return;
33804 case ISD::CTPOP: {
33805 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33806 // If we have at most 32 active bits, then perform as i32 CTPOP.
33807 // TODO: Perform this in generic legalizer?
33808 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33809 unsigned LZ = Known.countMinLeadingZeros();
33810 unsigned TZ = Known.countMinTrailingZeros();
33811 if ((LZ + TZ) >= 32) {
33812 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33813 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33814 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33815 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33816 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33817 Results.push_back(Op);
33818 return;
33819 }
33820 // Use a v2i64 if possible.
33821 bool NoImplicitFloatOps =
33822 DAG.getMachineFunction().getFunction().hasFnAttribute(
33823 Attribute::NoImplicitFloat);
33824 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33825 SDValue Wide =
33826 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33827 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33828 // The bit count fits in 32 bits; extract it as i32 and then zero
33829 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33830 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33831 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33832 DAG.getVectorIdxConstant(0, dl));
33833 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33834 Results.push_back(Wide);
33835 }
33836 return;
33837 }
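// Illustrative sketch (assumed known bits): for (ctpop i64 %x) where
// computeKnownBits reports at least 32 leading-plus-trailing zero bits, the
// node becomes zext(ctpop i32 (trunc (srl %x, TZ))); otherwise, when v2i64 is
// legal, the count is done as a vector CTPOP and the low 32 bits extracted.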
33838 case ISD::MUL: {
33839 EVT VT = N->getValueType(0);
33840 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33841 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33842 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33843 // elements are needed.
33844 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33845 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33846 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33847 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33848 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33849 unsigned NumConcats = 16 / VT.getVectorNumElements();
33850 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33851 ConcatOps[0] = Res;
33852 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33853 Results.push_back(Res);
33854 return;
33855 }
33856 case ISD::SMULO:
33857 case ISD::UMULO: {
33858 EVT VT = N->getValueType(0);
33859 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33860 VT == MVT::v2i32 && "Unexpected VT!");
33861 bool IsSigned = Opc == ISD::SMULO;
33862 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33863 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33864 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33865 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33866 // Extract the high 32 bits from each result using PSHUFD.
33867 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33868 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33869 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33870 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33871 DAG.getVectorIdxConstant(0, dl));
33872
33873 // Truncate the low bits of the result. This will become PSHUFD.
33874 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33875
33876 SDValue HiCmp;
33877 if (IsSigned) {
33878 // SMULO overflows if the high bits don't match the sign of the low.
33879 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33880 } else {
33881 // UMULO overflows if the high bits are non-zero.
33882 HiCmp = DAG.getConstant(0, dl, VT);
33883 }
33884 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33885
33886 // Widen the result by padding with undef.
33887 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33888 DAG.getUNDEF(VT));
33889 Results.push_back(Res);
33890 Results.push_back(Ovf);
33891 return;
33892 }
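// Illustrative sketch (v2i32 [su]mulo): both operands are extended to v2i64
// and multiplied; the high halves are gathered with a {1,3,-1,-1} shuffle, the
// low half is a truncate that also becomes a PSHUFD, and overflow is
// (hi != sra(lo, 31)) for the signed case or (hi != 0) for the unsigned case.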
33893 case X86ISD::VPMADDWD: {
33894 // Legalize types for X86ISD::VPMADDWD by widening.
33895 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33896
33897 EVT VT = N->getValueType(0);
33898 EVT InVT = N->getOperand(0).getValueType();
33899 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33900 "Expected a VT that divides into 128 bits.");
33902 "Unexpected type action!");
33903 unsigned NumConcat = 128 / InVT.getSizeInBits();
33904
33905 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33906 InVT.getVectorElementType(),
33907 NumConcat * InVT.getVectorNumElements());
33908 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33909 VT.getVectorElementType(),
33910 NumConcat * VT.getVectorNumElements());
33911
33912 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33913 Ops[0] = N->getOperand(0);
33914 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33915 Ops[0] = N->getOperand(1);
33916 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33917
33918 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33919 Results.push_back(Res);
33920 return;
33921 }
33922 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33923 case X86ISD::FMINC:
33924 case X86ISD::FMIN:
33925 case X86ISD::FMAXC:
33926 case X86ISD::FMAX:
33927 case X86ISD::STRICT_FMIN:
33928 case X86ISD::STRICT_FMAX: {
33929 EVT VT = N->getValueType(0);
33930 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33931 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33932 SDValue UNDEF = DAG.getUNDEF(VT);
33933 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33934 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33935 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33936 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33937 SDValue Res;
33938 if (IsStrict)
33939 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33940 {N->getOperand(0), LHS, RHS});
33941 else
33942 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33943 Results.push_back(Res);
33944 if (IsStrict)
33945 Results.push_back(Res.getValue(1));
33946 return;
33947 }
33948 case ISD::SDIV:
33949 case ISD::UDIV:
33950 case ISD::SREM:
33951 case ISD::UREM: {
33952 EVT VT = N->getValueType(0);
33953 if (VT.isVector()) {
33955 "Unexpected type action!");
33956 // If this RHS is a constant splat vector we can widen this and let
33957 // division/remainder by constant optimize it.
33958 // TODO: Can we do something for non-splat?
33959 APInt SplatVal;
33960 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33961 unsigned NumConcats = 128 / VT.getSizeInBits();
33962 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33963 Ops0[0] = N->getOperand(0);
33964 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33965 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33966 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33967 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33968 Results.push_back(Res);
33969 }
33970 return;
33971 }
33972
33973 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33974 Results.push_back(V);
33975 return;
33976 }
33977 case ISD::TRUNCATE: {
33978 MVT VT = N->getSimpleValueType(0);
33979 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33980 return;
33981
33982 // The generic legalizer will try to widen the input type to the same
33983 // number of elements as the widened result type. But this isn't always
33984 // the best thing so do some custom legalization to avoid some cases.
33985 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33986 SDValue In = N->getOperand(0);
33987 EVT InVT = In.getValueType();
33988 EVT InEltVT = InVT.getVectorElementType();
33989 EVT EltVT = VT.getVectorElementType();
33990 unsigned MinElts = VT.getVectorNumElements();
33991 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33992 unsigned InBits = InVT.getSizeInBits();
33993
33994 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33995 unsigned PackOpcode;
33996 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33997 Subtarget, N->getFlags())) {
33998 if (SDValue Res =
33999 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34000 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34001 Results.push_back(Res);
34002 return;
34003 }
34004 }
34005
34006 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34007 // 128-bit and smaller inputs should avoid the truncate altogether and
34008 // use a shuffle instead.
34009 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34010 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34011 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34012 for (unsigned I = 0; I < MinElts; ++I)
34013 TruncMask[I] = Scale * I;
34014 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34015 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34016 "Illegal vector type in truncation");
34017 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34018 Results.push_back(
34019 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34020 return;
34021 }
34022 }
34023
34024 // With AVX512 there are some cases that can use a target specific
34025 // truncate node to go from 256/512 bits to fewer than 128 bits, with zeros
34026 // in the upper elements of the 128-bit result.
34027 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34028 // We can use VTRUNC directly for 256-bit inputs with VLX or for any 512-bit input.
34029 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34030 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34031 return;
34032 }
34033 // There's one case we can widen to 512 bits and use VTRUNC.
34034 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34035 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34036 DAG.getUNDEF(MVT::v4i64));
34037 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34038 return;
34039 }
34040 }
34041 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34042 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34043 isTypeLegal(MVT::v4i64)) {
34044 // Input needs to be split and output needs to be widened. Let's use two
34045 // VTRUNCs, and shuffle their results together into the wider type.
34046 SDValue Lo, Hi;
34047 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34048
34049 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34050 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34051 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34052 { 0, 1, 2, 3, 16, 17, 18, 19,
34053 -1, -1, -1, -1, -1, -1, -1, -1 });
34054 Results.push_back(Res);
34055 return;
34056 }
34057
34058 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34059 // this via type legalization.
34060 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34061 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34062 (!Subtarget.hasSSSE3() ||
34063 (!isTypeLegal(InVT) &&
34064 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34065 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34066 InEltVT.getSizeInBits() * WidenNumElts);
34067 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34068 return;
34069 }
34070
34071 return;
34072 }
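// Illustrative sketch (assumed types): when the PACK path above does not fire,
// truncating v4i32 -> v4i8 (widened result v16i8) is done with a single
// in-register shuffle selecting bytes {0, 4, 8, 12} of the bitcast 128-bit
// input instead of a generic truncate.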
34073 case ISD::ANY_EXTEND:
34074 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34075 // It's intended to custom handle the input type.
34076 assert(N->getValueType(0) == MVT::v8i8 &&
34077 "Do not know how to legalize this Node");
34078 return;
34079 case ISD::SIGN_EXTEND:
34080 case ISD::ZERO_EXTEND: {
34081 EVT VT = N->getValueType(0);
34082 SDValue In = N->getOperand(0);
34083 EVT InVT = In.getValueType();
34084 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34085 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34087 "Unexpected type action!");
34088 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34089 // Custom split this so we can extend i8/i16->i32 in a vector. This is
34090 // better since sign_extend_inreg i8/i16->i64 requires an extend to i32
34091 // using sra, then an extend from i32 to i64 using pcmpgt. By custom
34092 // splitting we allow the sra from the extend to i32 to be shared by the split.
34093 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34094
34095 // Fill a vector with sign bits for each element.
34096 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34097 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34098
34099 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34100 // to v2i64.
34101 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34102 {0, 4, 1, 5});
34103 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34104 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34105 {2, 6, 3, 7});
34106 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34107
34108 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34109 Results.push_back(Res);
34110 return;
34111 }
34112
34113 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34114 if (!InVT.is128BitVector()) {
34115 // Not a 128 bit vector, but maybe type legalization will promote
34116 // it to 128 bits.
34117 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34118 return;
34119 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34120 if (!InVT.is128BitVector())
34121 return;
34122
34123 // Promote the input to 128 bits. Type legalization will turn this into
34124 // zext_inreg/sext_inreg.
34125 In = DAG.getNode(Opc, dl, InVT, In);
34126 }
34127
34128 // Perform custom splitting instead of the two stage extend we would get
34129 // by default.
34130 EVT LoVT, HiVT;
34131 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34132 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34133
34134 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34135
34136 // We need to shift the input over by half the number of elements.
34137 unsigned NumElts = InVT.getVectorNumElements();
34138 unsigned HalfNumElts = NumElts / 2;
34139 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34140 for (unsigned i = 0; i != HalfNumElts; ++i)
34141 ShufMask[i] = i + HalfNumElts;
34142
34143 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34144 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34145
34146 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34147 Results.push_back(Res);
34148 }
34149 return;
34150 }
34151 case ISD::FP_TO_SINT_SAT:
34152 case ISD::FP_TO_UINT_SAT: {
34153 if (!Subtarget.hasAVX10_2())
34154 return;
34155
34156 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34157 EVT VT = N->getValueType(0);
34158 SDValue Op = N->getOperand(0);
34159 EVT OpVT = Op.getValueType();
34160 SDValue Res;
34161
34162 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34163 if (IsSigned)
34164 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34165 else
34166 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34167 Results.push_back(Res);
34168 }
34169 return;
34170 }
34171 case ISD::FP_TO_SINT:
34172 case ISD::STRICT_FP_TO_SINT:
34173 case ISD::FP_TO_UINT:
34174 case ISD::STRICT_FP_TO_UINT: {
34175 bool IsStrict = N->isStrictFPOpcode();
34176 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34177 EVT VT = N->getValueType(0);
34178 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34179 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34180 EVT SrcVT = Src.getValueType();
34181
34182 SDValue Res;
34183 if (isSoftF16(SrcVT, Subtarget)) {
34184 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34185 if (IsStrict) {
34186 Res =
34187 DAG.getNode(Opc, dl, {VT, MVT::Other},
34188 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34189 {NVT, MVT::Other}, {Chain, Src})});
34190 Chain = Res.getValue(1);
34191 } else {
34192 Res =
34193 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34194 }
34195 Results.push_back(Res);
34196 if (IsStrict)
34197 Results.push_back(Chain);
34198
34199 return;
34200 }
34201
34202 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34203 SrcVT.getVectorElementType() == MVT::f16) {
34204 EVT EleVT = VT.getVectorElementType();
34205 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34206
34207 if (SrcVT != MVT::v8f16) {
34208 SDValue Tmp =
34209 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34210 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34211 Ops[0] = Src;
34212 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34213 }
34214
34215 if (IsStrict) {
34216 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34217 Res =
34218 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34219 Chain = Res.getValue(1);
34220 } else {
34221 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34222 Res = DAG.getNode(Opc, dl, ResVT, Src);
34223 }
34224
34225 // TODO: Need to add exception check code for strict FP.
34226 if (EleVT.getSizeInBits() < 16) {
34227 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34228 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34229
34230 // Now widen to 128 bits.
34231 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34232 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34233 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34234 ConcatOps[0] = Res;
34235 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34236 }
34237
34238 Results.push_back(Res);
34239 if (IsStrict)
34240 Results.push_back(Chain);
34241
34242 return;
34243 }
34244
34245 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34247 "Unexpected type action!");
34248
34249 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34250 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34251 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34252 VT.getVectorNumElements());
34253 SDValue Res;
34254 SDValue Chain;
34255 if (IsStrict) {
34256 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34257 {N->getOperand(0), Src});
34258 Chain = Res.getValue(1);
34259 } else
34260 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34261
34262 // Preserve what we know about the size of the original result. If the
34263 // result is v2i32, we have to manually widen the assert.
34264 if (PromoteVT == MVT::v2i32)
34265 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34266 DAG.getUNDEF(MVT::v2i32));
34267
34268 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34269 Res.getValueType(), Res,
34270 DAG.getValueType(VT.getVectorElementType()));
34271
34272 if (PromoteVT == MVT::v2i32)
34273 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34274 DAG.getVectorIdxConstant(0, dl));
34275
34276 // Truncate back to the original width.
34277 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34278
34279 // Now widen to 128 bits.
34280 unsigned NumConcats = 128 / VT.getSizeInBits();
34282 VT.getVectorNumElements() * NumConcats);
34283 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34284 ConcatOps[0] = Res;
34285 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34286 Results.push_back(Res);
34287 if (IsStrict)
34288 Results.push_back(Chain);
34289 return;
34290 }
34291
34292
34293 if (VT == MVT::v2i32) {
34294 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34295 "Strict unsigned conversion requires AVX512");
34296 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34298 "Unexpected type action!");
34299 if (Src.getValueType() == MVT::v2f64) {
34300 if (!IsSigned && !Subtarget.hasAVX512()) {
34301 SDValue Res =
34302 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34303 Results.push_back(Res);
34304 return;
34305 }
34306
34307 if (IsStrict)
34308 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34309 else
34310 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34311
34312 // If we have VLX we can emit a target specific FP_TO_UINT node.
34313 if (!IsSigned && !Subtarget.hasVLX()) {
34314 // Otherwise we can defer to the generic legalizer which will widen
34315 // the input as well. This will be further widened during op
34316 // legalization to v8i32<-v8f64.
34317 // For strict nodes we'll need to widen ourselves.
34318 // FIXME: Fix the type legalizer to safely widen strict nodes?
34319 if (!IsStrict)
34320 return;
34321 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34322 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34323 Opc = N->getOpcode();
34324 }
34325 SDValue Res;
34326 SDValue Chain;
34327 if (IsStrict) {
34328 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34329 {N->getOperand(0), Src});
34330 Chain = Res.getValue(1);
34331 } else {
34332 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34333 }
34334 Results.push_back(Res);
34335 if (IsStrict)
34336 Results.push_back(Chain);
34337 return;
34338 }
34339
34340 // Custom widen strict v2f32->v2i32 by padding with zeros.
34341 // FIXME: Should generic type legalizer do this?
34342 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34343 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34344 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34345 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34346 {N->getOperand(0), Src});
34347 Results.push_back(Res);
34348 Results.push_back(Res.getValue(1));
34349 return;
34350 }
34351
34352 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34353 // so early out here.
34354 return;
34355 }
34356
34357 assert(!VT.isVector() && "Vectors should have been handled above!");
34358
34359 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34360 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34361 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34362 assert(!Subtarget.is64Bit() && "i64 should be legal");
34363 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34364 // If we use a 128-bit result we might need to use a target specific node.
34365 unsigned SrcElts =
34366 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34367 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34368 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34369 if (NumElts != SrcElts) {
34370 if (IsStrict)
34371 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34372 else
34373 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34374 }
34375
34376 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34377 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34378 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34379 ZeroIdx);
34380 SDValue Chain;
34381 if (IsStrict) {
34382 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34383 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34384 Chain = Res.getValue(1);
34385 } else
34386 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34387 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34388 Results.push_back(Res);
34389 if (IsStrict)
34390 Results.push_back(Chain);
34391 return;
34392 }
34393
34394 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34395 SDValue Chain;
34396 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34397 Results.push_back(V);
34398 if (IsStrict)
34399 Results.push_back(Chain);
34400 return;
34401 }
34402
34403 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34404 Results.push_back(V);
34405 if (IsStrict)
34406 Results.push_back(Chain);
34407 }
34408 return;
34409 }
34410 case ISD::LRINT:
34411 if (N->getValueType(0) == MVT::v2i32) {
34412 SDValue Src = N->getOperand(0);
34413 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34414 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34415 DAG.getUNDEF(MVT::v2f16));
34416 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34417 DAG.getUNDEF(MVT::v4f16));
34418 } else if (Src.getValueType() != MVT::v2f64) {
34419 return;
34420 }
34421 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34422 return;
34423 }
34424 [[fallthrough]];
34425 case ISD::LLRINT: {
34426 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34427 Results.push_back(V);
34428 return;
34429 }
34430
34431 case ISD::SINT_TO_FP:
34432 case ISD::STRICT_SINT_TO_FP:
34433 case ISD::UINT_TO_FP:
34434 case ISD::STRICT_UINT_TO_FP: {
34435 bool IsStrict = N->isStrictFPOpcode();
34436 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34437 EVT VT = N->getValueType(0);
34438 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34439 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34440 Subtarget.hasVLX()) {
34441 if (Src.getValueType().getVectorElementType() == MVT::i16)
34442 return;
34443
34444 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34445 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34446 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34447 : DAG.getUNDEF(MVT::v2i32));
34448 if (IsStrict) {
34449 unsigned Opc =
34450 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34451 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34452 {N->getOperand(0), Src});
34453 Results.push_back(Res);
34454 Results.push_back(Res.getValue(1));
34455 } else {
34456 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34457 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34458 }
34459 return;
34460 }
34461 if (VT != MVT::v2f32)
34462 return;
34463 EVT SrcVT = Src.getValueType();
34464 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34465 if (IsStrict) {
34466 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34467 : X86ISD::STRICT_CVTUI2P;
34468 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34469 {N->getOperand(0), Src});
34470 Results.push_back(Res);
34471 Results.push_back(Res.getValue(1));
34472 } else {
34473 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34474 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34475 }
34476 return;
34477 }
34478 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34479 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34480 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34481 SDValue One = DAG.getConstant(1, dl, SrcVT);
34482 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34483 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34484 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34485 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34486 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34487 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34488 for (int i = 0; i != 2; ++i) {
34489 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34490 SignSrc, DAG.getVectorIdxConstant(i, dl));
34491 if (IsStrict)
34492 SignCvts[i] =
34493 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34494 {N->getOperand(0), Elt});
34495 else
34496 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34497 };
34498 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34499 SDValue Slow, Chain;
34500 if (IsStrict) {
34501 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34502 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34503 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34504 {Chain, SignCvt, SignCvt});
34505 Chain = Slow.getValue(1);
34506 } else {
34507 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34508 }
34509 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34510 IsNeg =
34511 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34512 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34513 Results.push_back(Cvt);
34514 if (IsStrict)
34515 Results.push_back(Chain);
34516 return;
34517 }
34518
34519 if (SrcVT != MVT::v2i32)
34520 return;
34521
34522 if (IsSigned || Subtarget.hasAVX512()) {
34523 if (!IsStrict)
34524 return;
34525
34526 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34527 // FIXME: Should generic type legalizer do this?
34528 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34529 DAG.getConstant(0, dl, MVT::v2i32));
34530 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34531 {N->getOperand(0), Src});
34532 Results.push_back(Res);
34533 Results.push_back(Res.getValue(1));
34534 return;
34535 }
34536
34537 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34538 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34539 SDValue VBias = DAG.getConstantFP(
34540 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34541 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34542 DAG.getBitcast(MVT::v2i64, VBias));
34543 Or = DAG.getBitcast(MVT::v2f64, Or);
34544 if (IsStrict) {
34545 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34546 {N->getOperand(0), Or, VBias});
34547 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34548 {MVT::v4f32, MVT::Other},
34549 {Sub.getValue(1), Sub});
34550 Results.push_back(Res);
34551 Results.push_back(Res.getValue(1));
34552 } else {
34553 // TODO: Are there any fast-math-flags to propagate here?
34554 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34555 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34556 }
34557 return;
34558 }
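// Illustrative sketch of the bias trick above: 0x4330000000000000 is the
// double 2^52, so OR-ing a zero-extended 32-bit value into its low mantissa
// bits produces exactly the double 2^52 + x; subtracting the bias then yields
// x as a double, which VFPROUND finally rounds to f32.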
34559 case ISD::STRICT_FP_ROUND:
34560 case ISD::FP_ROUND: {
34561 bool IsStrict = N->isStrictFPOpcode();
34562 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34563 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34564 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34565 EVT SrcVT = Src.getValueType();
34566 EVT VT = N->getValueType(0);
34567 SDValue V;
34568 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34569 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34570 : DAG.getUNDEF(MVT::v2f32);
34571 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34572 }
34573 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34574 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34575 if (SrcVT.getVectorElementType() != MVT::f32)
34576 return;
34577
34578 if (IsStrict)
34579 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34580 {Chain, Src, Rnd});
34581 else
34582 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34583
34584 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34585 if (IsStrict)
34586 Results.push_back(V.getValue(1));
34587 return;
34588 }
34589 if (!isTypeLegal(Src.getValueType()))
34590 return;
34591 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34592 if (IsStrict)
34593 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34594 {Chain, Src});
34595 else
34596 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34597 Results.push_back(V);
34598 if (IsStrict)
34599 Results.push_back(V.getValue(1));
34600 return;
34601 }
34602 case ISD::FP_EXTEND:
34603 case ISD::STRICT_FP_EXTEND: {
34604 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34605 // No other ValueType for FP_EXTEND should reach this point.
34606 assert(N->getValueType(0) == MVT::v2f32 &&
34607 "Do not know how to legalize this Node");
34608 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34609 return;
34610 bool IsStrict = N->isStrictFPOpcode();
34611 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34612 if (Src.getValueType().getVectorElementType() != MVT::f16)
34613 return;
34614 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34615 : DAG.getUNDEF(MVT::v2f16);
34616 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34617 if (IsStrict)
34618 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34619 {N->getOperand(0), V});
34620 else
34621 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34622 Results.push_back(V);
34623 if (IsStrict)
34624 Results.push_back(V.getValue(1));
34625 return;
34626 }
34627 case ISD::INTRINSIC_W_CHAIN: {
34628 unsigned IntNo = N->getConstantOperandVal(1);
34629 switch (IntNo) {
34630 default : llvm_unreachable("Do not know how to custom type "
34631 "legalize this intrinsic operation!");
34632 case Intrinsic::x86_rdtsc:
34633 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34634 Results);
34635 case Intrinsic::x86_rdtscp:
34636 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34637 Results);
34638 case Intrinsic::x86_rdpmc:
34639 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34640 Results);
34641 return;
34642 case Intrinsic::x86_rdpru:
34643 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34644 Results);
34645 return;
34646 case Intrinsic::x86_xgetbv:
34647 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34648 Results);
34649 return;
34650 }
34651 }
34652 case ISD::READCYCLECOUNTER: {
34653 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34654 }
34655 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34656 EVT T = N->getValueType(0);
34657 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34658 bool Regs64bit = T == MVT::i128;
34659 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34660 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34661 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34662 SDValue cpInL, cpInH;
34663 std::tie(cpInL, cpInH) =
34664 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
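// CMPXCHG8B/CMPXCHG16B compares EDX:EAX (RDX:RAX) against the memory operand
// and, on a match, stores ECX:EBX (RCX:RBX); ZF reports success. Wire the
// expected and replacement values into those register pairs accordingly.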
34665 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34666 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34667 cpInH =
34668 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34669 cpInH, cpInL.getValue(1));
34670 SDValue swapInL, swapInH;
34671 std::tie(swapInL, swapInH) =
34672 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34673 swapInH =
34674 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34675 swapInH, cpInH.getValue(1));
34676
34677 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34678 // until later. So we keep the RBX input in a vreg and use a custom
34679 // inserter.
34680 // Since RBX will be a reserved register, the register allocator will not
34681 // make sure its value is properly saved and restored around this
34682 // live-range.
34683 SDValue Result;
34684 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34685 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34686 if (Regs64bit) {
34687 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34688 swapInH.getValue(1)};
34689 Result =
34690 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34691 } else {
34692 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34693 swapInH.getValue(1));
34694 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34695 swapInL.getValue(1)};
34696 Result =
34697 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34698 }
34699
34700 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34701 Regs64bit ? X86::RAX : X86::EAX,
34702 HalfT, Result.getValue(1));
34703 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34704 Regs64bit ? X86::RDX : X86::EDX,
34705 HalfT, cpOutL.getValue(2));
34706 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34707
34708 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34709 MVT::i32, cpOutH.getValue(2));
34710 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34711 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34712
34713 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34714 Results.push_back(Success);
34715 Results.push_back(EFLAGS.getValue(1));
34716 return;
34717 }
34718 case ISD::ATOMIC_LOAD: {
34719 assert(
34720 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34721 "Unexpected VT!");
34722 bool NoImplicitFloatOps =
34723 DAG.getMachineFunction().getFunction().hasFnAttribute(
34724 Attribute::NoImplicitFloat);
34725 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34726 auto *Node = cast<AtomicSDNode>(N);
34727
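// For i128, rely on the vendor guarantee that aligned 16-byte vector loads
// are atomic on AVX-capable CPUs: emit a plain v2i64 load and extract the two
// i64 halves from it.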
34728 if (N->getValueType(0) == MVT::i128) {
34729 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34730 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34731 Node->getBasePtr(), Node->getMemOperand());
34732 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34733 DAG.getVectorIdxConstant(0, dl));
34734 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34735 DAG.getVectorIdxConstant(1, dl));
34736 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34737 {ResL, ResH}));
34738 Results.push_back(Ld.getValue(1));
34739 return;
34740 }
34741 break;
34742 }
34743 if (Subtarget.hasSSE1()) {
34744 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34745 // Then extract the lower 64-bits.
34746 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34747 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34748 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34749 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34750 MVT::i64, Node->getMemOperand());
34751 if (Subtarget.hasSSE2()) {
34752 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34753 DAG.getVectorIdxConstant(0, dl));
34754 Results.push_back(Res);
34755 Results.push_back(Ld.getValue(1));
34756 return;
34757 }
34758 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34759 // then casts to i64. This avoids a 128-bit stack temporary being
34760 // created by type legalization if we were to cast v4f32->v2i64.
34761 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34762 DAG.getVectorIdxConstant(0, dl));
34763 Res = DAG.getBitcast(MVT::i64, Res);
34764 Results.push_back(Res);
34765 Results.push_back(Ld.getValue(1));
34766 return;
34767 }
34768 if (Subtarget.hasX87()) {
34769 // First load this into an 80-bit X87 register. This will put the whole
34770 // integer into the significand.
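// A single FILD of the 64-bit operand is one naturally aligned 8-byte access,
// which x86 performs atomically, so no lock prefix is needed here.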
34771 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34772 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34773 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34774 dl, Tys, Ops, MVT::i64,
34775 Node->getMemOperand());
34776 SDValue Chain = Result.getValue(1);
34777
34778 // Now store the X87 register to a stack temporary and convert to i64.
34779 // This store is not atomic and doesn't need to be.
34780 // FIXME: We don't need a stack temporary if the result of the load
34781 // is already being stored. We could just directly store there.
34782 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34783 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34784 MachinePointerInfo MPI =
34785 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34786 SDValue StoreOps[] = { Chain, Result, StackPtr };
34787 Chain = DAG.getMemIntrinsicNode(
34788 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34789 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34790
34791 // Finally load the value back from the stack temporary and return it.
34792 // This load is not atomic and doesn't need to be.
34793 // This load will be further type legalized.
34794 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34795 Results.push_back(Result);
34796 Results.push_back(Result.getValue(1));
34797 return;
34798 }
34799 }
34800 // TODO: Use MOVLPS when SSE1 is available?
34801 // Delegate to generic TypeLegalization. Situations we can really handle
34802 // should have already been dealt with by AtomicExpandPass.cpp.
34803 break;
34804 }
34805 case ISD::ATOMIC_SWAP:
34806 case ISD::ATOMIC_LOAD_ADD:
34807 case ISD::ATOMIC_LOAD_SUB:
34808 case ISD::ATOMIC_LOAD_AND:
34809 case ISD::ATOMIC_LOAD_OR:
34810 case ISD::ATOMIC_LOAD_XOR:
34811 case ISD::ATOMIC_LOAD_NAND:
34812 case ISD::ATOMIC_LOAD_MIN:
34813 case ISD::ATOMIC_LOAD_MAX:
34814 case ISD::ATOMIC_LOAD_UMIN:
34815 case ISD::ATOMIC_LOAD_UMAX:
34816 // Delegate to generic TypeLegalization. Situations we can really handle
34817 // should have already been dealt with by AtomicExpandPass.cpp.
34818 break;
34819
34820 case ISD::BITCAST: {
34821 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34822 EVT DstVT = N->getValueType(0);
34823 EVT SrcVT = N->getOperand(0).getValueType();
34824
34825 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
34826 // we can split using the k-register rather than memory.
34827 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34828 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34829 SDValue Lo, Hi;
34830 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34831 Lo = DAG.getBitcast(MVT::i32, Lo);
34832 Hi = DAG.getBitcast(MVT::i32, Hi);
34833 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34834 Results.push_back(Res);
34835 return;
34836 }
34837
34838 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34839 // FIXME: Use v4f32 for SSE1?
34840 assert(Subtarget.hasSSE2() && "Requires SSE2");
34841 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34842 "Unexpected type action!");
34843 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34844 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34845 N->getOperand(0));
34846 Res = DAG.getBitcast(WideVT, Res);
34847 Results.push_back(Res);
34848 return;
34849 }
34850
34851 return;
34852 }
34853 case ISD::MGATHER: {
34854 EVT VT = N->getValueType(0);
34855 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34856 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34857 auto *Gather = cast<MaskedGatherSDNode>(N);
34858 SDValue Index = Gather->getIndex();
34859 if (Index.getValueType() != MVT::v2i64)
34860 return;
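// Hardware gathers operate on at least 128-bit vectors, so widen the v2
// gather to v4 and rely on the (widened) mask to keep the extra lanes from
// being accessed.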
34862 "Unexpected type action!");
34863 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34864 SDValue Mask = Gather->getMask();
34865 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34866 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34867 Gather->getPassThru(),
34868 DAG.getUNDEF(VT));
34869 if (!Subtarget.hasVLX()) {
34870 // We need to widen the mask, but the instruction will only use 2
34871 // of its elements. So we can use undef.
34872 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34873 DAG.getUNDEF(MVT::v2i1));
34874 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34875 }
34876 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34877 Gather->getBasePtr(), Index, Gather->getScale() };
34878 SDValue Res = DAG.getMemIntrinsicNode(
34879 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34880 Gather->getMemoryVT(), Gather->getMemOperand());
34881 Results.push_back(Res);
34882 Results.push_back(Res.getValue(1));
34883 return;
34884 }
34885 return;
34886 }
34887 case ISD::LOAD: {
34888 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34889 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34890 // cast since type legalization will try to use an i64 load.
34891 MVT VT = N->getSimpleValueType(0);
34892 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34894 "Unexpected type action!");
34895 if (!ISD::isNON_EXTLoad(N))
34896 return;
34897 auto *Ld = cast<LoadSDNode>(N);
34898 if (Subtarget.hasSSE2()) {
34899 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34900 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34901 Ld->getPointerInfo(), Ld->getBaseAlign(),
34902 Ld->getMemOperand()->getFlags());
34903 SDValue Chain = Res.getValue(1);
34904 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34905 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34906 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34907 Res = DAG.getBitcast(WideVT, Res);
34908 Results.push_back(Res);
34909 Results.push_back(Chain);
34910 return;
34911 }
34912 assert(Subtarget.hasSSE1() && "Expected SSE");
34913 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34914 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34915 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34916 MVT::i64, Ld->getMemOperand());
34917 Results.push_back(Res);
34918 Results.push_back(Res.getValue(1));
34919 return;
34920 }
34921 case ISD::ADDRSPACECAST: {
34922 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34923 Results.push_back(V);
34924 return;
34925 }
34926 case ISD::BITREVERSE: {
34927 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34928 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34929 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34930 // We'll need to move the scalar in two i32 pieces.
34931 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34932 return;
34933 }
34934 case ISD::EXTRACT_VECTOR_ELT: {
34935 // f16 = extract vXf16 %vec, i64 %idx
34936 assert(N->getSimpleValueType(0) == MVT::f16 &&
34937 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34938 assert(Subtarget.hasFP16() && "Expected FP16");
34939 SDValue VecOp = N->getOperand(0);
34940 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34941 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34942 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34943 N->getOperand(1));
34944 Split = DAG.getBitcast(MVT::f16, Split);
34945 Results.push_back(Split);
34946 return;
34947 }
34948 }
34949}
34950
34951const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34952 switch ((X86ISD::NodeType)Opcode) {
34953 case X86ISD::FIRST_NUMBER: break;
34954#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34955 NODE_NAME_CASE(BSF)
34956 NODE_NAME_CASE(BSR)
34957 NODE_NAME_CASE(FSHL)
34958 NODE_NAME_CASE(FSHR)
34959 NODE_NAME_CASE(FAND)
34960 NODE_NAME_CASE(FANDN)
34961 NODE_NAME_CASE(FOR)
34962 NODE_NAME_CASE(FXOR)
34963 NODE_NAME_CASE(FILD)
34964 NODE_NAME_CASE(FIST)
34965 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34966 NODE_NAME_CASE(FLD)
34967 NODE_NAME_CASE(FST)
34968 NODE_NAME_CASE(CALL)
34969 NODE_NAME_CASE(CALL_RVMARKER)
34970 NODE_NAME_CASE(IMP_CALL)
34972 NODE_NAME_CASE(CMP)
34973 NODE_NAME_CASE(FCMP)
34974 NODE_NAME_CASE(STRICT_FCMP)
34975 NODE_NAME_CASE(STRICT_FCMPS)
34976 NODE_NAME_CASE(COMI)
34977 NODE_NAME_CASE(UCOMI)
34978 NODE_NAME_CASE(COMX)
34979 NODE_NAME_CASE(UCOMX)
34980 NODE_NAME_CASE(CMPM)
34981 NODE_NAME_CASE(CMPMM)
34982 NODE_NAME_CASE(STRICT_CMPM)
34983 NODE_NAME_CASE(CMPMM_SAE)
34984 NODE_NAME_CASE(SETCC)
34985 NODE_NAME_CASE(SETCC_CARRY)
34986 NODE_NAME_CASE(FSETCC)
34987 NODE_NAME_CASE(FSETCCM)
34988 NODE_NAME_CASE(FSETCCM_SAE)
34989 NODE_NAME_CASE(CMOV)
34990 NODE_NAME_CASE(BRCOND)
34991 NODE_NAME_CASE(RET_GLUE)
34992 NODE_NAME_CASE(IRET)
34993 NODE_NAME_CASE(REP_STOS)
34994 NODE_NAME_CASE(REP_MOVS)
34995 NODE_NAME_CASE(GlobalBaseReg)
34996 NODE_NAME_CASE(Wrapper)
34997 NODE_NAME_CASE(WrapperRIP)
34998 NODE_NAME_CASE(MOVQ2DQ)
34999 NODE_NAME_CASE(MOVDQ2Q)
35000 NODE_NAME_CASE(MMX_MOVD2W)
35001 NODE_NAME_CASE(MMX_MOVW2D)
35002 NODE_NAME_CASE(PEXTRB)
35003 NODE_NAME_CASE(PEXTRW)
35004 NODE_NAME_CASE(INSERTPS)
35005 NODE_NAME_CASE(PINSRB)
35006 NODE_NAME_CASE(PINSRW)
35007 NODE_NAME_CASE(PSHUFB)
35008 NODE_NAME_CASE(ANDNP)
35009 NODE_NAME_CASE(BLENDI)
35010 NODE_NAME_CASE(BLENDV)
35011 NODE_NAME_CASE(HADD)
35012 NODE_NAME_CASE(HSUB)
35013 NODE_NAME_CASE(FHADD)
35014 NODE_NAME_CASE(FHSUB)
35015 NODE_NAME_CASE(CONFLICT)
35016 NODE_NAME_CASE(FMAX)
35017 NODE_NAME_CASE(FMAXS)
35018 NODE_NAME_CASE(FMAX_SAE)
35019 NODE_NAME_CASE(FMAXS_SAE)
35020 NODE_NAME_CASE(STRICT_FMAX)
35021 NODE_NAME_CASE(FMIN)
35022 NODE_NAME_CASE(FMINS)
35023 NODE_NAME_CASE(FMIN_SAE)
35024 NODE_NAME_CASE(FMINS_SAE)
35025 NODE_NAME_CASE(STRICT_FMIN)
35026 NODE_NAME_CASE(FMAXC)
35027 NODE_NAME_CASE(FMINC)
35028 NODE_NAME_CASE(FRSQRT)
35029 NODE_NAME_CASE(FRCP)
35030 NODE_NAME_CASE(EXTRQI)
35031 NODE_NAME_CASE(INSERTQI)
35032 NODE_NAME_CASE(TLSADDR)
35033 NODE_NAME_CASE(TLSBASEADDR)
35034 NODE_NAME_CASE(TLSCALL)
35035 NODE_NAME_CASE(TLSDESC)
35036 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35037 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35038 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35039 NODE_NAME_CASE(EH_RETURN)
35040 NODE_NAME_CASE(TC_RETURN)
35041 NODE_NAME_CASE(FNSTCW16m)
35042 NODE_NAME_CASE(FLDCW16m)
35043 NODE_NAME_CASE(FNSTENVm)
35044 NODE_NAME_CASE(FLDENVm)
35045 NODE_NAME_CASE(LCMPXCHG_DAG)
35046 NODE_NAME_CASE(LCMPXCHG8_DAG)
35047 NODE_NAME_CASE(LCMPXCHG16_DAG)
35048 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35049 NODE_NAME_CASE(LADD)
35050 NODE_NAME_CASE(LSUB)
35051 NODE_NAME_CASE(LOR)
35052 NODE_NAME_CASE(LXOR)
35053 NODE_NAME_CASE(LAND)
35054 NODE_NAME_CASE(LBTS)
35055 NODE_NAME_CASE(LBTC)
35056 NODE_NAME_CASE(LBTR)
35057 NODE_NAME_CASE(LBTS_RM)
35058 NODE_NAME_CASE(LBTC_RM)
35059 NODE_NAME_CASE(LBTR_RM)
35060 NODE_NAME_CASE(AADD)
35061 NODE_NAME_CASE(AOR)
35062 NODE_NAME_CASE(AXOR)
35063 NODE_NAME_CASE(AAND)
35064 NODE_NAME_CASE(VZEXT_MOVL)
35065 NODE_NAME_CASE(VZEXT_LOAD)
35066 NODE_NAME_CASE(VEXTRACT_STORE)
35067 NODE_NAME_CASE(VTRUNC)
35068 NODE_NAME_CASE(VTRUNCS)
35069 NODE_NAME_CASE(VTRUNCUS)
35070 NODE_NAME_CASE(VMTRUNC)
35071 NODE_NAME_CASE(VMTRUNCS)
35072 NODE_NAME_CASE(VMTRUNCUS)
35073 NODE_NAME_CASE(VTRUNCSTORES)
35074 NODE_NAME_CASE(VTRUNCSTOREUS)
35075 NODE_NAME_CASE(VMTRUNCSTORES)
35076 NODE_NAME_CASE(VMTRUNCSTOREUS)
35077 NODE_NAME_CASE(VFPEXT)
35078 NODE_NAME_CASE(STRICT_VFPEXT)
35079 NODE_NAME_CASE(VFPEXT_SAE)
35080 NODE_NAME_CASE(VFPEXTS)
35081 NODE_NAME_CASE(VFPEXTS_SAE)
35082 NODE_NAME_CASE(VFPROUND)
35083 NODE_NAME_CASE(VFPROUND2)
35084 NODE_NAME_CASE(VFPROUND2_RND)
35085 NODE_NAME_CASE(STRICT_VFPROUND)
35086 NODE_NAME_CASE(VMFPROUND)
35087 NODE_NAME_CASE(VFPROUND_RND)
35088 NODE_NAME_CASE(VFPROUNDS)
35089 NODE_NAME_CASE(VFPROUNDS_RND)
35090 NODE_NAME_CASE(VSHLDQ)
35091 NODE_NAME_CASE(VSRLDQ)
35092 NODE_NAME_CASE(VSHL)
35093 NODE_NAME_CASE(VSRL)
35094 NODE_NAME_CASE(VSRA)
35095 NODE_NAME_CASE(VSHLI)
35096 NODE_NAME_CASE(VSRLI)
35097 NODE_NAME_CASE(VSRAI)
35098 NODE_NAME_CASE(VSHLV)
35099 NODE_NAME_CASE(VSRLV)
35100 NODE_NAME_CASE(VSRAV)
35101 NODE_NAME_CASE(VROTLI)
35102 NODE_NAME_CASE(VROTRI)
35103 NODE_NAME_CASE(VPPERM)
35104 NODE_NAME_CASE(CMPP)
35105 NODE_NAME_CASE(STRICT_CMPP)
35106 NODE_NAME_CASE(PCMPEQ)
35107 NODE_NAME_CASE(PCMPGT)
35108 NODE_NAME_CASE(PHMINPOS)
35109 NODE_NAME_CASE(ADD)
35110 NODE_NAME_CASE(SUB)
35111 NODE_NAME_CASE(ADC)
35112 NODE_NAME_CASE(SBB)
35113 NODE_NAME_CASE(SMUL)
35114 NODE_NAME_CASE(UMUL)
35115 NODE_NAME_CASE(OR)
35116 NODE_NAME_CASE(XOR)
35117 NODE_NAME_CASE(AND)
35118 NODE_NAME_CASE(BEXTR)
35119 NODE_NAME_CASE(BEXTRI)
35120 NODE_NAME_CASE(BZHI)
35121 NODE_NAME_CASE(PDEP)
35122 NODE_NAME_CASE(PEXT)
35123 NODE_NAME_CASE(MUL_IMM)
35124 NODE_NAME_CASE(MOVMSK)
35125 NODE_NAME_CASE(PTEST)
35126 NODE_NAME_CASE(TESTP)
35127 NODE_NAME_CASE(KORTEST)
35128 NODE_NAME_CASE(KTEST)
35129 NODE_NAME_CASE(KADD)
35130 NODE_NAME_CASE(KSHIFTL)
35131 NODE_NAME_CASE(KSHIFTR)
35132 NODE_NAME_CASE(PACKSS)
35133 NODE_NAME_CASE(PACKUS)
35134 NODE_NAME_CASE(PALIGNR)
35135 NODE_NAME_CASE(VALIGN)
35136 NODE_NAME_CASE(VSHLD)
35137 NODE_NAME_CASE(VSHRD)
35138 NODE_NAME_CASE(VSHLDV)
35139 NODE_NAME_CASE(VSHRDV)
35140 NODE_NAME_CASE(PSHUFD)
35141 NODE_NAME_CASE(PSHUFHW)
35142 NODE_NAME_CASE(PSHUFLW)
35143 NODE_NAME_CASE(SHUFP)
35144 NODE_NAME_CASE(SHUF128)
35145 NODE_NAME_CASE(MOVLHPS)
35146 NODE_NAME_CASE(MOVHLPS)
35147 NODE_NAME_CASE(MOVDDUP)
35148 NODE_NAME_CASE(MOVSHDUP)
35149 NODE_NAME_CASE(MOVSLDUP)
35150 NODE_NAME_CASE(MOVSD)
35151 NODE_NAME_CASE(MOVSS)
35152 NODE_NAME_CASE(MOVSH)
35153 NODE_NAME_CASE(UNPCKL)
35154 NODE_NAME_CASE(UNPCKH)
35155 NODE_NAME_CASE(VBROADCAST)
35156 NODE_NAME_CASE(VBROADCAST_LOAD)
35157 NODE_NAME_CASE(VBROADCASTM)
35158 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35159 NODE_NAME_CASE(VPERMILPV)
35160 NODE_NAME_CASE(VPERMILPI)
35161 NODE_NAME_CASE(VPERM2X128)
35162 NODE_NAME_CASE(VPERMV)
35163 NODE_NAME_CASE(VPERMV3)
35164 NODE_NAME_CASE(VPERMI)
35165 NODE_NAME_CASE(VPTERNLOG)
35166 NODE_NAME_CASE(FP_TO_SINT_SAT)
35167 NODE_NAME_CASE(FP_TO_UINT_SAT)
35168 NODE_NAME_CASE(VFIXUPIMM)
35169 NODE_NAME_CASE(VFIXUPIMM_SAE)
35170 NODE_NAME_CASE(VFIXUPIMMS)
35171 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35172 NODE_NAME_CASE(VRANGE)
35173 NODE_NAME_CASE(VRANGE_SAE)
35174 NODE_NAME_CASE(VRANGES)
35175 NODE_NAME_CASE(VRANGES_SAE)
35176 NODE_NAME_CASE(PMULUDQ)
35177 NODE_NAME_CASE(PMULDQ)
35178 NODE_NAME_CASE(PSADBW)
35179 NODE_NAME_CASE(DBPSADBW)
35180 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35181 NODE_NAME_CASE(VAARG_64)
35182 NODE_NAME_CASE(VAARG_X32)
35183 NODE_NAME_CASE(DYN_ALLOCA)
35184 NODE_NAME_CASE(MFENCE)
35185 NODE_NAME_CASE(SEG_ALLOCA)
35186 NODE_NAME_CASE(PROBED_ALLOCA)
35189 NODE_NAME_CASE(RDPKRU)
35190 NODE_NAME_CASE(WRPKRU)
35191 NODE_NAME_CASE(VPMADDUBSW)
35192 NODE_NAME_CASE(VPMADDWD)
35193 NODE_NAME_CASE(VPSHA)
35194 NODE_NAME_CASE(VPSHL)
35195 NODE_NAME_CASE(VPCOM)
35196 NODE_NAME_CASE(VPCOMU)
35197 NODE_NAME_CASE(VPERMIL2)
35198 NODE_NAME_CASE(FMSUB)
35199 NODE_NAME_CASE(STRICT_FMSUB)
35200 NODE_NAME_CASE(FNMADD)
35201 NODE_NAME_CASE(STRICT_FNMADD)
35202 NODE_NAME_CASE(FNMSUB)
35203 NODE_NAME_CASE(STRICT_FNMSUB)
35204 NODE_NAME_CASE(FMADDSUB)
35205 NODE_NAME_CASE(FMSUBADD)
35206 NODE_NAME_CASE(FMADD_RND)
35207 NODE_NAME_CASE(FNMADD_RND)
35208 NODE_NAME_CASE(FMSUB_RND)
35209 NODE_NAME_CASE(FNMSUB_RND)
35210 NODE_NAME_CASE(FMADDSUB_RND)
35211 NODE_NAME_CASE(FMSUBADD_RND)
35212 NODE_NAME_CASE(VFMADDC)
35213 NODE_NAME_CASE(VFMADDC_RND)
35214 NODE_NAME_CASE(VFCMADDC)
35215 NODE_NAME_CASE(VFCMADDC_RND)
35216 NODE_NAME_CASE(VFMULC)
35217 NODE_NAME_CASE(VFMULC_RND)
35218 NODE_NAME_CASE(VFCMULC)
35219 NODE_NAME_CASE(VFCMULC_RND)
35220 NODE_NAME_CASE(VFMULCSH)
35221 NODE_NAME_CASE(VFMULCSH_RND)
35222 NODE_NAME_CASE(VFCMULCSH)
35223 NODE_NAME_CASE(VFCMULCSH_RND)
35224 NODE_NAME_CASE(VFMADDCSH)
35225 NODE_NAME_CASE(VFMADDCSH_RND)
35226 NODE_NAME_CASE(VFCMADDCSH)
35227 NODE_NAME_CASE(VFCMADDCSH_RND)
35228 NODE_NAME_CASE(VPMADD52H)
35229 NODE_NAME_CASE(VPMADD52L)
35230 NODE_NAME_CASE(VRNDSCALE)
35231 NODE_NAME_CASE(STRICT_VRNDSCALE)
35232 NODE_NAME_CASE(VRNDSCALE_SAE)
35233 NODE_NAME_CASE(VRNDSCALES)
35234 NODE_NAME_CASE(VRNDSCALES_SAE)
35235 NODE_NAME_CASE(VREDUCE)
35236 NODE_NAME_CASE(VREDUCE_SAE)
35237 NODE_NAME_CASE(VREDUCES)
35238 NODE_NAME_CASE(VREDUCES_SAE)
35239 NODE_NAME_CASE(VGETMANT)
35240 NODE_NAME_CASE(VGETMANT_SAE)
35241 NODE_NAME_CASE(VGETMANTS)
35242 NODE_NAME_CASE(VGETMANTS_SAE)
35243 NODE_NAME_CASE(PCMPESTR)
35244 NODE_NAME_CASE(PCMPISTR)
35246 NODE_NAME_CASE(COMPRESS)
35247 NODE_NAME_CASE(EXPAND)
35248 NODE_NAME_CASE(SELECTS)
35249 NODE_NAME_CASE(ADDSUB)
35250 NODE_NAME_CASE(RCP14)
35251 NODE_NAME_CASE(RCP14S)
35252 NODE_NAME_CASE(RSQRT14)
35253 NODE_NAME_CASE(RSQRT14S)
35254 NODE_NAME_CASE(FADD_RND)
35255 NODE_NAME_CASE(FADDS)
35256 NODE_NAME_CASE(FADDS_RND)
35257 NODE_NAME_CASE(FSUB_RND)
35258 NODE_NAME_CASE(FSUBS)
35259 NODE_NAME_CASE(FSUBS_RND)
35260 NODE_NAME_CASE(FMUL_RND)
35261 NODE_NAME_CASE(FMULS)
35262 NODE_NAME_CASE(FMULS_RND)
35263 NODE_NAME_CASE(FDIV_RND)
35264 NODE_NAME_CASE(FDIVS)
35265 NODE_NAME_CASE(FDIVS_RND)
35266 NODE_NAME_CASE(FSQRT_RND)
35267 NODE_NAME_CASE(FSQRTS)
35268 NODE_NAME_CASE(FSQRTS_RND)
35269 NODE_NAME_CASE(FGETEXP)
35270 NODE_NAME_CASE(FGETEXP_SAE)
35271 NODE_NAME_CASE(FGETEXPS)
35272 NODE_NAME_CASE(FGETEXPS_SAE)
35273 NODE_NAME_CASE(SCALEF)
35274 NODE_NAME_CASE(SCALEF_RND)
35275 NODE_NAME_CASE(SCALEFS)
35276 NODE_NAME_CASE(SCALEFS_RND)
35277 NODE_NAME_CASE(MULHRS)
35278 NODE_NAME_CASE(SINT_TO_FP_RND)
35279 NODE_NAME_CASE(UINT_TO_FP_RND)
35280 NODE_NAME_CASE(CVTTP2SI)
35281 NODE_NAME_CASE(CVTTP2UI)
35282 NODE_NAME_CASE(STRICT_CVTTP2SI)
35283 NODE_NAME_CASE(STRICT_CVTTP2UI)
35284 NODE_NAME_CASE(MCVTTP2SI)
35285 NODE_NAME_CASE(MCVTTP2UI)
35286 NODE_NAME_CASE(CVTTP2SI_SAE)
35287 NODE_NAME_CASE(CVTTP2UI_SAE)
35288 NODE_NAME_CASE(CVTTS2SI)
35289 NODE_NAME_CASE(CVTTS2UI)
35290 NODE_NAME_CASE(CVTTS2SI_SAE)
35291 NODE_NAME_CASE(CVTTS2UI_SAE)
35292 NODE_NAME_CASE(CVTSI2P)
35293 NODE_NAME_CASE(CVTUI2P)
35294 NODE_NAME_CASE(STRICT_CVTSI2P)
35295 NODE_NAME_CASE(STRICT_CVTUI2P)
35296 NODE_NAME_CASE(MCVTSI2P)
35297 NODE_NAME_CASE(MCVTUI2P)
35298 NODE_NAME_CASE(VFPCLASS)
35299 NODE_NAME_CASE(VFPCLASSS)
35300 NODE_NAME_CASE(MULTISHIFT)
35301 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35302 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35303 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35304 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35305 NODE_NAME_CASE(CVTPS2PH)
35306 NODE_NAME_CASE(STRICT_CVTPS2PH)
35307 NODE_NAME_CASE(CVTPS2PH_SAE)
35308 NODE_NAME_CASE(MCVTPS2PH)
35309 NODE_NAME_CASE(MCVTPS2PH_SAE)
35310 NODE_NAME_CASE(CVTPH2PS)
35311 NODE_NAME_CASE(STRICT_CVTPH2PS)
35312 NODE_NAME_CASE(CVTPH2PS_SAE)
35313 NODE_NAME_CASE(CVTP2SI)
35314 NODE_NAME_CASE(CVTP2UI)
35315 NODE_NAME_CASE(MCVTP2SI)
35316 NODE_NAME_CASE(MCVTP2UI)
35317 NODE_NAME_CASE(CVTP2SI_RND)
35318 NODE_NAME_CASE(CVTP2UI_RND)
35319 NODE_NAME_CASE(CVTS2SI)
35320 NODE_NAME_CASE(CVTS2UI)
35321 NODE_NAME_CASE(CVTS2SI_RND)
35322 NODE_NAME_CASE(CVTS2UI_RND)
35323 NODE_NAME_CASE(CVTNEPS2BF16)
35324 NODE_NAME_CASE(MCVTNEPS2BF16)
35325 NODE_NAME_CASE(DPBF16PS)
35326 NODE_NAME_CASE(DPFP16PS)
35327 NODE_NAME_CASE(MPSADBW)
35328 NODE_NAME_CASE(LWPINS)
35329 NODE_NAME_CASE(MGATHER)
35330 NODE_NAME_CASE(MSCATTER)
35331 NODE_NAME_CASE(VPDPBUSD)
35332 NODE_NAME_CASE(VPDPBUSDS)
35333 NODE_NAME_CASE(VPDPWSSD)
35334 NODE_NAME_CASE(VPDPWSSDS)
35335 NODE_NAME_CASE(VPSHUFBITQMB)
35336 NODE_NAME_CASE(GF2P8MULB)
35337 NODE_NAME_CASE(GF2P8AFFINEQB)
35338 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35339 NODE_NAME_CASE(NT_CALL)
35340 NODE_NAME_CASE(NT_BRIND)
35341 NODE_NAME_CASE(UMWAIT)
35342 NODE_NAME_CASE(TPAUSE)
35343 NODE_NAME_CASE(ENQCMD)
35344 NODE_NAME_CASE(ENQCMDS)
35345 NODE_NAME_CASE(VP2INTERSECT)
35346 NODE_NAME_CASE(VPDPBSUD)
35347 NODE_NAME_CASE(VPDPBSUDS)
35348 NODE_NAME_CASE(VPDPBUUD)
35349 NODE_NAME_CASE(VPDPBUUDS)
35350 NODE_NAME_CASE(VPDPBSSD)
35351 NODE_NAME_CASE(VPDPBSSDS)
35352 NODE_NAME_CASE(VPDPWSUD)
35353 NODE_NAME_CASE(VPDPWSUDS)
35354 NODE_NAME_CASE(VPDPWUSD)
35355 NODE_NAME_CASE(VPDPWUSDS)
35356 NODE_NAME_CASE(VPDPWUUD)
35357 NODE_NAME_CASE(VPDPWUUDS)
35358 NODE_NAME_CASE(VMINMAX)
35359 NODE_NAME_CASE(VMINMAX_SAE)
35360 NODE_NAME_CASE(VMINMAXS)
35361 NODE_NAME_CASE(VMINMAXS_SAE)
35362 NODE_NAME_CASE(CVTP2IBS)
35363 NODE_NAME_CASE(CVTP2IUBS)
35364 NODE_NAME_CASE(CVTP2IBS_RND)
35365 NODE_NAME_CASE(CVTP2IUBS_RND)
35366 NODE_NAME_CASE(CVTTP2IBS)
35367 NODE_NAME_CASE(CVTTP2IUBS)
35368 NODE_NAME_CASE(CVTTP2IBS_SAE)
35369 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35370 NODE_NAME_CASE(VCVT2PH2BF8)
35371 NODE_NAME_CASE(VCVT2PH2BF8S)
35372 NODE_NAME_CASE(VCVT2PH2HF8)
35373 NODE_NAME_CASE(VCVT2PH2HF8S)
35374 NODE_NAME_CASE(VCVTBIASPH2BF8)
35375 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35376 NODE_NAME_CASE(VCVTBIASPH2HF8)
35377 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35378 NODE_NAME_CASE(VCVTPH2BF8)
35379 NODE_NAME_CASE(VCVTPH2BF8S)
35380 NODE_NAME_CASE(VCVTPH2HF8)
35381 NODE_NAME_CASE(VCVTPH2HF8S)
35382 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35383 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35384 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35385 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35386 NODE_NAME_CASE(VMCVTPH2BF8)
35387 NODE_NAME_CASE(VMCVTPH2BF8S)
35388 NODE_NAME_CASE(VMCVTPH2HF8)
35389 NODE_NAME_CASE(VMCVTPH2HF8S)
35390 NODE_NAME_CASE(VCVTHF82PH)
35391 NODE_NAME_CASE(AESENC128KL)
35392 NODE_NAME_CASE(AESDEC128KL)
35393 NODE_NAME_CASE(AESENC256KL)
35394 NODE_NAME_CASE(AESDEC256KL)
35395 NODE_NAME_CASE(AESENCWIDE128KL)
35396 NODE_NAME_CASE(AESDECWIDE128KL)
35397 NODE_NAME_CASE(AESENCWIDE256KL)
35398 NODE_NAME_CASE(AESDECWIDE256KL)
35399 NODE_NAME_CASE(CMPCCXADD)
35400 NODE_NAME_CASE(TESTUI)
35401 NODE_NAME_CASE(FP80_ADD)
35402 NODE_NAME_CASE(STRICT_FP80_ADD)
35403 NODE_NAME_CASE(CCMP)
35404 NODE_NAME_CASE(CTEST)
35405 NODE_NAME_CASE(CLOAD)
35406 NODE_NAME_CASE(CSTORE)
35407 NODE_NAME_CASE(CVTTS2SIS)
35408 NODE_NAME_CASE(CVTTS2UIS)
35409 NODE_NAME_CASE(CVTTS2SIS_SAE)
35410 NODE_NAME_CASE(CVTTS2UIS_SAE)
35411 NODE_NAME_CASE(CVTTP2SIS)
35412 NODE_NAME_CASE(MCVTTP2SIS)
35413 NODE_NAME_CASE(CVTTP2UIS_SAE)
35414 NODE_NAME_CASE(CVTTP2SIS_SAE)
35415 NODE_NAME_CASE(CVTTP2UIS)
35416 NODE_NAME_CASE(MCVTTP2UIS)
35417 NODE_NAME_CASE(POP_FROM_X87_REG)
35418 }
35419 return nullptr;
35420#undef NODE_NAME_CASE
35421}
35422
35423/// Return true if the addressing mode represented by AM is legal for this
35424/// target, for a load/store of the specified type.
35425 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35426 const AddrMode &AM, Type *Ty,
35427 unsigned AS,
35428 Instruction *I) const {
35429 // X86 supports extremely general addressing modes.
35430 CodeModel::Model M = getTargetMachine().getCodeModel();
35431
35432 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35433 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35434 return false;
35435
35436 if (AM.BaseGV) {
35437 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35438
35439 // If a reference to this global requires an extra load, we can't fold it.
35440 if (isGlobalStubReference(GVFlags))
35441 return false;
35442
35443 // If BaseGV requires a register for the PIC base, we cannot also have a
35444 // BaseReg specified.
35445 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35446 return false;
35447
35448 // If lower 4G is not available, then we must use rip-relative addressing.
35449 if ((M != CodeModel::Small || isPositionIndependent()) &&
35450 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35451 return false;
35452 }
35453
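// The SIB byte can encode index scales of 1, 2, 4 and 8 directly. Scales of
// 3, 5 and 9 are only expressible as base + index*2/4/8 with the index reused
// as the base, which is why they require that no other base register is
// present.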
35454 switch (AM.Scale) {
35455 case 0:
35456 case 1:
35457 case 2:
35458 case 4:
35459 case 8:
35460 // These scales always work.
35461 break;
35462 case 3:
35463 case 5:
35464 case 9:
35465 // These scales are formed with basereg+scalereg. Only accept if there is
35466 // no basereg yet.
35467 if (AM.HasBaseReg)
35468 return false;
35469 break;
35470 default: // Other stuff never works.
35471 return false;
35472 }
35473
35474 return true;
35475}
35476
35477bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35478 switch (Opcode) {
35479 // These are non-commutative binops.
35480 // TODO: Add more X86ISD opcodes once we have test coverage.
35481 case X86ISD::ANDNP:
35482 case X86ISD::PCMPGT:
35483 case X86ISD::FMAX:
35484 case X86ISD::FMIN:
35485 case X86ISD::FANDN:
35486 case X86ISD::VPSHA:
35487 case X86ISD::VPSHL:
35488 case X86ISD::VSHLV:
35489 case X86ISD::VSRLV:
35490 case X86ISD::VSRAV:
35491 return true;
35492 }
35493
35494 return TargetLoweringBase::isBinOp(Opcode);
35495}
35496
35497bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35498 switch (Opcode) {
35499 // TODO: Add more X86ISD opcodes once we have test coverage.
35500 case X86ISD::PCMPEQ:
35501 case X86ISD::PMULDQ:
35502 case X86ISD::PMULUDQ:
35503 case X86ISD::FMAXC:
35504 case X86ISD::FMINC:
35505 case X86ISD::FAND:
35506 case X86ISD::FOR:
35507 case X86ISD::FXOR:
35508 return true;
35509 }
35510
35512}
35513
35514 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35515 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35516 return false;
35517 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35518 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35519 return NumBits1 > NumBits2;
35520}
35521
35522 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35523 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35524 return false;
35525
35526 if (!isTypeLegal(EVT::getEVT(Ty1)))
35527 return false;
35528
35529 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35530
35531 // Assuming the caller doesn't have a zeroext or signext return parameter,
35532 // truncation all the way down to i1 is valid.
35533 return true;
35534}
35535
35537 return isInt<32>(Imm);
35538}
35539
35541 // Can also use sub to handle negated immediates.
35542 return isInt<32>(Imm);
35543}
35544
35546 return isInt<32>(Imm);
35547}
35548
35549 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35550 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35551 return false;
35552 unsigned NumBits1 = VT1.getSizeInBits();
35553 unsigned NumBits2 = VT2.getSizeInBits();
35554 return NumBits1 > NumBits2;
35555}
35556
35557 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35558 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35559 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35560}
35561
35562 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35563 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35564 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35565}
35566
35567 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35568 EVT VT1 = Val.getValueType();
35569 if (isZExtFree(VT1, VT2))
35570 return true;
35571
35572 if (Val.getOpcode() != ISD::LOAD)
35573 return false;
35574
35575 if (!VT1.isSimple() || !VT1.isInteger() ||
35576 !VT2.isSimple() || !VT2.isInteger())
35577 return false;
35578
35579 switch (VT1.getSimpleVT().SimpleTy) {
35580 default: break;
35581 case MVT::i8:
35582 case MVT::i16:
35583 case MVT::i32:
35584 // X86 has 8, 16, and 32-bit zero-extending loads.
35585 return true;
35586 }
35587
35588 return false;
35589}
35590
35591 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35592 if (!Subtarget.is64Bit())
35593 return false;
35594 return TargetLowering::shouldConvertPhiType(From, To);
35595 }
35596
35597 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35598 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35599 return false;
35600
35601 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35602
35603 // There is no extending load for vXi1.
35604 if (SrcVT.getScalarType() == MVT::i1)
35605 return false;
35606
35607 return true;
35608}
35609
35610 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35611 EVT VT) const {
35612 if (Subtarget.useSoftFloat())
35613 return false;
35614
35615 if (!Subtarget.hasAnyFMA())
35616 return false;
35617
35618 VT = VT.getScalarType();
35619
35620 if (!VT.isSimple())
35621 return false;
35622
35623 switch (VT.getSimpleVT().SimpleTy) {
35624 case MVT::f16:
35625 return Subtarget.hasFP16();
35626 case MVT::f32:
35627 case MVT::f64:
35628 return true;
35629 default:
35630 break;
35631 }
35632
35633 return false;
35634}
35635
35636 bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35637 EVT DestVT) const {
35638 // i16 instructions are longer (0x66 prefix) and potentially slower.
35639 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35640}
35641
35642 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35643 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35644 SDValue Y) const {
35645 if (SelectOpcode == ISD::SELECT) {
35646 if (VT.isVector())
35647 return false;
35648 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35649 return false;
35650 using namespace llvm::SDPatternMatch;
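// BMI1 idioms being matched: BLSI is x & -x (isolate lowest set bit), BLSR is
// x & (x - 1) (reset lowest set bit), and BLSMSK is x ^ (x - 1) (mask up to
// and including the lowest set bit).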
35651 // BLSI
35652 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35653 sd_match(X, m_Neg(m_Specific(Y)))))
35654 return true;
35655 // BLSR
35656 if (BinOpcode == ISD::AND &&
35657 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35658 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35659 return true;
35660 // BLSMSK
35661 if (BinOpcode == ISD::XOR &&
35662 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35663 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35664 return true;
35665
35666 return false;
35667 }
35668 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35669 // benefit. The transform may also be profitable for scalar code.
35670 if (!Subtarget.hasAVX512())
35671 return false;
35672 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35673 return false;
35674 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35675 return false;
35676
35677 return true;
35678}
35679
35680/// Targets can use this to indicate that they only support *some*
35681/// VECTOR_SHUFFLE operations, those with specific masks.
35682/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35683/// are assumed to be legal.
35684 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35685 if (!VT.isSimple())
35686 return false;
35687
35688 // Not for i1 vectors
35689 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35690 return false;
35691
35692 // Very little shuffling can be done for 64-bit vectors right now.
35693 if (VT.getSimpleVT().getSizeInBits() == 64)
35694 return false;
35695
35696 // We only care that the types being shuffled are legal. The lowering can
35697 // handle any possible shuffle mask that results.
35698 return isTypeLegal(VT.getSimpleVT());
35699}
35700
35701 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35702 EVT VT) const {
35703 // Don't convert an 'and' into a shuffle that we don't directly support.
35704 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35705 if (!Subtarget.hasAVX2())
35706 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35707 return false;
35708
35709 // Just delegate to the generic legality, clear masks aren't special.
35710 return isShuffleMaskLegal(Mask, VT);
35711}
35712
35713 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35714 // If the subtarget is using thunks, we must not generate jump tables.
35715 if (Subtarget.useIndirectThunkBranches())
35716 return false;
35717
35718 // Otherwise, fallback on the generic logic.
35720}
35721
35722 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35723 EVT ConditionVT) const {
35724 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35725 // zero-extensions.
35726 if (ConditionVT.getSizeInBits() < 32)
35727 return MVT::i32;
35728 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35729 ConditionVT);
35730}
35731
35732//===----------------------------------------------------------------------===//
35733// X86 Scheduler Hooks
35734//===----------------------------------------------------------------------===//
35735
35736/// Utility function to emit xbegin specifying the start of an RTM region.
35737 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35738 const TargetInstrInfo *TII) {
35739 const MIMetadata MIMD(MI);
35740
35741 const BasicBlock *BB = MBB->getBasicBlock();
35742 MachineFunction::iterator I = ++MBB->getIterator();
35743
35744 // For the v = xbegin(), we generate
35745 //
35746 // thisMBB:
35747 // xbegin sinkMBB
35748 //
35749 // mainMBB:
35750 // s0 = -1
35751 //
35752 // fallBB:
35753 // eax = # XABORT_DEF
35754 // s1 = eax
35755 //
35756 // sinkMBB:
35757 // v = phi(s0/mainBB, s1/fallBB)
35758
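// XBEGIN opens an RTM transactional region. If the transaction aborts,
// execution resumes at the fallback label with the abort status in EAX, which
// is what the XABORT_DEF pseudo below models.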
35759 MachineBasicBlock *thisMBB = MBB;
35760 MachineFunction *MF = MBB->getParent();
35761 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35762 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35763 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35764 MF->insert(I, mainMBB);
35765 MF->insert(I, fallMBB);
35766 MF->insert(I, sinkMBB);
35767
35768 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35769 mainMBB->addLiveIn(X86::EFLAGS);
35770 fallMBB->addLiveIn(X86::EFLAGS);
35771 sinkMBB->addLiveIn(X86::EFLAGS);
35772 }
35773
35774 // Transfer the remainder of BB and its successor edges to sinkMBB.
35775 sinkMBB->splice(sinkMBB->begin(), MBB,
35776 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35777 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35778
35779 MachineRegisterInfo &MRI = MF->getRegInfo();
35780 Register DstReg = MI.getOperand(0).getReg();
35781 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35782 Register mainDstReg = MRI.createVirtualRegister(RC);
35783 Register fallDstReg = MRI.createVirtualRegister(RC);
35784
35785 // thisMBB:
35786 // xbegin fallMBB
35787 // # fallthrough to mainMBB
35788 // # abortion to fallMBB
35789 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35790 thisMBB->addSuccessor(mainMBB);
35791 thisMBB->addSuccessor(fallMBB);
35792
35793 // mainMBB:
35794 // mainDstReg := -1
35795 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35796 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35797 mainMBB->addSuccessor(sinkMBB);
35798
35799 // fallMBB:
35800 // ; pseudo instruction to model hardware's definition from XABORT
35801 // EAX := XABORT_DEF
35802 // fallDstReg := EAX
35803 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35804 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35805 .addReg(X86::EAX);
35806 fallMBB->addSuccessor(sinkMBB);
35807
35808 // sinkMBB:
35809 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35810 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35811 .addReg(mainDstReg).addMBB(mainMBB)
35812 .addReg(fallDstReg).addMBB(fallMBB);
35813
35814 MI.eraseFromParent();
35815 return sinkMBB;
35816}
35817
35818 MachineBasicBlock *
35819 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35820 MachineBasicBlock *MBB) const {
35821 // Emit va_arg instruction on X86-64.
35822
35823 // Operands to this pseudo-instruction:
35824 // 0 ) Output : destination address (reg)
35825 // 1-5) Input : va_list address (addr, i64mem)
35826 // 6 ) ArgSize : Size (in bytes) of vararg type
35827 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35828 // 8 ) Align : Alignment of type
35829 // 9 ) EFLAGS (implicit-def)
35830
35831 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35832 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35833
35834 Register DestReg = MI.getOperand(0).getReg();
35835 MachineOperand &Base = MI.getOperand(1);
35836 MachineOperand &Scale = MI.getOperand(2);
35837 MachineOperand &Index = MI.getOperand(3);
35838 MachineOperand &Disp = MI.getOperand(4);
35839 MachineOperand &Segment = MI.getOperand(5);
35840 unsigned ArgSize = MI.getOperand(6).getImm();
35841 unsigned ArgMode = MI.getOperand(7).getImm();
35842 Align Alignment = Align(MI.getOperand(8).getImm());
35843
35844 MachineFunction *MF = MBB->getParent();
35845
35846 // Memory Reference
35847 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35848
35849 MachineMemOperand *OldMMO = MI.memoperands().front();
35850
35851 // Clone the MMO into two separate MMOs for loading and storing
35852 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35853 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35854 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35855 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35856
35857 // Machine Information
35858 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35859 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35860 const TargetRegisterClass *AddrRegClass =
35861 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35862 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35863 const MIMetadata MIMD(MI);
35864
35865 // struct va_list {
35866 // i32 gp_offset
35867 // i32 fp_offset
35868 // i64 overflow_area (address)
35869 // i64 reg_save_area (address)
35870 // }
35871 // sizeof(va_list) = 24
35872 // alignment(va_list) = 8
35873
35874 unsigned TotalNumIntRegs = 6;
35875 unsigned TotalNumXMMRegs = 8;
35876 bool UseGPOffset = (ArgMode == 1);
35877 bool UseFPOffset = (ArgMode == 2);
35878 unsigned MaxOffset = TotalNumIntRegs * 8 +
35879 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
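// Per the SysV x86-64 ABI, the register save area holds 6 GP registers of
// 8 bytes (offsets 0-47) followed by 8 XMM registers of 16 bytes (offsets
// 48-175), so MaxOffset is 48 for integer arguments and 176 for FP arguments.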
35880
35881 /* Align ArgSize to a multiple of 8 */
35882 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35883 bool NeedsAlign = (Alignment > 8);
35884
35885 MachineBasicBlock *thisMBB = MBB;
35886 MachineBasicBlock *overflowMBB;
35887 MachineBasicBlock *offsetMBB;
35888 MachineBasicBlock *endMBB;
35889
35890 Register OffsetDestReg; // Argument address computed by offsetMBB
35891 Register OverflowDestReg; // Argument address computed by overflowMBB
35892 Register OffsetReg;
35893
35894 if (!UseGPOffset && !UseFPOffset) {
35895 // If we only pull from the overflow region, we don't create a branch.
35896 // We don't need to alter control flow.
35897 OffsetDestReg = Register(); // unused
35898 OverflowDestReg = DestReg;
35899
35900 offsetMBB = nullptr;
35901 overflowMBB = thisMBB;
35902 endMBB = thisMBB;
35903 } else {
35904 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35905 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35906 // If not, pull from overflow_area. (branch to overflowMBB)
35907 //
35908 // thisMBB
35909 // | .
35910 // | .
35911 // offsetMBB overflowMBB
35912 // | .
35913 // | .
35914 // endMBB
35915
35916 // Registers for the PHI in endMBB
35917 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35918 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35919
35920 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35921 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35922 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35923 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35924
35925 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35926
35927 // Insert the new basic blocks
35928 MF->insert(MBBIter, offsetMBB);
35929 MF->insert(MBBIter, overflowMBB);
35930 MF->insert(MBBIter, endMBB);
35931
35932 // Transfer the remainder of MBB and its successor edges to endMBB.
35933 endMBB->splice(endMBB->begin(), thisMBB,
35934 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35935 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35936
35937 // Make offsetMBB and overflowMBB successors of thisMBB
35938 thisMBB->addSuccessor(offsetMBB);
35939 thisMBB->addSuccessor(overflowMBB);
35940
35941 // endMBB is a successor of both offsetMBB and overflowMBB
35942 offsetMBB->addSuccessor(endMBB);
35943 overflowMBB->addSuccessor(endMBB);
35944
35945 // Load the offset value into a register
35946 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35947 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35948 .add(Base)
35949 .add(Scale)
35950 .add(Index)
35951 .addDisp(Disp, UseFPOffset ? 4 : 0)
35952 .add(Segment)
35953 .setMemRefs(LoadOnlyMMO);
35954
35955 // Check if there is enough room left to pull this argument.
35956 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35957 .addReg(OffsetReg)
35958 .addImm(MaxOffset + 8 - ArgSizeA8);
35959
35960 // Branch to "overflowMBB" if offset >= max
35961 // Fall through to "offsetMBB" otherwise
35962 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35963 .addMBB(overflowMBB).addImm(X86::COND_AE);
35964 }
35965
35966 // In offsetMBB, emit code to use the reg_save_area.
35967 if (offsetMBB) {
35968 assert(OffsetReg != 0);
35969
35970 // Read the reg_save_area address.
35971 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35972 BuildMI(
35973 offsetMBB, MIMD,
35974 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35975 RegSaveReg)
35976 .add(Base)
35977 .add(Scale)
35978 .add(Index)
35979 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35980 .add(Segment)
35981 .setMemRefs(LoadOnlyMMO);
35982
35983 if (Subtarget.isTarget64BitLP64()) {
35984 // Zero-extend the offset
35985 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35986 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35987 .addImm(0)
35988 .addReg(OffsetReg)
35989 .addImm(X86::sub_32bit);
35990
35991 // Add the offset to the reg_save_area to get the final address.
35992 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35993 .addReg(OffsetReg64)
35994 .addReg(RegSaveReg);
35995 } else {
35996 // Add the offset to the reg_save_area to get the final address.
35997 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35998 .addReg(OffsetReg)
35999 .addReg(RegSaveReg);
36000 }
36001
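// gp_offset advances by 8 per general-purpose register slot, fp_offset by 16
// per XMM register slot.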
36002 // Compute the offset for the next argument
36003 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36004 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36005 .addReg(OffsetReg)
36006 .addImm(UseFPOffset ? 16 : 8);
36007
36008 // Store it back into the va_list.
36009 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36010 .add(Base)
36011 .add(Scale)
36012 .add(Index)
36013 .addDisp(Disp, UseFPOffset ? 4 : 0)
36014 .add(Segment)
36015 .addReg(NextOffsetReg)
36016 .setMemRefs(StoreOnlyMMO);
36017
36018 // Jump to endMBB
36019 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36020 .addMBB(endMBB);
36021 }
36022
36023 //
36024 // Emit code to use overflow area
36025 //
36026
36027 // Load the overflow_area address into a register.
36028 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36029 BuildMI(overflowMBB, MIMD,
36030 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36031 OverflowAddrReg)
36032 .add(Base)
36033 .add(Scale)
36034 .add(Index)
36035 .addDisp(Disp, 8)
36036 .add(Segment)
36037 .setMemRefs(LoadOnlyMMO);
36038
36039 // If we need to align it, do so. Otherwise, just copy the address
36040 // to OverflowDestReg.
36041 if (NeedsAlign) {
36042 // Align the overflow address
36043 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36044
36045 // aligned_addr = (addr + (align-1)) & ~(align-1)
36046 BuildMI(
36047 overflowMBB, MIMD,
36048 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36049 TmpReg)
36050 .addReg(OverflowAddrReg)
36051 .addImm(Alignment.value() - 1);
36052
36053 BuildMI(
36054 overflowMBB, MIMD,
36055 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36056 OverflowDestReg)
36057 .addReg(TmpReg)
36058 .addImm(~(uint64_t)(Alignment.value() - 1));
36059 } else {
36060 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36061 .addReg(OverflowAddrReg);
36062 }
36063
36064 // Compute the next overflow address after this argument.
36065 // (the overflow address should be kept 8-byte aligned)
36066 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36067 BuildMI(
36068 overflowMBB, MIMD,
36069 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36070 NextAddrReg)
36071 .addReg(OverflowDestReg)
36072 .addImm(ArgSizeA8);
36073
36074 // Store the new overflow address.
36075 BuildMI(overflowMBB, MIMD,
36076 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36077 .add(Base)
36078 .add(Scale)
36079 .add(Index)
36080 .addDisp(Disp, 8)
36081 .add(Segment)
36082 .addReg(NextAddrReg)
36083 .setMemRefs(StoreOnlyMMO);
36084
36085 // If we branched, emit the PHI to the front of endMBB.
36086 if (offsetMBB) {
36087 BuildMI(*endMBB, endMBB->begin(), MIMD,
36088 TII->get(X86::PHI), DestReg)
36089 .addReg(OffsetDestReg).addMBB(offsetMBB)
36090 .addReg(OverflowDestReg).addMBB(overflowMBB);
36091 }
36092
36093 // Erase the pseudo instruction
36094 MI.eraseFromParent();
36095
36096 return endMBB;
36097}
36098
36099// The EFLAGS operand of SelectItr might be missing a kill marker
36100// because there were multiple uses of EFLAGS, and ISel didn't know
36101// which to mark. Figure out whether SelectItr should have had a
36102// kill marker, and set it if it should. Returns the correct kill
36103// marker value.
36104 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36105 MachineBasicBlock* BB,
36106 const TargetRegisterInfo* TRI) {
36107 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36108 return false;
36109
36110 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36111 // out. SelectMI should have a kill flag on EFLAGS.
36112 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36113 return true;
36114}
36115
36116// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36117// together with other CMOV pseudo-opcodes into a single basic-block with
36118// conditional jump around it.
36119 static bool isCMOVPseudo(MachineInstr &MI) {
36120 switch (MI.getOpcode()) {
36121 case X86::CMOV_FR16:
36122 case X86::CMOV_FR16X:
36123 case X86::CMOV_FR32:
36124 case X86::CMOV_FR32X:
36125 case X86::CMOV_FR64:
36126 case X86::CMOV_FR64X:
36127 case X86::CMOV_GR8:
36128 case X86::CMOV_GR16:
36129 case X86::CMOV_GR32:
36130 case X86::CMOV_RFP32:
36131 case X86::CMOV_RFP64:
36132 case X86::CMOV_RFP80:
36133 case X86::CMOV_VR64:
36134 case X86::CMOV_VR128:
36135 case X86::CMOV_VR128X:
36136 case X86::CMOV_VR256:
36137 case X86::CMOV_VR256X:
36138 case X86::CMOV_VR512:
36139 case X86::CMOV_VK1:
36140 case X86::CMOV_VK2:
36141 case X86::CMOV_VK4:
36142 case X86::CMOV_VK8:
36143 case X86::CMOV_VK16:
36144 case X86::CMOV_VK32:
36145 case X86::CMOV_VK64:
36146 return true;
36147
36148 default:
36149 return false;
36150 }
36151}
36152
36153// Helper function, which inserts PHI functions into SinkMBB:
36154// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36155// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36156// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36157// the last PHI function inserted.
36158 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36159 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36160 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36161 MachineBasicBlock *SinkMBB) {
36162 MachineFunction *MF = TrueMBB->getParent();
36163 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36164 const MIMetadata MIMD(*MIItBegin);
36165
36166 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36167 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36168
36169 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36170
36171 // As we are creating the PHIs, we have to be careful if there is more than
36172 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36173 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36174 // That also means that PHI construction must work forward from earlier to
36175 // later, and that the code must maintain a mapping from each earlier PHI's
36176 // destination register to the registers that went into that PHI.
36177 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36178 MachineInstrBuilder MIB;
36179
36180 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36181 Register DestReg = MIIt->getOperand(0).getReg();
36182 Register Op1Reg = MIIt->getOperand(1).getReg();
36183 Register Op2Reg = MIIt->getOperand(2).getReg();
36184
36185 // If this CMOV we are generating is the opposite condition from
36186 // the jump we generated, then we have to swap the operands for the
36187 // PHI that is going to be generated.
36188 if (MIIt->getOperand(3).getImm() == OppCC)
36189 std::swap(Op1Reg, Op2Reg);
36190
36191 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36192 Op1Reg = It->second.first;
36193
36194 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36195 Op2Reg = It->second.second;
36196
36197 MIB =
36198 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36199 .addReg(Op1Reg)
36200 .addMBB(FalseMBB)
36201 .addReg(Op2Reg)
36202 .addMBB(TrueMBB);
36203
36204 // Add this PHI to the rewrite table.
36205 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36206 }
36207
36208 return MIB;
36209}
36210
36211// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
36212 MachineBasicBlock *
36213 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36214 MachineInstr &SecondCascadedCMOV,
36215 MachineBasicBlock *ThisMBB) const {
36216 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36217 const MIMetadata MIMD(FirstCMOV);
36218
36219 // We lower cascaded CMOVs such as
36220 //
36221 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36222 //
36223 // to two successive branches.
36224 //
36225 // Without this, we would add a PHI between the two jumps, which ends up
36226 // creating a few copies all around. For instance, for
36227 //
36228 // (sitofp (zext (fcmp une)))
36229 //
36230 // we would generate:
36231 //
36232 // ucomiss %xmm1, %xmm0
36233 // movss <1.0f>, %xmm0
36234 // movaps %xmm0, %xmm1
36235 // jne .LBB5_2
36236 // xorps %xmm1, %xmm1
36237 // .LBB5_2:
36238 // jp .LBB5_4
36239 // movaps %xmm1, %xmm0
36240 // .LBB5_4:
36241 // retq
36242 //
36243 // because this custom-inserter would have generated:
36244 //
36245 // A
36246 // | \
36247 // | B
36248 // | /
36249 // C
36250 // | \
36251 // | D
36252 // | /
36253 // E
36254 //
36255 // A: X = ...; Y = ...
36256 // B: empty
36257 // C: Z = PHI [X, A], [Y, B]
36258 // D: empty
36259 // E: PHI [X, C], [Z, D]
36260 //
36261 // If we lower both CMOVs in a single step, we can instead generate:
36262 //
36263 // A
36264 // | \
36265 // | C
36266 // | /|
36267 // |/ |
36268 // | |
36269 // | D
36270 // | /
36271 // E
36272 //
36273 // A: X = ...; Y = ...
36274 // D: empty
36275 // E: PHI [X, A], [X, C], [Y, D]
36276 //
36277 // Which, in our sitofp/fcmp example, gives us something like:
36278 //
36279 // ucomiss %xmm1, %xmm0
36280 // movss <1.0f>, %xmm0
36281 // jne .LBB5_4
36282 // jp .LBB5_4
36283 // xorps %xmm0, %xmm0
36284 // .LBB5_4:
36285 // retq
36286 //
36287
36288 // We lower cascaded CMOV into two successive branches to the same block.
36289 // EFLAGS is used by both, so mark it as live in the second.
36290 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36291 MachineFunction *F = ThisMBB->getParent();
36292 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36293 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36294 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36295
36296 MachineFunction::iterator It = ++ThisMBB->getIterator();
36297 F->insert(It, FirstInsertedMBB);
36298 F->insert(It, SecondInsertedMBB);
36299 F->insert(It, SinkMBB);
36300
36301 // For a cascaded CMOV, we lower it to two successive branches to
36302 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36303 // the FirstInsertedMBB.
36304 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36305
36306 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36307 // live into the sink and copy blocks.
36308 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36309 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36310 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36311 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36312 SinkMBB->addLiveIn(X86::EFLAGS);
36313 }
36314
36315 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36316 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36317 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36318 ThisMBB->end());
36319 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36320
36321 // Fallthrough block for ThisMBB.
36322 ThisMBB->addSuccessor(FirstInsertedMBB);
36323 // The true block target of the first branch is always SinkMBB.
36324 ThisMBB->addSuccessor(SinkMBB);
36325 // Fallthrough block for FirstInsertedMBB.
36326 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36327 // The true block for the branch of FirstInsertedMBB.
36328 FirstInsertedMBB->addSuccessor(SinkMBB);
36329 // This is fallthrough.
36330 SecondInsertedMBB->addSuccessor(SinkMBB);
36331
36332 // Create the conditional branch instructions.
36333 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36334 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36335
36336 X86::CondCode SecondCC =
36337 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36338 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36339 .addMBB(SinkMBB)
36340 .addImm(SecondCC);
36341
36342 // SinkMBB:
36343 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36344 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36345 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36346 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36347 MachineInstrBuilder MIB =
36348 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36349 .addReg(Op1Reg)
36350 .addMBB(SecondInsertedMBB)
36351 .addReg(Op2Reg)
36352 .addMBB(ThisMBB);
36353
36354 // SecondInsertedMBB provides the same incoming value as the
36355 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
36356 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36357
36358 // Now remove the CMOVs.
36359 FirstCMOV.eraseFromParent();
36360 SecondCascadedCMOV.eraseFromParent();
36361
36362 return SinkMBB;
36363}
36364
36365 MachineBasicBlock *
36366 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36367 MachineBasicBlock *ThisMBB) const {
36368 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36369 const MIMetadata MIMD(MI);
36370
36371 // To "insert" a SELECT_CC instruction, we actually have to insert the
36372 // diamond control-flow pattern. The incoming instruction knows the
36373 // destination vreg to set, the condition code register to branch on, the
36374 // true/false values to select between and a branch opcode to use.
36375
36376 // ThisMBB:
36377 // ...
36378 // TrueVal = ...
36379 // cmpTY ccX, r1, r2
36380 // bCC copy1MBB
36381 // fallthrough --> FalseMBB
36382
36383 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36384 // as described above, by inserting a BB, and then making a PHI at the join
36385 // point to select the true and false operands of the CMOV in the PHI.
36386 //
36387 // The code also handles two different cases of multiple CMOV opcodes
36388 // in a row.
36389 //
36390 // Case 1:
36391 // In this case, there are multiple CMOVs in a row, all of which are based on
36392 // the same condition setting (or the exact opposite condition setting).
36393 // In this case we can lower all the CMOVs using a single inserted BB, and
36394 // then make a number of PHIs at the join point to model the CMOVs. The only
36395 // trickiness here is that in a case like:
36396 //
36397 // t2 = CMOV cond1 t1, f1
36398 // t3 = CMOV cond1 t2, f2
36399 //
36400 // when rewriting this into PHIs, we have to perform some renaming on the
36401 // temps since you cannot have a PHI operand refer to a PHI result earlier
36402 // in the same block. The "simple" but wrong lowering would be:
36403 //
36404 // t2 = PHI t1(BB1), f1(BB2)
36405 // t3 = PHI t2(BB1), f2(BB2)
36406 //
36407 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36408 // renaming is to note that on the path through BB1, t2 is really just a
36409 // copy of t1, and do that renaming, properly generating:
36410 //
36411 // t2 = PHI t1(BB1), f1(BB2)
36412 // t3 = PHI t1(BB1), f2(BB2)
36413 //
36414 // Case 2:
36415 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36416 // function - EmitLoweredCascadedSelect.
36417
36418 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36419 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36420 MachineInstr *LastCMOV = &MI;
36421 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36422
36423 // Check for case 1, where there are multiple CMOVs with the same condition
36424 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36425 // number of jumps the most.
36426
36427 if (isCMOVPseudo(MI)) {
36428 // See if we have a string of CMOVs with the same condition. Skip over
36429 // intervening debug insts.
36430 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36431 (NextMIIt->getOperand(3).getImm() == CC ||
36432 NextMIIt->getOperand(3).getImm() == OppCC)) {
36433 LastCMOV = &*NextMIIt;
36434 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36435 }
36436 }
36437
36438 // This checks for case 2, but only if we didn't already find case 1,
36439 // as indicated by LastCMOV still pointing at MI.
36440 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36441 NextMIIt->getOpcode() == MI.getOpcode() &&
36442 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36443 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36444 NextMIIt->getOperand(1).isKill()) {
36445 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36446 }
36447
36448 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36449 MachineFunction *F = ThisMBB->getParent();
36450 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36451 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36452
36453 MachineFunction::iterator It = ++ThisMBB->getIterator();
36454 F->insert(It, FalseMBB);
36455 F->insert(It, SinkMBB);
36456
36457 // Set the call frame size on entry to the new basic blocks.
36458 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36459 FalseMBB->setCallFrameSize(CallFrameSize);
36460 SinkMBB->setCallFrameSize(CallFrameSize);
36461
36462 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36463 // live into the sink and copy blocks.
36464 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36465 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36466 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36467 FalseMBB->addLiveIn(X86::EFLAGS);
36468 SinkMBB->addLiveIn(X86::EFLAGS);
36469 }
36470
36471 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36472 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36473 MachineBasicBlock::iterator(LastCMOV));
36474 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36475 if (MI.isDebugInstr())
36476 SinkMBB->push_back(MI.removeFromParent());
36477
36478 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36479 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36480 std::next(MachineBasicBlock::iterator(LastCMOV)),
36481 ThisMBB->end());
36482 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36483
36484 // Fallthrough block for ThisMBB.
36485 ThisMBB->addSuccessor(FalseMBB);
36486 // The true block target of the first (or only) branch is always SinkMBB.
36487 ThisMBB->addSuccessor(SinkMBB);
36488 // Fallthrough block for FalseMBB.
36489 FalseMBB->addSuccessor(SinkMBB);
36490
36491 // Create the conditional branch instruction.
36492 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36493
36494 // SinkMBB:
36495 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36496 // ...
36497 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36498 MachineBasicBlock::iterator MIItEnd =
36499 std::next(MachineBasicBlock::iterator(LastCMOV));
36500 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36501
36502 // Now remove the CMOV(s).
36503 ThisMBB->erase(MIItBegin, MIItEnd);
36504
36505 return SinkMBB;
36506}
36507
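// Return the SUB-with-immediate opcode that matches the stack pointer width.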
36508static unsigned getSUBriOpcode(bool IsLP64) {
36509 if (IsLP64)
36510 return X86::SUB64ri32;
36511 else
36512 return X86::SUB32ri;
36513}
36514
36515MachineBasicBlock *
36516X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36517 MachineBasicBlock *MBB) const {
36518 MachineFunction *MF = MBB->getParent();
36519 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36520 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36521 const MIMetadata MIMD(MI);
36522 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36523
36524 const unsigned ProbeSize = getStackProbeSize(*MF);
36525
36527 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36528 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36529 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36530
36531 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36532 MF->insert(MBBIter, testMBB);
36533 MF->insert(MBBIter, blockMBB);
36534 MF->insert(MBBIter, tailMBB);
36535
36536 Register sizeVReg = MI.getOperand(1).getReg();
36537
36538 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36539
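// Compute the final stack pointer (current SP minus the allocation size) up
// front; blockMBB below then moves SP down one page at a time, touching each
// page, until testMBB observes that SP has reached the final value.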
36540 Register TmpStackPtr = MRI.createVirtualRegister(
36541 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36542 Register FinalStackPtr = MRI.createVirtualRegister(
36543 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36544
36545 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36546 .addReg(physSPReg);
36547 {
36548 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36549 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36550 .addReg(TmpStackPtr)
36551 .addReg(sizeVReg);
36552 }
36553
36554 // testMBB: check whether the stack pointer has reached its final value yet.
36555
36556 BuildMI(testMBB, MIMD,
36557 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36558 .addReg(FinalStackPtr)
36559 .addReg(physSPReg);
36560
36561 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36562 .addMBB(tailMBB)
36563 .addImm(X86::COND_GE);
36564 testMBB->addSuccessor(blockMBB);
36565 testMBB->addSuccessor(tailMBB);
36566
36567 // Touch the block, then extend it. This is the opposite order from a static
36568 // probe, where we allocate then touch; it avoids having to probe the tail of
36569 // the static alloca. Possible scenarios are:
36570 //
36571 //       + ---- <- ------------ <- ------------- <- ------------ +
36572 //       |                                                       |
36573 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36574 //                                                               |                                                                |
36575 //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
36576 //
36577 // The property we want to enforce is to never have more than [page alloc] between two probes.
36578
36579 const unsigned XORMIOpc =
36580 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36581 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36582 .addImm(0);
36583
36584 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36585 physSPReg)
36586 .addReg(physSPReg)
36587 .addImm(ProbeSize);
36588
36589 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36590 blockMBB->addSuccessor(testMBB);
36591
36592 // Replace the original instruction's result with the final stack pointer.
36593 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36594 MI.getOperand(0).getReg())
36595 .addReg(FinalStackPtr);
36596
36597 tailMBB->splice(tailMBB->end(), MBB,
36598 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36599 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36600 MBB->addSuccessor(testMBB);
36601
36602 // Delete the original pseudo instruction.
36603 MI.eraseFromParent();
36604
36605 // And we're done.
36606 return tailMBB;
36607}
36608
36609MachineBasicBlock *
36610X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36611 MachineBasicBlock *BB) const {
36612 MachineFunction *MF = BB->getParent();
36613 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36614 const MIMetadata MIMD(MI);
36615 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36616
36617 assert(MF->shouldSplitStack());
36618
36619 const bool Is64Bit = Subtarget.is64Bit();
36620 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36621
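// The current stacklet's limit lives at a fixed offset from the thread
// pointer (FS on 64-bit, GS on 32-bit); the offsets below are the ones the
// split-stack runtime (as used by gcc's -fsplit-stack) is expected to provide.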
36622 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36623 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36624
36625 // BB:
36626 // ... [Till the alloca]
36627 // If stacklet is not large enough, jump to mallocMBB
36628 //
36629 // bumpMBB:
36630 // Allocate by subtracting from RSP
36631 // Jump to continueMBB
36632 //
36633 // mallocMBB:
36634 // Allocate by call to runtime
36635 //
36636 // continueMBB:
36637 // ...
36638 // [rest of original BB]
36639 //
36640
36641 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36642 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36643 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36644
36645 MachineRegisterInfo &MRI = MF->getRegInfo();
36646 const TargetRegisterClass *AddrRegClass =
36647 getRegClassFor(getPointerTy(MF->getDataLayout()));
36648
36649 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36650 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36651 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36652 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36653 sizeVReg = MI.getOperand(1).getReg(),
36654 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36655
36656 MachineFunction::iterator MBBIter = ++BB->getIterator();
36657
36658 MF->insert(MBBIter, bumpMBB);
36659 MF->insert(MBBIter, mallocMBB);
36660 MF->insert(MBBIter, continueMBB);
36661
36662 continueMBB->splice(continueMBB->begin(), BB,
36663 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36664 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36665
36666 // Add code to the main basic block to check if the stack limit has been hit,
36667 // and if so, jump to mallocMBB otherwise to bumpMBB.
36668 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36669 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36670 .addReg(tmpSPVReg).addReg(sizeVReg);
36671 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36672 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36673 .addReg(SPLimitVReg);
36674 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36675
36676 // bumpMBB simply decreases the stack pointer, since we know the current
36677 // stacklet has enough space.
36678 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36679 .addReg(SPLimitVReg);
36680 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36681 .addReg(SPLimitVReg);
36682 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36683
36684 // Calls into a routine in libgcc to allocate more space from the heap.
36685 const uint32_t *RegMask =
36686 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36687 if (IsLP64) {
36688 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36689 .addReg(sizeVReg);
36690 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36691 .addExternalSymbol("__morestack_allocate_stack_space")
36692 .addRegMask(RegMask)
36693 .addReg(X86::RDI, RegState::Implicit)
36694 .addReg(X86::RAX, RegState::ImplicitDefine);
36695 } else if (Is64Bit) {
36696 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36697 .addReg(sizeVReg);
36698 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36699 .addExternalSymbol("__morestack_allocate_stack_space")
36700 .addRegMask(RegMask)
36701 .addReg(X86::EDI, RegState::Implicit)
36702 .addReg(X86::EAX, RegState::ImplicitDefine);
36703 } else {
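// Note: the 12 bytes of padding plus the 4-byte size argument pushed below
// appear intended to keep the outgoing stack 16-byte aligned for the call;
// the full 16 bytes are popped again after the call (see the ADD32ri below).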
36704 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36705 .addImm(12);
36706 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36707 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36708 .addExternalSymbol("__morestack_allocate_stack_space")
36709 .addRegMask(RegMask)
36710 .addReg(X86::EAX, RegState::ImplicitDefine);
36711 }
36712
36713 if (!Is64Bit)
36714 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36715 .addImm(16);
36716
36717 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36718 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36719 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36720
36721 // Set up the CFG correctly.
36722 BB->addSuccessor(bumpMBB);
36723 BB->addSuccessor(mallocMBB);
36724 mallocMBB->addSuccessor(continueMBB);
36725 bumpMBB->addSuccessor(continueMBB);
36726
36727 // Take care of the PHI nodes.
36728 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36729 MI.getOperand(0).getReg())
36730 .addReg(mallocPtrVReg)
36731 .addMBB(mallocMBB)
36732 .addReg(bumpSPPtrVReg)
36733 .addMBB(bumpMBB);
36734
36735 // Delete the original pseudo instruction.
36736 MI.eraseFromParent();
36737
36738 // And we're done.
36739 return continueMBB;
36740}
36741
36742MachineBasicBlock *
36743X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36744 MachineBasicBlock *BB) const {
36745 MachineFunction *MF = BB->getParent();
36746 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36747 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36748 const MIMetadata MIMD(MI);
36749
36752 "SEH does not use catchret!");
36753
36754 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36755 if (!Subtarget.is32Bit())
36756 return BB;
36757
36758 // C++ EH creates a new target block to hold the restore code, and wires up
36759 // the new block to the return destination with a normal JMP_4.
36760 MachineBasicBlock *RestoreMBB =
36761 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36762 assert(BB->succ_size() == 1);
36763 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36764 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36765 BB->addSuccessor(RestoreMBB);
36766 MI.getOperand(0).setMBB(RestoreMBB);
36767
36768 // Marking this as an EH pad but not a funclet entry block causes PEI to
36769 // restore stack pointers in the block.
36770 RestoreMBB->setIsEHPad(true);
36771
36772 auto RestoreMBBI = RestoreMBB->begin();
36773 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36774 return BB;
36775}
36776
36777MachineBasicBlock *
36778X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36779 MachineBasicBlock *BB) const {
36780 // This is pretty easy. We're taking the value that we received from
36781 // our load from the relocation, sticking it in either RDI (x86-64)
36782 // or EAX and doing an indirect call. The return value will then
36783 // be in the normal return register.
36784 MachineFunction *F = BB->getParent();
36785 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36786 const MIMetadata MIMD(MI);
36787
36788 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36789 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36790
36791 // Get a register mask for the lowered call.
36792 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36793 // proper register mask.
36794 const uint32_t *RegMask =
36795 Subtarget.is64Bit() ?
36796 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36797 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36798 if (Subtarget.is64Bit()) {
36799 MachineInstrBuilder MIB =
36800 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36801 .addReg(X86::RIP)
36802 .addImm(0)
36803 .addReg(0)
36804 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36805 MI.getOperand(3).getTargetFlags())
36806 .addReg(0);
36807 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36808 addDirectMem(MIB, X86::RDI);
36809 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36810 } else if (!isPositionIndependent()) {
36811 MachineInstrBuilder MIB =
36812 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36813 .addReg(0)
36814 .addImm(0)
36815 .addReg(0)
36816 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36817 MI.getOperand(3).getTargetFlags())
36818 .addReg(0);
36819 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36820 addDirectMem(MIB, X86::EAX);
36821 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36822 } else {
36823 MachineInstrBuilder MIB =
36824 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36825 .addReg(TII->getGlobalBaseReg(F))
36826 .addImm(0)
36827 .addReg(0)
36828 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36829 MI.getOperand(3).getTargetFlags())
36830 .addReg(0);
36831 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36832 addDirectMem(MIB, X86::EAX);
36833 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36834 }
36835
36836 MI.eraseFromParent(); // The pseudo instruction is gone now.
36837 return BB;
36838}
36839
36840static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36841 switch (RPOpc) {
36842 case X86::INDIRECT_THUNK_CALL32:
36843 return X86::CALLpcrel32;
36844 case X86::INDIRECT_THUNK_CALL64:
36845 return X86::CALL64pcrel32;
36846 case X86::INDIRECT_THUNK_TCRETURN32:
36847 return X86::TCRETURNdi;
36848 case X86::INDIRECT_THUNK_TCRETURN64:
36849 return X86::TCRETURNdi64;
36850 }
36851 llvm_unreachable("not indirect thunk opcode");
36852}
36853
36854static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36855 Register Reg) {
36856 if (Subtarget.useRetpolineExternalThunk()) {
36857 // When using an external thunk for retpolines, we pick names that match the
36858 // names GCC happens to use as well. This helps simplify the implementation
36859 // of the thunks for kernels where they have no easy ability to create
36860 // aliases and are doing non-trivial configuration of the thunk's body. For
36861 // example, the Linux kernel will do boot-time hot patching of the thunk
36862 // bodies and cannot easily export aliases of these to loaded modules.
36863 //
36864 // Note that at any point in the future, we may need to change the semantics
36865 // of how we implement retpolines and at that time will likely change the
36866 // name of the called thunk. Essentially, there is no hard guarantee that
36867 // LLVM will generate calls to specific thunks, we merely make a best-effort
36868 // attempt to help out kernels and other systems where duplicating the
36869 // thunks is costly.
36870 switch (Reg.id()) {
36871 case X86::EAX:
36872 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36873 return "__x86_indirect_thunk_eax";
36874 case X86::ECX:
36875 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36876 return "__x86_indirect_thunk_ecx";
36877 case X86::EDX:
36878 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36879 return "__x86_indirect_thunk_edx";
36880 case X86::EDI:
36881 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36882 return "__x86_indirect_thunk_edi";
36883 case X86::R11:
36884 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36885 return "__x86_indirect_thunk_r11";
36886 }
36887 llvm_unreachable("unexpected reg for external indirect thunk");
36888 }
36889
36890 if (Subtarget.useRetpolineIndirectCalls() ||
36891 Subtarget.useRetpolineIndirectBranches()) {
36892 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36893 switch (Reg.id()) {
36894 case X86::EAX:
36895 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36896 return "__llvm_retpoline_eax";
36897 case X86::ECX:
36898 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36899 return "__llvm_retpoline_ecx";
36900 case X86::EDX:
36901 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36902 return "__llvm_retpoline_edx";
36903 case X86::EDI:
36904 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36905 return "__llvm_retpoline_edi";
36906 case X86::R11:
36907 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36908 return "__llvm_retpoline_r11";
36909 }
36910 llvm_unreachable("unexpected reg for retpoline");
36911 }
36912
36913 if (Subtarget.useLVIControlFlowIntegrity()) {
36914 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36915 return "__llvm_lvi_thunk_r11";
36916 }
36917 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36918}
36919
36920MachineBasicBlock *
36921X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36922 MachineBasicBlock *BB) const {
36923 // Copy the virtual register into the R11 physical register and
36924 // call the retpoline thunk.
36925 const MIMetadata MIMD(MI);
36926 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36927 Register CalleeVReg = MI.getOperand(0).getReg();
36928 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36929
36930 // Find an available scratch register to hold the callee. On 64-bit, we can
36931 // just use R11, but we scan for uses anyway to ensure we don't generate
36932 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36933 // already a register use operand to the call to hold the callee. If none
36934 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36935 // register and ESI is the base pointer to realigned stack frames with VLAs.
36936 SmallVector<Register, 3> AvailableRegs;
36937 if (Subtarget.is64Bit())
36938 AvailableRegs.push_back(X86::R11);
36939 else
36940 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36941
36942 // Zero out any registers that are already used.
36943 for (const auto &MO : MI.operands()) {
36944 if (MO.isReg() && MO.isUse())
36945 llvm::replace(AvailableRegs, MO.getReg(), Register());
36946 }
36947
36948 // Choose the first remaining non-zero available register.
36949 Register AvailableReg;
36950 for (Register MaybeReg : AvailableRegs) {
36951 if (MaybeReg) {
36952 AvailableReg = MaybeReg;
36953 break;
36954 }
36955 }
36956 if (!AvailableReg)
36957 report_fatal_error("calling convention incompatible with retpoline, no "
36958 "available registers");
36959
36960 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36961
36962 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36963 .addReg(CalleeVReg);
36964 MI.getOperand(0).ChangeToES(Symbol);
36965 MI.setDesc(TII->get(Opc));
36966 MachineInstrBuilder(*BB->getParent(), &MI)
36967 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36968 return BB;
36969}
36970
36971/// SetJmp implies future control flow change upon calling the corresponding
36972/// LongJmp.
36973/// Instead of using the 'return' instruction, the long jump fixes the stack and
36974/// performs an indirect branch. To do so it uses the registers that were stored
36975/// in the jump buffer (when calling SetJmp).
36976/// In case the shadow stack is enabled we need to fix it as well, because some
36977/// return addresses will be skipped.
36978/// The function will save the SSP for future fixing in the function
36979/// emitLongJmpShadowStackFix.
36980/// \sa emitLongJmpShadowStackFix
36981/// \param [in] MI The temporary Machine Instruction for the builtin.
36982/// \param [in] MBB The Machine Basic Block that will be modified.
36983void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36984 MachineBasicBlock *MBB) const {
36985 const MIMetadata MIMD(MI);
36986 MachineFunction *MF = MBB->getParent();
36987 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36988 MachineRegisterInfo &MRI = MF->getRegInfo();
36989 MachineInstrBuilder MIB;
36990
36991 // Memory Reference.
36992 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36993
36994 // Initialize a register with zero.
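// RDSSP leaves its destination unchanged when shadow stacks are not enabled,
// so pre-zeroing the register ensures a well-defined (zero) value is saved in
// that case.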
36995 MVT PVT = getPointerTy(MF->getDataLayout());
36996 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36997 Register ZReg = MRI.createVirtualRegister(PtrRC);
36998 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36999 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37000 .addDef(ZReg)
37001 .addReg(ZReg, RegState::Undef)
37002 .addReg(ZReg, RegState::Undef);
37003
37004 // Read the current SSP Register value to the zeroed register.
37005 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37006 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37007 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37008
37009 // Write the SSP register value to offset 3 in input memory buffer.
37010 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37011 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37012 const int64_t SSPOffset = 3 * PVT.getStoreSize();
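// Operand 0 of the setjmp pseudo is the destination register; the X86
// address operands describing the jump buffer start at operand 1.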
37013 const unsigned MemOpndSlot = 1;
37014 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37015 if (i == X86::AddrDisp)
37016 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37017 else
37018 MIB.add(MI.getOperand(MemOpndSlot + i));
37019 }
37020 MIB.addReg(SSPCopyReg);
37021 MIB.setMemRefs(MMOs);
37022}
37023
37024MachineBasicBlock *
37025X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37026 MachineBasicBlock *MBB) const {
37027 const MIMetadata MIMD(MI);
37028 MachineFunction *MF = MBB->getParent();
37029 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37030 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37031 MachineRegisterInfo &MRI = MF->getRegInfo();
37032
37033 const BasicBlock *BB = MBB->getBasicBlock();
37034 MachineFunction::iterator I = ++MBB->getIterator();
37035
37036 // Memory Reference
37037 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37038
37039 unsigned MemOpndSlot = 0;
37040
37041 unsigned CurOp = 0;
37042
37043 Register DstReg = MI.getOperand(CurOp++).getReg();
37044 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37045 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37046 (void)TRI;
37047 Register mainDstReg = MRI.createVirtualRegister(RC);
37048 Register restoreDstReg = MRI.createVirtualRegister(RC);
37049
37050 MemOpndSlot = CurOp;
37051
37052 MVT PVT = getPointerTy(MF->getDataLayout());
37053 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37054 "Invalid Pointer Size!");
37055
37056 // For v = setjmp(buf), we generate
37057 //
37058 // thisMBB:
37059 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37060 // SjLjSetup restoreMBB
37061 //
37062 // mainMBB:
37063 // v_main = 0
37064 //
37065 // sinkMBB:
37066 // v = phi(main, restore)
37067 //
37068 // restoreMBB:
37069 // if base pointer being used, load it from frame
37070 // v_restore = 1
37071
37072 MachineBasicBlock *thisMBB = MBB;
37073 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37074 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37075 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37076 MF->insert(I, mainMBB);
37077 MF->insert(I, sinkMBB);
37078 MF->push_back(restoreMBB);
37079 restoreMBB->setMachineBlockAddressTaken();
37080
37082
37083 // Transfer the remainder of BB and its successor edges to sinkMBB.
37084 sinkMBB->splice(sinkMBB->begin(), MBB,
37085 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37086 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37087
37088 // thisMBB:
37089 unsigned PtrStoreOpc = 0;
37090 Register LabelReg;
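// The jump buffer is treated as pointer-sized slots: 0 = frame pointer,
// 1 = resume address (stored below), 2 = stack pointer, 3 = shadow stack
// pointer (see emitEHSjLjLongJmp and emitSetJmpShadowStackFix).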
37091 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37092 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37093 !isPositionIndependent();
37094
37095 // Prepare IP either in reg or imm.
37096 if (!UseImmLabel) {
37097 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37098 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37099 LabelReg = MRI.createVirtualRegister(PtrRC);
37100 if (Subtarget.is64Bit()) {
37101 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37102 .addReg(X86::RIP)
37103 .addImm(0)
37104 .addReg(0)
37105 .addMBB(restoreMBB)
37106 .addReg(0);
37107 } else {
37108 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37109 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37110 .addReg(XII->getGlobalBaseReg(MF))
37111 .addImm(0)
37112 .addReg(0)
37113 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37114 .addReg(0);
37115 }
37116 } else
37117 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37118 // Store IP
37119 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37120 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37121 if (i == X86::AddrDisp)
37122 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37123 else
37124 MIB.add(MI.getOperand(MemOpndSlot + i));
37125 }
37126 if (!UseImmLabel)
37127 MIB.addReg(LabelReg);
37128 else
37129 MIB.addMBB(restoreMBB);
37130 MIB.setMemRefs(MMOs);
37131
37132 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37133 emitSetJmpShadowStackFix(MI, thisMBB);
37134 }
37135
37136 // Setup
37137 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37138 .addMBB(restoreMBB);
37139
37140 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37141 MIB.addRegMask(RegInfo->getNoPreservedMask());
37142 thisMBB->addSuccessor(mainMBB);
37143 thisMBB->addSuccessor(restoreMBB);
37144
37145 // mainMBB:
37146 // EAX = 0
37147 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37148 mainMBB->addSuccessor(sinkMBB);
37149
37150 // sinkMBB:
37151 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37152 .addReg(mainDstReg)
37153 .addMBB(mainMBB)
37154 .addReg(restoreDstReg)
37155 .addMBB(restoreMBB);
37156
37157 // restoreMBB:
37158 if (RegInfo->hasBasePointer(*MF)) {
37159 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37160 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37161 X86FI->setRestoreBasePointer(MF);
37162 Register FramePtr = RegInfo->getFrameRegister(*MF);
37163 Register BasePtr = RegInfo->getBaseRegister();
37164 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37165 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37166 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37167 .setMIFlag(MachineInstr::FrameSetup);
37168 }
37169 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37170 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37171 restoreMBB->addSuccessor(sinkMBB);
37172
37173 MI.eraseFromParent();
37174 return sinkMBB;
37175}
37176
37177/// Fix the shadow stack using the previously saved SSP pointer.
37178/// \sa emitSetJmpShadowStackFix
37179/// \param [in] MI The temporary Machine Instruction for the builtin.
37180/// \param [in] MBB The Machine Basic Block that will be modified.
37181/// \return The sink MBB that will perform the future indirect branch.
37182MachineBasicBlock *
37183X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37184 MachineBasicBlock *MBB) const {
37185 const MIMetadata MIMD(MI);
37186 MachineFunction *MF = MBB->getParent();
37187 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37188 MachineRegisterInfo &MRI = MF->getRegInfo();
37189
37190 // Memory Reference
37191 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37192
37193 MVT PVT = getPointerTy(MF->getDataLayout());
37194 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37195
37196 // checkSspMBB:
37197 // xor vreg1, vreg1
37198 // rdssp vreg1
37199 // test vreg1, vreg1
37200 // je sinkMBB # Jump if Shadow Stack is not supported
37201 // fallMBB:
37202 // mov buf+24/12(%rip), vreg2
37203 // sub vreg1, vreg2
37204 // jbe sinkMBB # No need to fix the Shadow Stack
37205 // fixShadowMBB:
37206 // shr 3/2, vreg2
37207 // incssp vreg2 # fix the SSP according to the lower 8 bits
37208 // shr 8, vreg2
37209 // je sinkMBB
37210 // fixShadowLoopPrepareMBB:
37211 // shl vreg2
37212 // mov 128, vreg3
37213 // fixShadowLoopMBB:
37214 // incssp vreg3
37215 // dec vreg2
37216 // jne fixShadowLoopMBB # Iterate until you finish fixing
37217 // # the Shadow Stack
37218 // sinkMBB:
37219
37220 MachineFunction::iterator I = ++MBB->getIterator();
37221 const BasicBlock *BB = MBB->getBasicBlock();
37222
37223 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37224 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37225 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37226 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37227 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37228 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37229 MF->insert(I, checkSspMBB);
37230 MF->insert(I, fallMBB);
37231 MF->insert(I, fixShadowMBB);
37232 MF->insert(I, fixShadowLoopPrepareMBB);
37233 MF->insert(I, fixShadowLoopMBB);
37234 MF->insert(I, sinkMBB);
37235
37236 // Transfer the remainder of BB and its successor edges to sinkMBB.
37237 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37238 MBB->end());
37239 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37240
37241 MBB->addSuccessor(checkSspMBB);
37242
37243 // Initialize a register with zero.
37244 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37245 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37246
37247 if (PVT == MVT::i64) {
37248 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37249 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37250 .addImm(0)
37251 .addReg(ZReg)
37252 .addImm(X86::sub_32bit);
37253 ZReg = TmpZReg;
37254 }
37255
37256 // Read the current SSP Register value to the zeroed register.
37257 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37258 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37259 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37260
37261 // Check whether the result of the SSP register is zero and jump directly
37262 // to the sink.
37263 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37264 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37265 .addReg(SSPCopyReg)
37266 .addReg(SSPCopyReg);
37267 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37268 .addMBB(sinkMBB)
37269 .addImm(X86::COND_E);
37270 checkSspMBB->addSuccessor(sinkMBB);
37271 checkSspMBB->addSuccessor(fallMBB);
37272
37273 // Reload the previously saved SSP register value.
37274 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37275 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37276 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37277 MachineInstrBuilder MIB =
37278 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37279 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37280 const MachineOperand &MO = MI.getOperand(i);
37281 if (i == X86::AddrDisp)
37282 MIB.addDisp(MO, SPPOffset);
37283 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37284 // preserve kill flags.
37285 MIB.addReg(MO.getReg());
37286 else
37287 MIB.add(MO);
37288 }
37289 MIB.setMemRefs(MMOs);
37290
37291 // Subtract the current SSP from the previous SSP.
37292 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37293 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37294 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37295 .addReg(PrevSSPReg)
37296 .addReg(SSPCopyReg);
37297
37298 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37299 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37300 .addMBB(sinkMBB)
37301 .addImm(X86::COND_BE);
37302 fallMBB->addSuccessor(sinkMBB);
37303 fallMBB->addSuccessor(fixShadowMBB);
37304
37305 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37306 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37307 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37308 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37309 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37310 .addReg(SspSubReg)
37311 .addImm(Offset);
37312
37313 // Increase SSP when looking only on the lower 8 bits of the delta.
37314 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37315 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37316
37317 // Reset the lower 8 bits.
37318 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37319 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37320 .addReg(SspFirstShrReg)
37321 .addImm(8);
37322
37323 // Jump if the result of the shift is zero.
37324 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37325 .addMBB(sinkMBB)
37326 .addImm(X86::COND_E);
37327 fixShadowMBB->addSuccessor(sinkMBB);
37328 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37329
37330 // Do a single shift left.
37331 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37332 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37333 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37334 .addReg(SspSecondShrReg)
37335 .addImm(1);
37336
37337 // Save the value 128 to a register (will be used next with incssp).
37338 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37339 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37340 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37341 .addImm(128);
37342 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37343
37344 // Since incssp only looks at the lower 8 bits, we might need to do several
37345 // iterations of incssp until we finish fixing the shadow stack.
37346 Register DecReg = MRI.createVirtualRegister(PtrRC);
37347 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37348 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37349 .addReg(SspAfterShlReg)
37350 .addMBB(fixShadowLoopPrepareMBB)
37351 .addReg(DecReg)
37352 .addMBB(fixShadowLoopMBB);
37353
37354 // Every iteration we increase the SSP by 128.
37355 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37356
37357 // Every iteration we decrement the counter by 1.
37358 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37359 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37360
37361 // Jump if the counter is not zero yet.
37362 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37363 .addMBB(fixShadowLoopMBB)
37364 .addImm(X86::COND_NE);
37365 fixShadowLoopMBB->addSuccessor(sinkMBB);
37366 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37367
37368 return sinkMBB;
37369}
37370
37371MachineBasicBlock *
37372X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37373 MachineBasicBlock *MBB) const {
37374 const MIMetadata MIMD(MI);
37375 MachineFunction *MF = MBB->getParent();
37376 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37377 MachineRegisterInfo &MRI = MF->getRegInfo();
37378
37379 // Memory Reference
37380 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37381
37382 MVT PVT = getPointerTy(MF->getDataLayout());
37383 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37384 "Invalid Pointer Size!");
37385
37386 const TargetRegisterClass *RC =
37387 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37388 Register Tmp = MRI.createVirtualRegister(RC);
37389 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37390 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37391 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37392 Register SP = RegInfo->getStackRegister();
37393
37394 MachineInstrBuilder MIB;
37395
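// The slots read back here mirror what setjmp stored: slot 0 holds the frame
// pointer, slot 1 the resume address, and slot 2 the stack pointer.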
37396 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37397 const int64_t SPOffset = 2 * PVT.getStoreSize();
37398
37399 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37400 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37401
37402 MachineBasicBlock *thisMBB = MBB;
37403
37404 // When CET shadow stacks are enabled, we need to fix the shadow stack.
37405 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37406 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37407 }
37408
37409 // Reload FP
37410 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37411 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37412 const MachineOperand &MO = MI.getOperand(i);
37413 if (MO.isReg()) // Don't add the whole operand, we don't want to
37414 // preserve kill flags.
37415 MIB.addReg(MO.getReg());
37416 else
37417 MIB.add(MO);
37418 }
37419 MIB.setMemRefs(MMOs);
37421
37422 // Reload IP
37423 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37424 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37425 const MachineOperand &MO = MI.getOperand(i);
37426 if (i == X86::AddrDisp)
37427 MIB.addDisp(MO, LabelOffset);
37428 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37429 // preserve kill flags.
37430 MIB.addReg(MO.getReg());
37431 else
37432 MIB.add(MO);
37433 }
37434 MIB.setMemRefs(MMOs);
37435
37436 // Reload SP
37437 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37438 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37439 if (i == X86::AddrDisp)
37440 MIB.addDisp(MI.getOperand(i), SPOffset);
37441 else
37442 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37443 // the last instruction of the expansion.
37444 }
37445 MIB.setMemRefs(MMOs);
37447
37448 // Jump
37449 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37450
37451 MI.eraseFromParent();
37452 return thisMBB;
37453}
37454
37455void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37456 MachineBasicBlock *MBB,
37457 MachineBasicBlock *DispatchBB,
37458 int FI) const {
37459 const MIMetadata MIMD(MI);
37460 MachineFunction *MF = MBB->getParent();
37461 MachineRegisterInfo *MRI = &MF->getRegInfo();
37462 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37463
37464 MVT PVT = getPointerTy(MF->getDataLayout());
37465 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37466
37467 unsigned Op = 0;
37468 Register VR;
37469
37470 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37471 !isPositionIndependent();
37472
37473 if (UseImmLabel) {
37474 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37475 } else {
37476 const TargetRegisterClass *TRC =
37477 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37478 VR = MRI->createVirtualRegister(TRC);
37479 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37480
37481 if (Subtarget.is64Bit())
37482 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37483 .addReg(X86::RIP)
37484 .addImm(1)
37485 .addReg(0)
37486 .addMBB(DispatchBB)
37487 .addReg(0);
37488 else
37489 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37490 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37491 .addImm(1)
37492 .addReg(0)
37493 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37494 .addReg(0);
37495 }
37496
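// Store the dispatch block's address into the return slot of the SjLj
// function context (offset 56 on 64-bit targets, 36 on 32-bit, relative to
// frame index FI).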
37497 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37498 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37499 if (UseImmLabel)
37500 MIB.addMBB(DispatchBB);
37501 else
37502 MIB.addReg(VR);
37503}
37504
37505MachineBasicBlock *
37506X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37507 MachineBasicBlock *BB) const {
37508 const MIMetadata MIMD(MI);
37509 MachineFunction *MF = BB->getParent();
37510 MachineRegisterInfo *MRI = &MF->getRegInfo();
37511 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37512 int FI = MF->getFrameInfo().getFunctionContextIndex();
37513
37514 // Get a mapping of the call site numbers to all of the landing pads they're
37515 // associated with.
37516 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37517 unsigned MaxCSNum = 0;
37518 for (auto &MBB : *MF) {
37519 if (!MBB.isEHPad())
37520 continue;
37521
37522 MCSymbol *Sym = nullptr;
37523 for (const auto &MI : MBB) {
37524 if (MI.isDebugInstr())
37525 continue;
37526
37527 assert(MI.isEHLabel() && "expected EH_LABEL");
37528 Sym = MI.getOperand(0).getMCSymbol();
37529 break;
37530 }
37531
37532 if (!MF->hasCallSiteLandingPad(Sym))
37533 continue;
37534
37535 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37536 CallSiteNumToLPad[CSI].push_back(&MBB);
37537 MaxCSNum = std::max(MaxCSNum, CSI);
37538 }
37539 }
37540
37541 // Get an ordered list of the machine basic blocks for the jump table.
37542 std::vector<MachineBasicBlock *> LPadList;
37543 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37544 LPadList.reserve(CallSiteNumToLPad.size());
37545
37546 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37547 for (auto &LP : CallSiteNumToLPad[CSI]) {
37548 LPadList.push_back(LP);
37549 InvokeBBs.insert_range(LP->predecessors());
37550 }
37551 }
37552
37553 assert(!LPadList.empty() &&
37554 "No landing pad destinations for the dispatch jump table!");
37555
37556 // Create the MBBs for the dispatch code.
37557
37558 // Shove the dispatch's address into the return slot in the function context.
37559 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37560 DispatchBB->setIsEHPad(true);
37561
37562 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37563 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37564 DispatchBB->addSuccessor(TrapBB);
37565
37566 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37567 DispatchBB->addSuccessor(DispContBB);
37568
37569 // Insert MBBs.
37570 MF->push_back(DispatchBB);
37571 MF->push_back(DispContBB);
37572 MF->push_back(TrapBB);
37573
37574 // Insert code into the entry block that creates and registers the function
37575 // context.
37576 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37577
37578 // Create the jump table and associated information
37579 unsigned JTE = getJumpTableEncoding();
37580 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37581 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37582
37583 const X86RegisterInfo &RI = TII->getRegisterInfo();
37584 // Add a register mask with no preserved registers. This results in all
37585 // registers being marked as clobbered.
37586 if (RI.hasBasePointer(*MF)) {
37587 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37588 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37589 MFI->setRestoreBasePointer(MF);
37590
37591 Register FP = RI.getFrameRegister(*MF);
37592 Register BP = RI.getBaseRegister();
37593 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37594 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37595 MFI->getRestoreBasePointerOffset())
37596 .addRegMask(RI.getNoPreservedMask());
37597 } else {
37598 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37599 .addRegMask(RI.getNoPreservedMask());
37600 }
37601
37602 // IReg is used as an index in a memory operand and therefore can't be SP
37603 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
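// Load the call-site number that was recorded in the function context before
// the invoke; it indexes the landing-pad jump table built above.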
37604 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37605 Subtarget.is64Bit() ? 8 : 4);
37606 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37607 .addReg(IReg)
37608 .addImm(LPadList.size());
37609 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37610 .addMBB(TrapBB)
37611 .addImm(X86::COND_AE);
37612
37613 if (Subtarget.is64Bit()) {
37614 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37615 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37616
37617 // leaq .LJTI0_0(%rip), BReg
37618 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37619 .addReg(X86::RIP)
37620 .addImm(1)
37621 .addReg(0)
37622 .addJumpTableIndex(MJTI)
37623 .addReg(0);
37624 // movzx IReg64, IReg
37625 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37626 .addImm(0)
37627 .addReg(IReg)
37628 .addImm(X86::sub_32bit);
37629
37630 switch (JTE) {
37631 case MachineJumpTableInfo::EK_BlockAddress:
37632 // jmpq *(BReg,IReg64,8)
37633 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37634 .addReg(BReg)
37635 .addImm(8)
37636 .addReg(IReg64)
37637 .addImm(0)
37638 .addReg(0);
37639 break;
37640 case MachineJumpTableInfo::EK_LabelDifference32: {
37641 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37642 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37643 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37644
37645 // movl (BReg,IReg64,4), OReg
37646 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37647 .addReg(BReg)
37648 .addImm(4)
37649 .addReg(IReg64)
37650 .addImm(0)
37651 .addReg(0);
37652 // movsx OReg64, OReg
37653 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37654 .addReg(OReg);
37655 // addq BReg, OReg64, TReg
37656 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37657 .addReg(OReg64)
37658 .addReg(BReg);
37659 // jmpq *TReg
37660 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37661 break;
37662 }
37663 default:
37664 llvm_unreachable("Unexpected jump table encoding");
37665 }
37666 } else {
37667 // jmpl *.LJTI0_0(,IReg,4)
37668 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37669 .addReg(0)
37670 .addImm(4)
37671 .addReg(IReg)
37672 .addJumpTableIndex(MJTI)
37673 .addReg(0);
37674 }
37675
37676 // Add the jump table entries as successors to the MBB.
37677 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37678 for (auto &LP : LPadList)
37679 if (SeenMBBs.insert(LP).second)
37680 DispContBB->addSuccessor(LP);
37681
37682 // N.B. the order the invoke BBs are processed in doesn't matter here.
37683 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37684 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37685 for (MachineBasicBlock *MBB : InvokeBBs) {
37686 // Remove the landing pad successor from the invoke block and replace it
37687 // with the new dispatch block.
37688 // Keep a copy of Successors since it's modified inside the loop.
37689 SmallVector<MachineBasicBlock *, 4> Successors(MBB->succ_rbegin(),
37690 MBB->succ_rend());
37691 // FIXME: Avoid quadratic complexity.
37692 for (auto *MBBS : Successors) {
37693 if (MBBS->isEHPad()) {
37694 MBB->removeSuccessor(MBBS);
37695 MBBLPads.push_back(MBBS);
37696 }
37697 }
37698
37699 MBB->addSuccessor(DispatchBB);
37700
37701 // Find the invoke call and mark all of the callee-saved registers as
37702 // 'implicit defined' so that they're spilled. This prevents code from
37703 // moving instructions to before the EH block, where they will never be
37704 // executed.
37705 for (auto &II : reverse(*MBB)) {
37706 if (!II.isCall())
37707 continue;
37708
37709 DenseSet<Register> DefRegs;
37710 for (auto &MOp : II.operands())
37711 if (MOp.isReg())
37712 DefRegs.insert(MOp.getReg());
37713
37714 MachineInstrBuilder MIB(*MF, &II);
37715 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37716 Register Reg = SavedRegs[RegIdx];
37717 if (!DefRegs.contains(Reg))
37718 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37719 }
37720
37721 break;
37722 }
37723 }
37724
37725 // Mark all former landing pads as non-landing pads. The dispatch is the only
37726 // landing pad now.
37727 for (auto &LP : MBBLPads)
37728 LP->setIsEHPad(false);
37729
37730 // The instruction is gone now.
37731 MI.eraseFromParent();
37732 return BB;
37733}
37734
37735MachineBasicBlock *
37736X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37737 MachineBasicBlock *BB) const {
37738 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37739 // calls may require proper stack alignment.
37740 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37741 const MIMetadata MIMD(MI);
37742 MachineFunction &MF = *BB->getParent();
37743
37744 // Emit CALLSEQ_START right before the instruction.
37745 MF.getFrameInfo().setAdjustsStack(true);
37746 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37747 MachineInstrBuilder CallseqStart =
37748 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37749 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37750
37751 // Emit CALLSEQ_END right after the instruction.
37752 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37753 MachineInstrBuilder CallseqEnd =
37754 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37755 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37756
37757 return BB;
37758}
37759
37760MachineBasicBlock *
37761X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37762 MachineBasicBlock *BB) const {
37763 MachineFunction *MF = BB->getParent();
37764 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37765 const MIMetadata MIMD(MI);
37766
37767 auto TMMImmToTMMReg = [](unsigned Imm) {
37768 assert (Imm < 8 && "Illegal tmm index");
37769 return X86::TMM0 + Imm;
37770 };
37771 auto TMMImmToTMMPair = [](unsigned Imm) {
37772 assert(Imm < 8 && "Illegal tmm pair index.");
37773 return X86::TMM0_TMM1 + Imm / 2;
37774 };
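// Tile pair registers (TMM0_TMM1, TMM2_TMM3, ...) each cover two adjacent
// tiles, hence the division by two above.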
37775 switch (MI.getOpcode()) {
37776 default:
37777 llvm_unreachable("Unexpected instr type to insert");
37778 case X86::INDIRECT_THUNK_CALL32:
37779 case X86::INDIRECT_THUNK_CALL64:
37780 case X86::INDIRECT_THUNK_TCRETURN32:
37781 case X86::INDIRECT_THUNK_TCRETURN64:
37782 return EmitLoweredIndirectThunk(MI, BB);
37783 case X86::CATCHRET:
37784 return EmitLoweredCatchRet(MI, BB);
37785 case X86::SEG_ALLOCA_32:
37786 case X86::SEG_ALLOCA_64:
37787 return EmitLoweredSegAlloca(MI, BB);
37788 case X86::PROBED_ALLOCA_32:
37789 case X86::PROBED_ALLOCA_64:
37790 return EmitLoweredProbedAlloca(MI, BB);
37791 case X86::TLSCall_32:
37792 case X86::TLSCall_64:
37793 return EmitLoweredTLSCall(MI, BB);
37794 case X86::CMOV_FR16:
37795 case X86::CMOV_FR16X:
37796 case X86::CMOV_FR32:
37797 case X86::CMOV_FR32X:
37798 case X86::CMOV_FR64:
37799 case X86::CMOV_FR64X:
37800 case X86::CMOV_GR8:
37801 case X86::CMOV_GR16:
37802 case X86::CMOV_GR32:
37803 case X86::CMOV_RFP32:
37804 case X86::CMOV_RFP64:
37805 case X86::CMOV_RFP80:
37806 case X86::CMOV_VR64:
37807 case X86::CMOV_VR128:
37808 case X86::CMOV_VR128X:
37809 case X86::CMOV_VR256:
37810 case X86::CMOV_VR256X:
37811 case X86::CMOV_VR512:
37812 case X86::CMOV_VK1:
37813 case X86::CMOV_VK2:
37814 case X86::CMOV_VK4:
37815 case X86::CMOV_VK8:
37816 case X86::CMOV_VK16:
37817 case X86::CMOV_VK32:
37818 case X86::CMOV_VK64:
37819 return EmitLoweredSelect(MI, BB);
37820
37821 case X86::FP80_ADDr:
37822 case X86::FP80_ADDm32: {
37823 // Change the floating point control register to use double extended
37824 // precision when performing the addition.
37825 int OrigCWFrameIdx =
37826 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37827 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37828 OrigCWFrameIdx);
37829
37830 // Load the old value of the control word...
37831 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37832 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37833 OrigCWFrameIdx);
37834
37835 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37836 // precision.
37837 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37838 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37839 .addReg(OldCW, RegState::Kill)
37840 .addImm(0x300);
37841
37842 // Extract to 16 bits.
37843 Register NewCW16 =
37844 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37845 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37846 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37847
37848 // Prepare memory for FLDCW.
37849 int NewCWFrameIdx =
37850 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37851 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37852 NewCWFrameIdx)
37853 .addReg(NewCW16, RegState::Kill);
37854
37855 // Reload the modified control word now...
37856 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37857 NewCWFrameIdx);
37858
37859 // Do the addition.
37860 if (MI.getOpcode() == X86::FP80_ADDr) {
37861 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37862 .add(MI.getOperand(0))
37863 .add(MI.getOperand(1))
37864 .add(MI.getOperand(2));
37865 } else {
37866 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37867 .add(MI.getOperand(0))
37868 .add(MI.getOperand(1))
37869 .add(MI.getOperand(2))
37870 .add(MI.getOperand(3))
37871 .add(MI.getOperand(4))
37872 .add(MI.getOperand(5))
37873 .add(MI.getOperand(6));
37874 }
37875
37876 // Reload the original control word now.
37877 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37878 OrigCWFrameIdx);
37879
37880 MI.eraseFromParent(); // The pseudo instruction is gone now.
37881 return BB;
37882 }
37883
37884 case X86::FP32_TO_INT16_IN_MEM:
37885 case X86::FP32_TO_INT32_IN_MEM:
37886 case X86::FP32_TO_INT64_IN_MEM:
37887 case X86::FP64_TO_INT16_IN_MEM:
37888 case X86::FP64_TO_INT32_IN_MEM:
37889 case X86::FP64_TO_INT64_IN_MEM:
37890 case X86::FP80_TO_INT16_IN_MEM:
37891 case X86::FP80_TO_INT32_IN_MEM:
37892 case X86::FP80_TO_INT64_IN_MEM: {
37893 // Change the floating point control register to use "round towards zero"
37894 // mode when truncating to an integer value.
37895 int OrigCWFrameIdx =
37896 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37897 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37898 OrigCWFrameIdx);
37899
37900 // Load the old value of the control word...
37901 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37902 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37903 OrigCWFrameIdx);
37904
37905 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37906 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37907 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37908 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37909
37910 // Extract to 16 bits.
37911 Register NewCW16 =
37912 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37913 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37914 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37915
37916 // Prepare memory for FLDCW.
37917 int NewCWFrameIdx =
37918 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37919 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37920 NewCWFrameIdx)
37921 .addReg(NewCW16, RegState::Kill);
37922
37923 // Reload the modified control word now...
37924 addFrameReference(BuildMI(*BB, MI, MIMD,
37925 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37926
37927 // Get the X86 opcode to use.
37928 unsigned Opc;
37929 switch (MI.getOpcode()) {
37930 // clang-format off
37931 default: llvm_unreachable("illegal opcode!");
37932 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37933 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37934 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37935 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37936 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37937 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37938 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37939 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37940 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37941 // clang-format on
37942 }
37943
37944 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37945 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37946 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37947
37948 // Reload the original control word now.
37949 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37950 OrigCWFrameIdx);
37951
37952 MI.eraseFromParent(); // The pseudo instruction is gone now.
37953 return BB;
37954 }
37955
37956 // xbegin
37957 case X86::XBEGIN:
37958 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37959
37960 case X86::VAARG_64:
37961 case X86::VAARG_X32:
37962 return EmitVAARGWithCustomInserter(MI, BB);
37963
37964 case X86::EH_SjLj_SetJmp32:
37965 case X86::EH_SjLj_SetJmp64:
37966 return emitEHSjLjSetJmp(MI, BB);
37967
37968 case X86::EH_SjLj_LongJmp32:
37969 case X86::EH_SjLj_LongJmp64:
37970 return emitEHSjLjLongJmp(MI, BB);
37971
37972 case X86::Int_eh_sjlj_setup_dispatch:
37973 return EmitSjLjDispatchBlock(MI, BB);
37974
37975 case TargetOpcode::STATEPOINT:
37976 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37977 // this point in the process. We diverge later.
37978 return emitPatchPoint(MI, BB);
37979
37980 case TargetOpcode::STACKMAP:
37981 case TargetOpcode::PATCHPOINT:
37982 return emitPatchPoint(MI, BB);
37983
37984 case TargetOpcode::PATCHABLE_EVENT_CALL:
37985 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37986 return emitPatchableEventCall(MI, BB);
37987
37988 case X86::LCMPXCHG8B: {
37989 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37990 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
37991 // requires a memory operand. If the current target is i686 and the current
37992 // function needs a base pointer - which is ESI on i686 - the register
37993 // allocator would not be able to allocate registers for an address of the
37994 // form X(%reg, %reg, Y): there would never be enough unreserved registers
37995 // during regalloc (without the base pointer the only option would be
37996 // X(%edi, %esi, Y)).
37997 // We give the register allocator a hand by precomputing the address in a
37998 // new vreg using LEA.
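// (Illustration: an operand such as disp(%esi,%edi,4) is rewritten as
//    leal disp(%esi,%edi,4), %vreg ; cmpxchg8b (%vreg)
//  so CMPXCHG8B, whose encoding already ties up EAX/EBX/ECX/EDX, no longer
//  needs two additional GPRs for its address.)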
37999
38000 // If it is not i686 or there is no base pointer - nothing to do here.
38001 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38002 return BB;
38003
38004 // Even though this code does not necessarily need the base pointer to
38005 // be ESI, we check for that. The reason: if this assert fails, something
38006 // has changed in the compiler's base pointer handling, and it most
38007 // probably has to be addressed here as well.
38008 assert(TRI->getBaseRegister() == X86::ESI &&
38009 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38010 "base pointer in mind");
38011
38013 MVT SPTy = getPointerTy(MF->getDataLayout());
38014 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38015 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38016
38018 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38019 // does not use index register.
38020 if (AM.IndexReg == X86::NoRegister)
38021 return BB;
38022
38023 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38024 // four operand definitions that are E[ABCD] registers. We skip them and
38025 // then insert the LEA.
38026 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38027 while (RMBBI != BB->rend() &&
38028 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38029 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38030 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38031 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38032 ++RMBBI;
38033 }
38034 MachineBasicBlock::iterator MBBI(RMBBI);
38035 addFullAddress(
38036 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38037
38038 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38039
38040 return BB;
38041 }
38042 case X86::LCMPXCHG16B_NO_RBX: {
38043 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38044 Register BasePtr = TRI->getBaseRegister();
38045 if (TRI->hasBasePointer(*MF) &&
38046 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38047 if (!BB->isLiveIn(BasePtr))
38048 BB->addLiveIn(BasePtr);
38049 // Save RBX into a virtual register.
38050 Register SaveRBX =
38051 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38052 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38053 .addReg(X86::RBX);
38054 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38055 MachineInstrBuilder MIB =
38056 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38057 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38058 MIB.add(MI.getOperand(Idx));
38059 MIB.add(MI.getOperand(X86::AddrNumOperands));
38060 MIB.addReg(SaveRBX);
38061 } else {
38062 // Simple case, just copy the virtual register to RBX.
38063 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38064 .add(MI.getOperand(X86::AddrNumOperands));
38065 MachineInstrBuilder MIB =
38066 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38067 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38068 MIB.add(MI.getOperand(Idx));
38069 }
38070 MI.eraseFromParent();
38071 return BB;
38072 }
38073 case X86::MWAITX: {
38074 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38075 Register BasePtr = TRI->getBaseRegister();
38076 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38077 // If there is no need to save the base pointer, we generate MWAITXrrr;
38078 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38079 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38080 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38081 .addReg(MI.getOperand(0).getReg());
38082 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38083 .addReg(MI.getOperand(1).getReg());
38084 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38085 .addReg(MI.getOperand(2).getReg());
38086 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38087 MI.eraseFromParent();
38088 } else {
38089 if (!BB->isLiveIn(BasePtr)) {
38090 BB->addLiveIn(BasePtr);
38091 }
38092 // Parameters can be copied into ECX and EAX but not EBX yet.
38093 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38094 .addReg(MI.getOperand(0).getReg());
38095 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38096 .addReg(MI.getOperand(1).getReg());
38097 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38098 // Save RBX into a virtual register.
38099 Register SaveRBX =
38100 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38101 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38102 .addReg(X86::RBX);
38103 // Generate mwaitx pseudo.
38104 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38105 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38106 .addDef(Dst) // Destination tied in with SaveRBX.
38107 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38108 .addUse(SaveRBX); // Save of base pointer.
38109 MI.eraseFromParent();
38110 }
38111 return BB;
38112 }
38113 case TargetOpcode::PREALLOCATED_SETUP: {
38114 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38115 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38116 MFI->setHasPreallocatedCall(true);
38117 int64_t PreallocatedId = MI.getOperand(0).getImm();
38118 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38119 assert(StackAdjustment != 0 && "0 stack adjustment");
38120 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38121 << StackAdjustment << "\n");
38122 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38123 .addReg(X86::ESP)
38124 .addImm(StackAdjustment);
38125 MI.eraseFromParent();
38126 return BB;
38127 }
38128 case TargetOpcode::PREALLOCATED_ARG: {
38129 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38130 int64_t PreallocatedId = MI.getOperand(1).getImm();
38131 int64_t ArgIdx = MI.getOperand(2).getImm();
38132 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38133 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38134 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38135 << ", arg offset " << ArgOffset << "\n");
38136 // stack pointer + offset
38137 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38138 MI.getOperand(0).getReg()),
38139 X86::ESP, false, ArgOffset);
38140 MI.eraseFromParent();
38141 return BB;
38142 }
38143 case X86::PTDPBSSD:
38144 case X86::PTDPBSUD:
38145 case X86::PTDPBUSD:
38146 case X86::PTDPBUUD:
38147 case X86::PTDPBF16PS:
38148 case X86::PTDPFP16PS:
38149 case X86::PTCMMIMFP16PS:
38150 case X86::PTCMMRLFP16PS:
38151 case X86::PTDPBF8PS:
38152 case X86::PTDPBHF8PS:
38153 case X86::PTDPHBF8PS:
38154 case X86::PTDPHF8PS:
38155 case X86::PTTDPBF16PS:
38156 case X86::PTTDPFP16PS:
38157 case X86::PTTCMMIMFP16PS:
38158 case X86::PTTCMMRLFP16PS:
38159 case X86::PTCONJTCMMIMFP16PS:
38160 case X86::PTMMULTF32PS:
38161 case X86::PTTMMULTF32PS: {
38162 unsigned Opc;
38163 switch (MI.getOpcode()) {
38164 default: llvm_unreachable("illegal opcode!");
38165 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38166 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38167 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38168 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38169 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38170 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38171 case X86::PTCMMIMFP16PS:
38172 Opc = X86::TCMMIMFP16PS;
38173 break;
38174 case X86::PTCMMRLFP16PS:
38175 Opc = X86::TCMMRLFP16PS;
38176 break;
38177 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38178 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38179 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38180 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38181 case X86::PTTDPBF16PS:
38182 Opc = X86::TTDPBF16PS;
38183 break;
38184 case X86::PTTDPFP16PS:
38185 Opc = X86::TTDPFP16PS;
38186 break;
38187 case X86::PTTCMMIMFP16PS:
38188 Opc = X86::TTCMMIMFP16PS;
38189 break;
38190 case X86::PTTCMMRLFP16PS:
38191 Opc = X86::TTCMMRLFP16PS;
38192 break;
38193 case X86::PTCONJTCMMIMFP16PS:
38194 Opc = X86::TCONJTCMMIMFP16PS;
38195 break;
38196 case X86::PTMMULTF32PS:
38197 Opc = X86::TMMULTF32PS;
38198 break;
38199 case X86::PTTMMULTF32PS:
38200 Opc = X86::TTMMULTF32PS;
38201 break;
38202 }
38203
38204 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38205 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38206 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38207 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38208 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38209
38210 MI.eraseFromParent(); // The pseudo is gone now.
38211 return BB;
38212 }
38213 case X86::PTILEZERO: {
38214 unsigned Imm = MI.getOperand(0).getImm();
38215 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38216 MI.eraseFromParent(); // The pseudo is gone now.
38217 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38218 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38219 return BB;
38220 }
38221 case X86::PTILEZEROV: {
38222 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38223 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38224 return BB;
38225 }
38226 case X86::PTILELOADDRS:
38227 case X86::PTILELOADDRST1:
38228 case X86::PTILELOADD:
38229 case X86::PTILELOADDT1:
38230 case X86::PTILESTORED: {
38231 unsigned Opc;
38232 switch (MI.getOpcode()) {
38233 default: llvm_unreachable("illegal opcode!");
38234#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38235 case X86::PTILELOADD:
38236 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38237 break;
38238 case X86::PTILELOADDT1:
38239 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38240 break;
38241 case X86::PTILESTORED:
38242 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38243 break;
38244 case X86::PTILELOADDRS:
38245 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38246 break;
38247 case X86::PTILELOADDRST1:
38248 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38249 break;
38250 }
38251#undef GET_EGPR_IF_ENABLED
38252
38253 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38254 unsigned CurOp = 0;
38255 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38256 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38257 RegState::Define);
38258
38259 MIB.add(MI.getOperand(CurOp++)); // base
38260 MIB.add(MI.getOperand(CurOp++)); // scale
38261 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38262 MIB.add(MI.getOperand(CurOp++)); // displacement
38263 MIB.add(MI.getOperand(CurOp++)); // segment
38264
38265 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38266 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38267 RegState::Undef);
38268
38269 MI.eraseFromParent(); // The pseudo is gone now.
38270 return BB;
38271 }
38272 case X86::PT2RPNTLVWZ0:
38273 case X86::PT2RPNTLVWZ0T1:
38274 case X86::PT2RPNTLVWZ1:
38275 case X86::PT2RPNTLVWZ1T1:
38276 case X86::PT2RPNTLVWZ0RS:
38277 case X86::PT2RPNTLVWZ0RST1:
38278 case X86::PT2RPNTLVWZ1RS:
38279 case X86::PT2RPNTLVWZ1RST1: {
38280 const DebugLoc &DL = MI.getDebugLoc();
38281 unsigned Opc;
38282#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38283 switch (MI.getOpcode()) {
38284 default:
38285 llvm_unreachable("Unexpected instruction!");
38286 case X86::PT2RPNTLVWZ0:
38287 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38288 break;
38289 case X86::PT2RPNTLVWZ0T1:
38290 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38291 break;
38292 case X86::PT2RPNTLVWZ1:
38293 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38294 break;
38295 case X86::PT2RPNTLVWZ1T1:
38296 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38297 break;
38298 case X86::PT2RPNTLVWZ0RS:
38299 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38300 break;
38301 case X86::PT2RPNTLVWZ0RST1:
38302 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38303 break;
38304 case X86::PT2RPNTLVWZ1RS:
38305 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38306 break;
38307 case X86::PT2RPNTLVWZ1RST1:
38308 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38309 break;
38310 }
38311#undef GET_EGPR_IF_ENABLED
38312 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38313 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38314
38315 MIB.add(MI.getOperand(1)); // base
38316 MIB.add(MI.getOperand(2)); // scale
38317 MIB.add(MI.getOperand(3)); // index
38318 MIB.add(MI.getOperand(4)); // displacement
38319 MIB.add(MI.getOperand(5)); // segment
38320 MI.eraseFromParent(); // The pseudo is gone now.
38321 return BB;
38322 }
38323 case X86::PTTRANSPOSED:
38324 case X86::PTCONJTFP16: {
38325 const DebugLoc &DL = MI.getDebugLoc();
38326 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38327 : X86::TCONJTFP16;
38328
38329 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38330 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38331 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38332
38333 MI.eraseFromParent(); // The pseudo is gone now.
38334 return BB;
38335 }
38336 case X86::PTCVTROWPS2BF16Hrri:
38337 case X86::PTCVTROWPS2BF16Lrri:
38338 case X86::PTCVTROWPS2PHHrri:
38339 case X86::PTCVTROWPS2PHLrri:
38340 case X86::PTCVTROWD2PSrri:
38341 case X86::PTILEMOVROWrri: {
38342 const DebugLoc &DL = MI.getDebugLoc();
38343 unsigned Opc;
38344 switch (MI.getOpcode()) {
38345 default:
38346 llvm_unreachable("Unexpected instruction!");
38347 case X86::PTCVTROWD2PSrri:
38348 Opc = X86::TCVTROWD2PSrri;
38349 break;
38350 case X86::PTCVTROWPS2BF16Hrri:
38351 Opc = X86::TCVTROWPS2BF16Hrri;
38352 break;
38353 case X86::PTCVTROWPS2PHHrri:
38354 Opc = X86::TCVTROWPS2PHHrri;
38355 break;
38356 case X86::PTCVTROWPS2BF16Lrri:
38357 Opc = X86::TCVTROWPS2BF16Lrri;
38358 break;
38359 case X86::PTCVTROWPS2PHLrri:
38360 Opc = X86::TCVTROWPS2PHLrri;
38361 break;
38362 case X86::PTILEMOVROWrri:
38363 Opc = X86::TILEMOVROWrri;
38364 break;
38365 }
38366 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38367 MIB.add(MI.getOperand(0));
38368 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38369 MIB.addImm(MI.getOperand(2).getImm());
38370
38371 MI.eraseFromParent(); // The pseudo is gone now.
38372 return BB;
38373 }
38374 case X86::PTCVTROWPS2BF16Hrre:
38375 case X86::PTCVTROWPS2BF16Lrre:
38376 case X86::PTCVTROWPS2PHHrre:
38377 case X86::PTCVTROWPS2PHLrre:
38378 case X86::PTCVTROWD2PSrre:
38379 case X86::PTILEMOVROWrre: {
38380 const DebugLoc &DL = MI.getDebugLoc();
38381 unsigned Opc;
38382 switch (MI.getOpcode()) {
38383 default:
38384 llvm_unreachable("Unexpected instruction!");
38385 case X86::PTCVTROWD2PSrre:
38386 Opc = X86::TCVTROWD2PSrre;
38387 break;
38388 case X86::PTCVTROWPS2BF16Hrre:
38389 Opc = X86::TCVTROWPS2BF16Hrre;
38390 break;
38391 case X86::PTCVTROWPS2BF16Lrre:
38392 Opc = X86::TCVTROWPS2BF16Lrre;
38393 break;
38394 case X86::PTCVTROWPS2PHHrre:
38395 Opc = X86::TCVTROWPS2PHHrre;
38396 break;
38397 case X86::PTCVTROWPS2PHLrre:
38398 Opc = X86::TCVTROWPS2PHLrre;
38399 break;
38400 case X86::PTILEMOVROWrre:
38401 Opc = X86::TILEMOVROWrre;
38402 break;
38403 }
38404 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38405 MIB.add(MI.getOperand(0));
38406 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38407 MIB.add(MI.getOperand(2));
38408
38409 MI.eraseFromParent(); // The pseudo is gone now.
38410 return BB;
38411 }
38412 }
38413}
38414
38415//===----------------------------------------------------------------------===//
38416// X86 Optimization Hooks
38417//===----------------------------------------------------------------------===//
38418
38419 bool
38420 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38421 const APInt &DemandedBits,
38422 const APInt &DemandedElts,
38423 TargetLoweringOpt &TLO) const {
38424 EVT VT = Op.getValueType();
38425 unsigned Opcode = Op.getOpcode();
38426 unsigned EltSize = VT.getScalarSizeInBits();
38427
38428 if (VT.isVector()) {
38429 // If the constant is all sign bits within the active bits, then we should
38430 // extend it to the entire constant to allow it to act as a boolean constant
38431 // vector.
38432 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38433 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38434 return false;
38435 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38436 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38437 continue;
38438 const APInt &Val = V.getConstantOperandAPInt(i);
38439 if (Val.getBitWidth() > Val.getNumSignBits() &&
38440 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38441 return true;
38442 }
38443 return false;
38444 };
38445 // For vectors - if we have a constant, then try to sign extend.
38446 // TODO: Handle AND cases.
38447 unsigned ActiveBits = DemandedBits.getActiveBits();
38448 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38449 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38450 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38451 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38452 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38453 VT.getVectorNumElements());
38454 SDValue NewC =
38455 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38456 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38457 SDValue NewOp =
38458 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38459 return TLO.CombineTo(Op, NewOp);
38460 }
38461 return false;
38462 }
38463
38464 // Only optimize Ands to prevent shrinking a constant that could be
38465 // matched by movzx.
38466 if (Opcode != ISD::AND)
38467 return false;
38468
38469 // Make sure the RHS really is a constant.
38470 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38471 if (!C)
38472 return false;
38473
38474 const APInt &Mask = C->getAPIntValue();
38475
38476 // Clear all non-demanded bits initially.
38477 APInt ShrunkMask = Mask & DemandedBits;
38478
38479 // Find the width of the shrunk mask.
38480 unsigned Width = ShrunkMask.getActiveBits();
38481
38482 // If the mask is all 0s there's nothing to do here.
38483 if (Width == 0)
38484 return false;
38485
38486 // Find the next power of 2 width, rounding up to a byte.
38487 Width = llvm::bit_ceil(std::max(Width, 8U));
38488 // Truncate the width to size to handle illegal types.
38489 Width = std::min(Width, EltSize);
38490
38491 // Calculate a possible zero extend mask for this constant.
38492 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
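// (Worked example: for a 32-bit AND with Mask = 0x1FF where only bits 0..8
//  are demanded, Width rounds up to 16 and ZeroExtendMask becomes 0xFFFF,
//  which the 16-bit movzx pattern can match.)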
38493
38494 // If we aren't changing the mask, just return true to keep it and prevent
38495 // the caller from optimizing.
38496 if (ZeroExtendMask == Mask)
38497 return true;
38498
38499 // Make sure the new mask can be represented by a combination of mask bits
38500 // and non-demanded bits.
38501 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38502 return false;
38503
38504 // Replace the constant with the zero extend mask.
38505 SDLoc DL(Op);
38506 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38507 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38508 return TLO.CombineTo(Op, NewOp);
38509}
38510
38511 static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38512 KnownBits &Known,
38513 const APInt &DemandedElts,
38514 const SelectionDAG &DAG, unsigned Depth) {
38515 KnownBits Known2;
38516 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38517 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38518 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38519 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38520 Known = KnownBits::abdu(Known, Known2).zext(16);
38521 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
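// (PSADBW sums the absolute differences of eight i8 pairs into each i64
//  lane; adding the abdu estimate to itself three times conservatively
//  models that 8-way reduction tree.)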
38522 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38523 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38524 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38525 Known = Known.zext(64);
38526}
38527
38528 static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38529 KnownBits &Known,
38530 const APInt &DemandedElts,
38531 const SelectionDAG &DAG,
38532 unsigned Depth) {
38533 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38534
38535 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
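// (i.e. per i32 result lane: res[i] = sext(LHS[2*i]) * sext(RHS[2*i]) +
//  sext(LHS[2*i+1]) * sext(RHS[2*i+1]).)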
38536 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38537 APInt DemandedLoElts =
38538 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38539 APInt DemandedHiElts =
38540 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38541 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38542 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38543 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38544 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38545 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38546 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38547 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38548}
38549
38550 static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38551 KnownBits &Known,
38552 const APInt &DemandedElts,
38553 const SelectionDAG &DAG,
38554 unsigned Depth) {
38555 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38556
38557 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38558 // pairs.
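// (i.e. per i16 result lane: res[i] = sat_i16(zext(LHS[2*i]) * sext(RHS[2*i]) +
//  zext(LHS[2*i+1]) * sext(RHS[2*i+1])) -- LHS bytes are unsigned, RHS signed.)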
38559 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38560 APInt DemandedLoElts =
38561 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38562 APInt DemandedHiElts =
38563 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38564 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38565 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38566 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38567 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38568 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38569 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38570 Known = KnownBits::sadd_sat(Lo, Hi);
38571}
38572
38573 static KnownBits computeKnownBitsForHorizontalOperation(
38574 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38575 const SelectionDAG &DAG,
38576 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38577 KnownBitsFunc) {
38578 APInt DemandedEltsLHS, DemandedEltsRHS;
38579 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38580 DemandedElts, DemandedEltsLHS,
38581 DemandedEltsRHS);
38582
38583 const auto ComputeForSingleOpFunc =
38584 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38585 return KnownBitsFunc(
38586 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38587 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38588 };
38589
38590 if (DemandedEltsRHS.isZero())
38591 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38592 if (DemandedEltsLHS.isZero())
38593 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38594
38595 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38596 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38597}
38598
38599 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38600 KnownBits &Known,
38601 const APInt &DemandedElts,
38602 const SelectionDAG &DAG,
38603 unsigned Depth) const {
38604 unsigned BitWidth = Known.getBitWidth();
38605 unsigned NumElts = DemandedElts.getBitWidth();
38606 unsigned Opc = Op.getOpcode();
38607 EVT VT = Op.getValueType();
38608 assert((Opc >= ISD::BUILTIN_OP_END ||
38609 Opc == ISD::INTRINSIC_WO_CHAIN ||
38610 Opc == ISD::INTRINSIC_W_CHAIN ||
38611 Opc == ISD::INTRINSIC_VOID) &&
38612 "Should use MaskedValueIsZero if you don't know whether Op"
38613 " is a target node!");
38614
38615 Known.resetAll();
38616 switch (Opc) {
38617 default: break;
38618 case X86ISD::MUL_IMM: {
38619 KnownBits Known2;
38620 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38621 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38622 Known = KnownBits::mul(Known, Known2);
38623 break;
38624 }
38625 case X86ISD::BSF: {
38626 Known.Zero.setBitsFrom(Log2_32_Ceil(BitWidth));
38627
38628 KnownBits Known2;
38629 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38630 if (Known2.isNonZero()) {
38631 // If we have a known 1, its position is our upper bound.
38632 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38633 unsigned LowBits = llvm::bit_width(PossibleTZ);
38634 Known.Zero.setBitsFrom(LowBits);
38635 } else if (!Op.getOperand(0).isUndef()) {
38636 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38637 Known = Known.intersectWith(Known2);
38638 }
38639 break;
38640 }
38641 case X86ISD::BSR: {
38642 // TODO: Bound with input known bits?
38643 Known.Zero.setBitsFrom(Log2_32_Ceil(BitWidth));
38644
38645 if (!Op.getOperand(0).isUndef() &&
38646 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38647 KnownBits Known2;
38648 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38649 Known = Known.intersectWith(Known2);
38650 }
38651 break;
38652 }
38653 case X86ISD::SETCC:
38654 Known.Zero.setBitsFrom(1);
38655 break;
38656 case X86ISD::MOVMSK: {
38657 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38658 Known.Zero.setBitsFrom(NumLoBits);
38659 break;
38660 }
38661 case X86ISD::PEXTRB:
38662 case X86ISD::PEXTRW: {
38663 SDValue Src = Op.getOperand(0);
38664 EVT SrcVT = Src.getValueType();
38665 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38666 Op.getConstantOperandVal(1));
38667 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38668 Known = Known.anyextOrTrunc(BitWidth);
38669 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38670 break;
38671 }
38672 case X86ISD::VSRAI:
38673 case X86ISD::VSHLI:
38674 case X86ISD::VSRLI: {
38675 unsigned ShAmt = Op.getConstantOperandVal(1);
38676 if (ShAmt >= VT.getScalarSizeInBits()) {
38677 // Out of range logical bit shifts are guaranteed to be zero.
38678 // Out of range arithmetic bit shifts splat the sign bit.
38679 if (Opc != X86ISD::VSRAI) {
38680 Known.setAllZero();
38681 break;
38682 }
38683
38684 ShAmt = VT.getScalarSizeInBits() - 1;
38685 }
38686
38687 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38688 if (Opc == X86ISD::VSHLI) {
38689 Known <<= ShAmt;
38690 // Low bits are known zero.
38691 Known.Zero.setLowBits(ShAmt);
38692 } else if (Opc == X86ISD::VSRLI) {
38693 Known >>= ShAmt;
38694 // High bits are known zero.
38695 Known.Zero.setHighBits(ShAmt);
38696 } else {
38697 Known.Zero.ashrInPlace(ShAmt);
38698 Known.One.ashrInPlace(ShAmt);
38699 }
38700 break;
38701 }
38702 case X86ISD::PACKUS: {
38703 // PACKUS is just a truncation if the upper half is zero.
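// (PACKUSWB/PACKUSDW narrow each element to half width with unsigned
//  saturation; when the discarded upper half is known zero no saturation
//  can occur, so the packing behaves as a plain truncation.)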
38704 APInt DemandedLHS, DemandedRHS;
38705 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38706
38707 Known.One = APInt::getAllOnes(BitWidth * 2);
38708 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38709
38710 KnownBits Known2;
38711 if (!!DemandedLHS) {
38712 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38713 Known = Known.intersectWith(Known2);
38714 }
38715 if (!!DemandedRHS) {
38716 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38717 Known = Known.intersectWith(Known2);
38718 }
38719
38720 if (Known.countMinLeadingZeros() < BitWidth)
38721 Known.resetAll();
38722 Known = Known.trunc(BitWidth);
38723 break;
38724 }
38725 case X86ISD::PSHUFB: {
38726 SDValue Src = Op.getOperand(0);
38727 SDValue Idx = Op.getOperand(1);
38728
38729 // If the index vector is never negative (MSB is zero), then all elements
38730 // come from the source vector. This is useful for cases where
38731 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38732 // below will handle the more common constant shuffle mask case.
38733 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38734 if (KnownIdx.isNonNegative())
38735 Known = DAG.computeKnownBits(Src, Depth + 1);
38736 break;
38737 }
38738 case X86ISD::VBROADCAST: {
38739 SDValue Src = Op.getOperand(0);
38740 if (!Src.getSimpleValueType().isVector()) {
38741 Known = DAG.computeKnownBits(Src, Depth + 1);
38742 return;
38743 }
38744 break;
38745 }
38746 case X86ISD::AND: {
38747 if (Op.getResNo() == 0) {
38748 KnownBits Known2;
38749 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38750 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38751 Known &= Known2;
38752 }
38753 break;
38754 }
38755 case X86ISD::ANDNP: {
38756 KnownBits Known2;
38757 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38758 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38759
38760 // ANDNP = (~X & Y);
38761 Known.One &= Known2.Zero;
38762 Known.Zero |= Known2.One;
38763 break;
38764 }
38765 case X86ISD::FOR: {
38766 KnownBits Known2;
38767 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38768 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38769
38770 Known |= Known2;
38771 break;
38772 }
38773 case X86ISD::PSADBW: {
38774 SDValue LHS = Op.getOperand(0);
38775 SDValue RHS = Op.getOperand(1);
38776 assert(VT.getScalarType() == MVT::i64 &&
38777 LHS.getValueType() == RHS.getValueType() &&
38778 LHS.getValueType().getScalarType() == MVT::i8 &&
38779 "Unexpected PSADBW types");
38780 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38781 break;
38782 }
38783 case X86ISD::PCMPGT:
38784 case X86ISD::PCMPEQ: {
38785 KnownBits KnownLhs =
38786 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38787 KnownBits KnownRhs =
38788 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38789 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38790 ? KnownBits::eq(KnownLhs, KnownRhs)
38791 : KnownBits::sgt(KnownLhs, KnownRhs);
38792 if (Res) {
38793 if (*Res)
38794 Known.setAllOnes();
38795 else
38796 Known.setAllZero();
38797 }
38798 break;
38799 }
38800 case X86ISD::VPMADDWD: {
38801 SDValue LHS = Op.getOperand(0);
38802 SDValue RHS = Op.getOperand(1);
38803 assert(VT.getVectorElementType() == MVT::i32 &&
38804 LHS.getValueType() == RHS.getValueType() &&
38805 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38806 "Unexpected PMADDWD types");
38807 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38808 break;
38809 }
38810 case X86ISD::VPMADDUBSW: {
38811 SDValue LHS = Op.getOperand(0);
38812 SDValue RHS = Op.getOperand(1);
38813 assert(VT.getVectorElementType() == MVT::i16 &&
38814 LHS.getValueType() == RHS.getValueType() &&
38815 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38816 "Unexpected PMADDUBSW types");
38817 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38818 break;
38819 }
38820 case X86ISD::PMULUDQ: {
38821 KnownBits Known2;
38822 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38823 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38824
38825 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38826 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38827 Known = KnownBits::mul(Known, Known2);
38828 break;
38829 }
38830 case X86ISD::CMOV: {
38831 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38832 // If we don't know any bits, early out.
38833 if (Known.isUnknown())
38834 break;
38835 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38836
38837 // Only known if known in both the LHS and RHS.
38838 Known = Known.intersectWith(Known2);
38839 break;
38840 }
38841 case X86ISD::BEXTR:
38842 case X86ISD::BEXTRI: {
38843 SDValue Op0 = Op.getOperand(0);
38844 SDValue Op1 = Op.getOperand(1);
38845
38846 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38847 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38848 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
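// (BEXTR's control operand encodes the start bit in bits 7:0 and the field
//  length in bits 15:8; e.g. a control value of 0x0804 extracts 8 bits
//  starting at bit 4.)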
38849
38850 // If the length is 0, the result is 0.
38851 if (Length == 0) {
38852 Known.setAllZero();
38853 break;
38854 }
38855
38856 if ((Shift + Length) <= BitWidth) {
38857 Known = DAG.computeKnownBits(Op0, Depth + 1);
38858 Known = Known.extractBits(Length, Shift);
38859 Known = Known.zextOrTrunc(BitWidth);
38860 }
38861 }
38862 break;
38863 }
38864 case X86ISD::PDEP: {
38865 KnownBits Known2;
38866 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38867 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38868 // Zeros are retained from the mask operand. But not ones.
38869 Known.One.clearAllBits();
38870 // The result will have at least as many trailing zeros as the non-mask
38871 // operand since bits can only map to the same or higher bit position.
38872 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38873 break;
38874 }
38875 case X86ISD::PEXT: {
38876 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38877 // The result has as many leading zeros as the number of zeroes in the mask.
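// (PEXT gathers the source bits selected by the mask into the low bits of
//  the result, so a mask with N set bits can produce at most an N-bit value.)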
38878 unsigned Count = Known.Zero.popcount();
38879 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38880 Known.One.clearAllBits();
38881 break;
38882 }
38883 case X86ISD::VTRUNC:
38884 case X86ISD::VTRUNCS:
38885 case X86ISD::VTRUNCUS:
38886 case X86ISD::CVTSI2P:
38887 case X86ISD::CVTUI2P:
38888 case X86ISD::CVTP2SI:
38889 case X86ISD::CVTP2UI:
38890 case X86ISD::MCVTP2SI:
38891 case X86ISD::MCVTP2UI:
38892 case X86ISD::CVTTP2SI:
38893 case X86ISD::CVTTP2UI:
38894 case X86ISD::MCVTTP2SI:
38895 case X86ISD::MCVTTP2UI:
38896 case X86ISD::MCVTSI2P:
38897 case X86ISD::MCVTUI2P:
38898 case X86ISD::VFPROUND:
38899 case X86ISD::VMFPROUND:
38900 case X86ISD::CVTPS2PH:
38901 case X86ISD::MCVTPS2PH:
38902 case X86ISD::MCVTTP2SIS:
38903 case X86ISD::MCVTTP2UIS: {
38904 // Truncations/Conversions - upper elements are known zero.
38905 EVT SrcVT = Op.getOperand(0).getValueType();
38906 if (SrcVT.isVector()) {
38907 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38908 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38909 Known.setAllZero();
38910 }
38911 break;
38912 }
38913 case X86ISD::STRICT_CVTTP2SI:
38914 case X86ISD::STRICT_CVTTP2UI:
38915 case X86ISD::STRICT_CVTSI2P:
38916 case X86ISD::STRICT_CVTUI2P:
38917 case X86ISD::STRICT_CVTPS2PH:
38918 case X86ISD::STRICT_VFPROUND: {
38919 // Strict Conversions - upper elements are known zero.
38920 EVT SrcVT = Op.getOperand(1).getValueType();
38921 if (SrcVT.isVector()) {
38922 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38923 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38924 Known.setAllZero();
38925 }
38926 break;
38927 }
38928 case X86ISD::MOVQ2DQ: {
38929 // Move from MMX to XMM. Upper half of XMM should be 0.
38930 if (DemandedElts.countr_zero() >= (NumElts / 2))
38931 Known.setAllZero();
38932 break;
38933 }
38934 case X86ISD::VBROADCAST_LOAD: {
38935 APInt UndefElts;
38936 SmallVector<APInt, 16> EltBits;
38937 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38938 /*AllowWholeUndefs*/ false,
38939 /*AllowPartialUndefs*/ false)) {
38940 Known.Zero.setAllBits();
38941 Known.One.setAllBits();
38942 for (unsigned I = 0; I != NumElts; ++I) {
38943 if (!DemandedElts[I])
38944 continue;
38945 if (UndefElts[I]) {
38946 Known.resetAll();
38947 break;
38948 }
38949 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38950 Known = Known.intersectWith(Known2);
38951 }
38952 return;
38953 }
38954 break;
38955 }
38956 case X86ISD::HADD:
38957 case X86ISD::HSUB: {
38958 Known = computeKnownBitsForHorizontalOperation(
38959 Op, DemandedElts, Depth, DAG,
38960 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38961 return KnownBits::computeForAddSub(
38962 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38963 KnownLHS, KnownRHS);
38964 });
38965 break;
38966 }
38967 case ISD::INTRINSIC_WO_CHAIN: {
38968 switch (Op->getConstantOperandVal(0)) {
38969 case Intrinsic::x86_sse2_pmadd_wd:
38970 case Intrinsic::x86_avx2_pmadd_wd:
38971 case Intrinsic::x86_avx512_pmaddw_d_512: {
38972 SDValue LHS = Op.getOperand(1);
38973 SDValue RHS = Op.getOperand(2);
38974 assert(VT.getScalarType() == MVT::i32 &&
38975 LHS.getValueType() == RHS.getValueType() &&
38976 LHS.getValueType().getScalarType() == MVT::i16 &&
38977 "Unexpected PMADDWD types");
38978 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38979 break;
38980 }
38981 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38982 case Intrinsic::x86_avx2_pmadd_ub_sw:
38983 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38984 SDValue LHS = Op.getOperand(1);
38985 SDValue RHS = Op.getOperand(2);
38986 assert(VT.getScalarType() == MVT::i16 &&
38987 LHS.getValueType() == RHS.getValueType() &&
38988 LHS.getValueType().getScalarType() == MVT::i8 &&
38989 "Unexpected PMADDUBSW types");
38990 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38991 break;
38992 }
38993 case Intrinsic::x86_sse2_psad_bw:
38994 case Intrinsic::x86_avx2_psad_bw:
38995 case Intrinsic::x86_avx512_psad_bw_512: {
38996 SDValue LHS = Op.getOperand(1);
38997 SDValue RHS = Op.getOperand(2);
38998 assert(VT.getScalarType() == MVT::i64 &&
38999 LHS.getValueType() == RHS.getValueType() &&
39000 LHS.getValueType().getScalarType() == MVT::i8 &&
39001 "Unexpected PSADBW types");
39002 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39003 break;
39004 }
39005 }
39006 break;
39007 }
39008 }
39009
39010 // Handle target shuffles.
39011 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39012 if (isTargetShuffle(Opc)) {
39013 SmallVector<int, 64> Mask;
39014 SmallVector<SDValue, 2> Ops;
39015 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39016 unsigned NumOps = Ops.size();
39017 unsigned NumElts = VT.getVectorNumElements();
39018 if (Mask.size() == NumElts) {
39019 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39020 Known.Zero.setAllBits(); Known.One.setAllBits();
39021 for (unsigned i = 0; i != NumElts; ++i) {
39022 if (!DemandedElts[i])
39023 continue;
39024 int M = Mask[i];
39025 if (M == SM_SentinelUndef) {
39026 // For UNDEF elements, we don't know anything about the common state
39027 // of the shuffle result.
39028 Known.resetAll();
39029 break;
39030 }
39031 if (M == SM_SentinelZero) {
39032 Known.One.clearAllBits();
39033 continue;
39034 }
39035 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39036 "Shuffle index out of range");
39037
39038 unsigned OpIdx = (unsigned)M / NumElts;
39039 unsigned EltIdx = (unsigned)M % NumElts;
39040 if (Ops[OpIdx].getValueType() != VT) {
39041 // TODO - handle target shuffle ops with different value types.
39042 Known.resetAll();
39043 break;
39044 }
39045 DemandedOps[OpIdx].setBit(EltIdx);
39046 }
39047 // Known bits are the values that are shared by every demanded element.
39048 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39049 if (!DemandedOps[i])
39050 continue;
39051 KnownBits Known2 =
39052 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39053 Known = Known.intersectWith(Known2);
39054 }
39055 }
39056 }
39057 }
39058}
39059
39061 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39062 unsigned Depth) const {
39063 EVT VT = Op.getValueType();
39064 unsigned VTBits = VT.getScalarSizeInBits();
39065 unsigned Opcode = Op.getOpcode();
39066 switch (Opcode) {
39067 case X86ISD::SETCC_CARRY:
39068 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39069 return VTBits;
39070
39071 case X86ISD::VTRUNC: {
39072 SDValue Src = Op.getOperand(0);
39073 MVT SrcVT = Src.getSimpleValueType();
39074 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39075 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39076 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39077 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39078 if (Tmp > (NumSrcBits - VTBits))
39079 return Tmp - (NumSrcBits - VTBits);
39080 return 1;
39081 }
39082
39083 case X86ISD::PACKSS: {
39084 // PACKSS is just a truncation if the sign bits extend to the packed size.
39085 APInt DemandedLHS, DemandedRHS;
39086 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39087 DemandedRHS);
39088
39089 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39090 // patterns often used to compact vXi64 allsignbit patterns.
39091 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39092 SDValue BC = peekThroughBitcasts(V);
39093 if (BC.getOpcode() == X86ISD::PACKSS &&
39094 BC.getScalarValueSizeInBits() == 16 &&
39095 V.getScalarValueSizeInBits() == 32) {
39096 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39097 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39098 if (BC0.getScalarValueSizeInBits() == 64 &&
39099 BC1.getScalarValueSizeInBits() == 64 &&
39100 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39101 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39102 return 32;
39103 }
39104 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39105 };
39106
39107 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39108 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39109 if (!!DemandedLHS)
39110 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39111 if (!!DemandedRHS)
39112 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39113 unsigned Tmp = std::min(Tmp0, Tmp1);
39114 if (Tmp > (SrcBits - VTBits))
39115 return Tmp - (SrcBits - VTBits);
39116 return 1;
39117 }
39118
39119 case X86ISD::VBROADCAST: {
39120 SDValue Src = Op.getOperand(0);
39121 if (!Src.getSimpleValueType().isVector())
39122 return DAG.ComputeNumSignBits(Src, Depth + 1);
39123 break;
39124 }
39125
39126 case X86ISD::VSHLI: {
39127 SDValue Src = Op.getOperand(0);
39128 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39129 if (ShiftVal.uge(VTBits))
39130 return VTBits; // Shifted all bits out --> zero.
39131 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39132 if (ShiftVal.uge(Tmp))
39133 return 1; // Shifted all sign bits out --> unknown.
39134 return Tmp - ShiftVal.getZExtValue();
39135 }
39136
39137 case X86ISD::VSRAI: {
39138 SDValue Src = Op.getOperand(0);
39139 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39140 if (ShiftVal.uge(VTBits - 1))
39141 return VTBits; // Sign splat.
39142 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39143 ShiftVal += Tmp;
39144 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39145 }
39146
39147 case X86ISD::FSETCC:
39148 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39149 if (VT == MVT::f32 || VT == MVT::f64 ||
39150 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39151 return VTBits;
39152 break;
39153
39154 case X86ISD::PCMPGT:
39155 case X86ISD::PCMPEQ:
39156 case X86ISD::CMPP:
39157 case X86ISD::VPCOM:
39158 case X86ISD::VPCOMU:
39159 // Vector compares return zero/all-bits result values.
39160 return VTBits;
39161
39162 case X86ISD::ANDNP: {
39163 unsigned Tmp0 =
39164 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39165 if (Tmp0 == 1) return 1; // Early out.
39166 unsigned Tmp1 =
39167 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39168 return std::min(Tmp0, Tmp1);
39169 }
39170
39171 case X86ISD::CMOV: {
39172 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39173 if (Tmp0 == 1) return 1; // Early out.
39174 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39175 return std::min(Tmp0, Tmp1);
39176 }
39177 }
39178
39179 // Handle target shuffles.
39180 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39181 if (isTargetShuffle(Opcode)) {
39182 SmallVector<int, 64> Mask;
39183 SmallVector<SDValue, 2> Ops;
39184 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39185 unsigned NumOps = Ops.size();
39186 unsigned NumElts = VT.getVectorNumElements();
39187 if (Mask.size() == NumElts) {
39188 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39189 for (unsigned i = 0; i != NumElts; ++i) {
39190 if (!DemandedElts[i])
39191 continue;
39192 int M = Mask[i];
39193 if (M == SM_SentinelUndef) {
39194 // For UNDEF elements, we don't know anything about the common state
39195 // of the shuffle result.
39196 return 1;
39197 } else if (M == SM_SentinelZero) {
39198 // Zero = all sign bits.
39199 continue;
39200 }
39201 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39202 "Shuffle index out of range");
39203
39204 unsigned OpIdx = (unsigned)M / NumElts;
39205 unsigned EltIdx = (unsigned)M % NumElts;
39206 if (Ops[OpIdx].getValueType() != VT) {
39207 // TODO - handle target shuffle ops with different value types.
39208 return 1;
39209 }
39210 DemandedOps[OpIdx].setBit(EltIdx);
39211 }
39212 unsigned Tmp0 = VTBits;
39213 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39214 if (!DemandedOps[i])
39215 continue;
39216 unsigned Tmp1 =
39217 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39218 Tmp0 = std::min(Tmp0, Tmp1);
39219 }
39220 return Tmp0;
39221 }
39222 }
39223 }
39224
39225 // Fallback case.
39226 return 1;
39227}
39228
39229 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39230 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39231 return N->getOperand(0);
39232 return N;
39233}
39234
39235// Helper to look for a normal load that can be narrowed into a vzload with the
39236// specified VT and memory VT. Returns SDValue() on failure.
39237 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39238 SelectionDAG &DAG) {
39239 // Can't if the load is volatile or atomic.
39240 if (!LN->isSimple())
39241 return SDValue();
39242
39243 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39244 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39245 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39246 LN->getPointerInfo(), LN->getBaseAlign(),
39247 LN->getMemOperand()->getFlags());
39248}
39249
39250// Attempt to match a combined shuffle mask against supported unary shuffle
39251// instructions.
39252// TODO: Investigate sharing more of this with shuffle lowering.
39253static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39254 bool AllowFloatDomain, bool AllowIntDomain,
39255 SDValue V1, const SelectionDAG &DAG,
39256 const X86Subtarget &Subtarget, unsigned &Shuffle,
39257 MVT &SrcVT, MVT &DstVT) {
39258 unsigned NumMaskElts = Mask.size();
39259 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39260
39261 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39262 if (Mask[0] == 0 &&
39263 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39264 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39265 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39266 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39267 Shuffle = X86ISD::VZEXT_MOVL;
39268 if (MaskEltSize == 16)
39269 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39270 else
39271 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39272 return true;
39273 }
39274 }
39275
39276 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39277 if (AllowIntDomain &&
39278 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39279 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39280 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39281 unsigned MaxScale = 64 / MaskEltSize;
39282 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39283 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39284 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39285 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39286 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39287 continue;
39288 bool MatchAny = true;
39289 bool MatchZero = true;
39290 bool MatchSign = UseSign;
39291 unsigned NumDstElts = NumMaskElts / Scale;
39292 for (unsigned i = 0;
39293 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39294 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39295 MatchAny = MatchSign = MatchZero = false;
39296 break;
39297 }
39298 unsigned Pos = (i * Scale) + 1;
39299 unsigned Len = Scale - 1;
39300 MatchAny &= isUndefInRange(Mask, Pos, Len);
39301 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39302 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39303 }
39304 if (MatchAny || MatchSign || MatchZero) {
39305 assert((MatchSign || MatchZero) &&
39306 "Failed to match sext/zext but matched aext?");
39307 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39308 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39309 : MVT::getIntegerVT(MaskEltSize);
39310 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39311
39312 Shuffle = unsigned(
39313 MatchAny ? ISD::ANY_EXTEND
39314 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39315 if (SrcVT.getVectorNumElements() != NumDstElts)
39316 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39317
39318 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39319 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39320 return true;
39321 }
39322 }
39323 }
39324
39325 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39326 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39327 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39328 isUndefOrEqual(Mask[0], 0) &&
39329 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39330 Shuffle = X86ISD::VZEXT_MOVL;
39331 if (MaskEltSize == 16)
39332 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39333 else
39334 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39335 return true;
39336 }
39337
39338 // Check whether we have SSE3, which lets us use MOVDDUP etc. These
39339 // instructions are no slower than UNPCKLPD but can fold the input operand
39340 // even from an unaligned memory load.
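// (MOVDDUP duplicates the low f64 element ({0,0}), MOVSLDUP the even f32
//  elements ({0,0,2,2}) and MOVSHDUP the odd f32 elements ({1,1,3,3}),
//  matching the shuffle masks tested below.)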
39341 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39342 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39343 Shuffle = X86ISD::MOVDDUP;
39344 SrcVT = DstVT = MVT::v2f64;
39345 return true;
39346 }
39347 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39348 Shuffle = X86ISD::MOVSLDUP;
39349 SrcVT = DstVT = MVT::v4f32;
39350 return true;
39351 }
39352 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39353 Shuffle = X86ISD::MOVSHDUP;
39354 SrcVT = DstVT = MVT::v4f32;
39355 return true;
39356 }
39357 }
39358
39359 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39360 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39361 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39362 Shuffle = X86ISD::MOVDDUP;
39363 SrcVT = DstVT = MVT::v4f64;
39364 return true;
39365 }
39366 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39367 V1)) {
39368 Shuffle = X86ISD::MOVSLDUP;
39369 SrcVT = DstVT = MVT::v8f32;
39370 return true;
39371 }
39372 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39373 V1)) {
39374 Shuffle = X86ISD::MOVSHDUP;
39375 SrcVT = DstVT = MVT::v8f32;
39376 return true;
39377 }
39378 }
39379
39380 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39381 assert(Subtarget.hasAVX512() &&
39382 "AVX512 required for 512-bit vector shuffles");
39383 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39384 V1)) {
39385 Shuffle = X86ISD::MOVDDUP;
39386 SrcVT = DstVT = MVT::v8f64;
39387 return true;
39388 }
39389 if (isTargetShuffleEquivalent(
39390 MaskVT, Mask,
39391 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39392 Shuffle = X86ISD::MOVSLDUP;
39393 SrcVT = DstVT = MVT::v16f32;
39394 return true;
39395 }
39396 if (isTargetShuffleEquivalent(
39397 MaskVT, Mask,
39398 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39399 Shuffle = X86ISD::MOVSHDUP;
39400 SrcVT = DstVT = MVT::v16f32;
39401 return true;
39402 }
39403 }
39404
39405 return false;
39406}
39407
39408// Attempt to match a combined shuffle mask against supported unary immediate
39409// permute instructions.
39410 // TODO: Investigate sharing more of this with shuffle lowering.
39411 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39412 const APInt &Zeroable,
39413 bool AllowFloatDomain, bool AllowIntDomain,
39414 const SelectionDAG &DAG,
39415 const X86Subtarget &Subtarget,
39416 unsigned &Shuffle, MVT &ShuffleVT,
39417 unsigned &PermuteImm) {
39418 unsigned NumMaskElts = Mask.size();
39419 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39420 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39421 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39422 bool ContainsZeros = isAnyZero(Mask);
39423
39424 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39425 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39426 // Check for lane crossing permutes.
39427 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39428 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39429 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39430 Shuffle = X86ISD::VPERMI;
39431 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39432 PermuteImm = getV4X86ShuffleImm(Mask);
39433 return true;
39434 }
39435 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39436 SmallVector<int, 4> RepeatedMask;
39437 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39438 Shuffle = X86ISD::VPERMI;
39439 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39440 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39441 return true;
39442 }
39443 }
39444 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39445 // VPERMILPD can permute with a non-repeating shuffle.
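// (VPERMILPD with an immediate selects, for each f64 destination element,
//  either element 0 or 1 of its own 128-bit lane, so one immediate bit per
//  element is enough.)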
39446 Shuffle = X86ISD::VPERMILPI;
39447 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39448 PermuteImm = 0;
39449 for (int i = 0, e = Mask.size(); i != e; ++i) {
39450 int M = Mask[i];
39451 if (M == SM_SentinelUndef)
39452 continue;
39453 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39454 PermuteImm |= (M & 1) << i;
39455 }
39456 return true;
39457 }
39458 }
39459
39460 // We are checking for a shuffle match or a shift match. Loop twice so we
39461 // can order which we try to match first depending on target preference.
39462 for (unsigned Order = 0; Order < 2; ++Order) {
39463 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39464 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39465 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39466 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39467 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39468 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39469 SmallVector<int, 4> RepeatedMask;
39470 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39471 // Narrow the repeated mask to create 32-bit element permutes.
39472 SmallVector<int, 4> WordMask = RepeatedMask;
39473 if (MaskScalarSizeInBits == 64)
39474 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39475
39476 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39477 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39478 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39479 PermuteImm = getV4X86ShuffleImm(WordMask);
39480 return true;
39481 }
39482 }
39483
39484 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39485 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39486 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39487 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39488 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39489 SmallVector<int, 4> RepeatedMask;
39490 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39491 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39492 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39493
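// PSHUFLW/PSHUFHW each permute only one 64-bit half of the 128-bit lane, so
// the other half of the repeated mask must already be the identity (or undef).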
39494 // PSHUFLW: permute lower 4 elements only.
39495 if (isUndefOrInRange(LoMask, 0, 4) &&
39496 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39497 Shuffle = X86ISD::PSHUFLW;
39498 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39499 PermuteImm = getV4X86ShuffleImm(LoMask);
39500 return true;
39501 }
39502
39503 // PSHUFHW: permute upper 4 elements only.
39504 if (isUndefOrInRange(HiMask, 4, 8) &&
39505 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39506 // Offset the HiMask so that we can create the shuffle immediate.
39507 int OffsetHiMask[4];
39508 for (int i = 0; i != 4; ++i)
39509 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39510
39511 Shuffle = X86ISD::PSHUFHW;
39512 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39513 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39514 return true;
39515 }
39516 }
39517 }
39518 } else {
39519 // Attempt to match against bit rotates.
39520 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39521 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39522 Subtarget.hasAVX512())) {
39523 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39524 Subtarget, Mask);
39525 if (0 < RotateAmt) {
39526 Shuffle = X86ISD::VROTLI;
39527 PermuteImm = (unsigned)RotateAmt;
39528 return true;
39529 }
39530 }
39531 }
39532 // Attempt to match against byte/bit shifts.
39533 if (AllowIntDomain &&
39534 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39535 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39536 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39537 int ShiftAmt =
39538 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39539 Zeroable, Subtarget);
39540 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39541 32 <= ShuffleVT.getScalarSizeInBits())) {
39542 // Byte shifts can be slower so only match them on second attempt.
39543 if (Order == 0 &&
39544 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39545 continue;
39546
39547 PermuteImm = (unsigned)ShiftAmt;
39548 return true;
39549 }
39550
39551 }
39552 }
39553
39554 return false;
39555}
39556
39557// Attempt to match a combined unary shuffle mask against supported binary
39558// shuffle instructions.
39559// TODO: Investigate sharing more of this with shuffle lowering.
39560static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39561 bool AllowFloatDomain, bool AllowIntDomain,
39562 SDValue &V1, SDValue &V2, const SDLoc &DL,
39563 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39564 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39565 bool IsUnary) {
39566 unsigned NumMaskElts = Mask.size();
39567 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39568 unsigned SizeInBits = MaskVT.getSizeInBits();
39569
39570 if (MaskVT.is128BitVector()) {
39571 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39572 AllowFloatDomain) {
39573 V2 = V1;
39574 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39575 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39576 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39577 return true;
39578 }
39579 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39580 AllowFloatDomain) {
39581 V2 = V1;
39582 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39583 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39584 return true;
39585 }
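// MOVSD/MOVSS take the low element from the second operand and the rest from
// the first, so a {0, 3} v2f64 mask (with the operands swapped) or a
// {4, 1, 2, 3} v4f32 mask maps directly onto them.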
39586 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39587 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39588 std::swap(V1, V2);
39589 Shuffle = X86ISD::MOVSD;
39590 SrcVT = DstVT = MVT::v2f64;
39591 return true;
39592 }
39593 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39594 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39595 Shuffle = X86ISD::MOVSS;
39596 SrcVT = DstVT = MVT::v4f32;
39597 return true;
39598 }
39599 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39600 DAG) &&
39601 Subtarget.hasFP16()) {
39602 Shuffle = X86ISD::MOVSH;
39603 SrcVT = DstVT = MVT::v8f16;
39604 return true;
39605 }
39606 }
39607
39608 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39609 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39610 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39611 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39612 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39613 Subtarget)) {
39614 DstVT = MaskVT;
39615 return true;
39616 }
39617 }
39618 // TODO: Can we handle this inside matchShuffleWithPACK?
39619 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39620 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39621 V1.getScalarValueSizeInBits() == 64 &&
39622 V2.getScalarValueSizeInBits() == 64) {
39623 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
39624 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39625 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
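// With 64-bit source elements, 48+ known leading zeros means only the low 16
// bits of each element can be set, so the unsigned-saturating pack can never
// clamp the value; 56+ leading zeros similarly enables the 16-to-8-bit pack.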
39626 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39627 SrcVT = MVT::v4i32;
39628 DstVT = MVT::v8i16;
39629 Shuffle = X86ISD::PACKUS;
39630 return true;
39631 }
39632 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
39633 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39634 SrcVT = MVT::v8i16;
39635 DstVT = MVT::v16i8;
39636 Shuffle = X86ISD::PACKUS;
39637 return true;
39638 }
39639 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
39640 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39641 SrcVT = MVT::v4i32;
39642 DstVT = MVT::v8i16;
39643 Shuffle = X86ISD::PACKSS;
39644 return true;
39645 }
39646 }
39647
39648 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39649 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39650 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39651 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39652 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39653 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39654 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39655 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39656 Subtarget)) {
39657 SrcVT = DstVT = MaskVT;
39658 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39659 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39660 return true;
39661 }
39662 }
39663
39664 // Attempt to match against an OR if we're performing a blend shuffle and the
39665 // non-blended source element is zero in each case.
39666 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39667 if (SizeInBits == V1.getValueSizeInBits() &&
39668 SizeInBits == V2.getValueSizeInBits() &&
39669 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39670 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39671 bool IsBlend = true;
39672 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39673 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39674 unsigned Scale1 = NumV1Elts / NumMaskElts;
39675 unsigned Scale2 = NumV2Elts / NumMaskElts;
39676 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39677 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
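// For every mask element taken from one source, record that the matching
// elements of the other source must be known zero for a plain OR to reproduce
// the blend.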
39678 for (unsigned i = 0; i != NumMaskElts; ++i) {
39679 int M = Mask[i];
39680 if (M == SM_SentinelUndef)
39681 continue;
39682 if (M == SM_SentinelZero) {
39683 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39684 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39685 continue;
39686 }
39687 if (M == (int)i) {
39688 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39689 continue;
39690 }
39691 if (M == (int)(i + NumMaskElts)) {
39692 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39693 continue;
39694 }
39695 IsBlend = false;
39696 break;
39697 }
39698 if (IsBlend) {
39699 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39700 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39701 Shuffle = ISD::OR;
39702 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39703 return true;
39704 }
39705 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39706 // FIXME: handle mismatched sizes?
39707 // TODO: investigate if `ISD::OR` handling in
39708 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39709 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39710 unsigned NumElts = V.getValueType().getVectorNumElements();
39711 KnownBits Known(NumElts);
39712 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39713 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39714 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39715 if (PeepholeKnown.isZero())
39716 Known.Zero.setBit(EltIdx);
39717 if (PeepholeKnown.isAllOnes())
39718 Known.One.setBit(EltIdx);
39719 }
39720 return Known;
39721 };
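// The per-element KnownBits above treats each vector element as one flag:
// known-one only if the whole element is all-ones, known-zero only if it is
// entirely zero.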
39722
39723 KnownBits V1Known = computeKnownBitsElementWise(V1);
39724 KnownBits V2Known = computeKnownBitsElementWise(V2);
39725
39726 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39727 int M = Mask[i];
39728 if (M == SM_SentinelUndef)
39729 continue;
39730 if (M == SM_SentinelZero) {
39731 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39732 continue;
39733 }
39734 if (M == (int)i) {
39735 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39736 continue;
39737 }
39738 if (M == (int)(i + NumMaskElts)) {
39739 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39740 continue;
39741 }
39742 llvm_unreachable("will not get here.");
39743 }
39744 if (IsBlend) {
39745 Shuffle = ISD::OR;
39746 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39747 return true;
39748 }
39749 }
39750 }
39751 }
39752
39753 return false;
39754}
39755
39756 static bool matchBinaryPermuteShuffle(
39757 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39758 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39759 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39760 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39761 unsigned NumMaskElts = Mask.size();
39762 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39763
39764 // Attempt to match against VALIGND/VALIGNQ rotate.
39765 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39766 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39767 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39768 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39769 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39770 MaskVT.getSizeInBits() / EltSizeInBits);
39771 if (!isAnyZero(Mask)) {
39772 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39773 if (0 < Rotation) {
39774 Shuffle = X86ISD::VALIGN;
39775 ShuffleVT = AlignVT;
39776 PermuteImm = Rotation;
39777 return true;
39778 }
39779 }
39780 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39781 unsigned ZeroLo = Zeroable.countr_one();
39782 unsigned ZeroHi = Zeroable.countl_one();
39783 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
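// Feeding VALIGN a zero vector as one source turns the rotate into a
// cross-lane element shift: ZeroLo zero elements shift in from the bottom, or
// ZeroHi zero elements from the top.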
39784 if (ZeroLo) {
39785 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39786 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39787 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39788 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39789 Shuffle = X86ISD::VALIGN;
39790 ShuffleVT = AlignVT;
39791 PermuteImm = NumMaskElts - ZeroLo;
39792 return true;
39793 }
39794 }
39795 if (ZeroHi) {
39796 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39797 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39798 ZeroHi);
39799 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39800 V2 = V1;
39801 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39802 Shuffle = X86ISD::VALIGN;
39803 ShuffleVT = AlignVT;
39804 PermuteImm = ZeroHi;
39805 return true;
39806 }
39807 }
39808 }
39809
39810 // Attempt to match against PALIGNR byte rotate.
39811 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39812 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39813 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39814 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39815 if (0 < ByteRotation) {
39816 Shuffle = X86ISD::PALIGNR;
39817 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39818 PermuteImm = ByteRotation;
39819 return true;
39820 }
39821 }
39822
39823 // Attempt to combine to X86ISD::BLENDI.
39824 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39825 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39826 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39827 uint64_t BlendMask = 0;
39828 bool ForceV1Zero = false, ForceV2Zero = false;
39829 SmallVector<int, 8> TargetMask(Mask);
39830 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39831 ForceV2Zero, BlendMask)) {
39832 if (MaskVT == MVT::v16i16) {
39833 // We can only use v16i16 PBLENDW if the lanes are repeated.
39834 SmallVector<int, 8> RepeatedMask;
39835 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39836 RepeatedMask)) {
39837 assert(RepeatedMask.size() == 8 &&
39838 "Repeated mask size doesn't match!");
39839 PermuteImm = 0;
39840 for (int i = 0; i < 8; ++i)
39841 if (RepeatedMask[i] >= 8)
39842 PermuteImm |= 1 << i;
39843 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39844 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39845 Shuffle = X86ISD::BLENDI;
39846 ShuffleVT = MaskVT;
39847 return true;
39848 }
39849 } else {
39850 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39851 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39852 PermuteImm = (unsigned)BlendMask;
39853 Shuffle = X86ISD::BLENDI;
39854 ShuffleVT = MaskVT;
39855 return true;
39856 }
39857 }
39858 }
39859
39860 // Attempt to combine to INSERTPS, but only if it has elements that need to
39861 // be set to zero.
39862 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39863 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39864 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39865 Shuffle = X86ISD::INSERTPS;
39866 ShuffleVT = MVT::v4f32;
39867 return true;
39868 }
39869
39870 // Attempt to combine to SHUFPD.
39871 if (AllowFloatDomain && EltSizeInBits == 64 &&
39872 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39873 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39874 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39875 bool ForceV1Zero = false, ForceV2Zero = false;
39876 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39877 PermuteImm, Mask, Zeroable)) {
39878 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39879 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39880 Shuffle = X86ISD::SHUFP;
39881 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39882 return true;
39883 }
39884 }
39885
39886 // Attempt to combine to SHUFPS.
39887 if (AllowFloatDomain && EltSizeInBits == 32 &&
39888 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39889 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39890 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39891 SmallVector<int, 4> RepeatedMask;
39892 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39893 // Match each half of the repeated mask to determine if it's just
39894 // referencing one of the vectors, is zeroable, or is entirely undef.
39895 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39896 int M0 = RepeatedMask[Offset];
39897 int M1 = RepeatedMask[Offset + 1];
39898
39899 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39900 return DAG.getUNDEF(MaskVT);
39901 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39902 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39903 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39904 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39905 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39906 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39907 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39908 return V1;
39909 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39910 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39911 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39912 return V2;
39913 }
39914
39915 return SDValue();
39916 };
39917
39918 int ShufMask[4] = {-1, -1, -1, -1};
39919 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39920 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39921
39922 if (Lo && Hi) {
39923 V1 = Lo;
39924 V2 = Hi;
39925 Shuffle = X86ISD::SHUFP;
39926 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39927 PermuteImm = getV4X86ShuffleImm(ShufMask);
39928 return true;
39929 }
39930 }
39931 }
39932
39933 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39934 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39935 MaskVT.is128BitVector() &&
39936 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39937 Shuffle = X86ISD::INSERTPS;
39938 ShuffleVT = MVT::v4f32;
39939 return true;
39940 }
39941
39942 return false;
39943}
39944
39945 static SDValue combineX86ShuffleChainWithExtract(
39946 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39947 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39948 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39949 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39950 const X86Subtarget &Subtarget);
39951
39952/// Combine an arbitrary chain of shuffles into a single instruction if
39953/// possible.
39954///
39955/// This is the leaf of the recursive combine below. When we have found some
39956/// chain of single-use x86 shuffle instructions and accumulated the combined
39957/// shuffle mask represented by them, this will try to pattern match that mask
39958/// into either a single instruction if there is a special purpose instruction
39959/// for this operation, or into a PSHUFB instruction which is a fully general
39960/// instruction but should only be used to replace chains over a certain depth.
39961 static SDValue combineX86ShuffleChain(
39962 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39963 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39964 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39965 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39966 const X86Subtarget &Subtarget) {
39967 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39968 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39969 "Unexpected number of shuffle inputs!");
39970 unsigned RootSizeInBits = RootVT.getSizeInBits();
39971 unsigned NumRootElts = RootVT.getVectorNumElements();
39972
39973 // Canonicalize shuffle input op to the requested type.
39974 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39975 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39976 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39977 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39978 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39979 return DAG.getBitcast(VT, Op);
39980 };
39981
39982 // Find the inputs that enter the chain. Note that multiple uses are OK
39983 // here; we're not going to remove the operands we find.
39984 bool UnaryShuffle = (Inputs.size() == 1);
39985 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39986 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39987 : peekThroughBitcasts(Inputs[1]));
39988
39989 MVT VT1 = V1.getSimpleValueType();
39990 MVT VT2 = V2.getSimpleValueType();
39991 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39992 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39993
39994 SDValue Res;
39995
39996 unsigned NumBaseMaskElts = BaseMask.size();
39997 if (NumBaseMaskElts == 1) {
39998 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39999 return CanonicalizeShuffleInput(RootVT, V1);
40000 }
40001
40002 bool OptForSize = DAG.shouldOptForSize();
40003 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
40004 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40005 (RootVT.isFloatingPoint() && Depth >= 1) ||
40006 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40007
40008 // If we are shuffling a splat (and not introducing zeros) then we can just
40009 // use it directly. This works for smaller elements as well, as they already
40010 // repeat across each mask element.
40011 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40012 V1.getValueSizeInBits() >= RootSizeInBits &&
40013 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40014 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40015 return CanonicalizeShuffleInput(RootVT, V1);
40016 }
40017
40018 SmallVector<int, 64> Mask(BaseMask);
40019
40020 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40021 // etc. can be simplified.
40022 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40023 SmallVector<int> ScaledMask, IdentityMask;
40024 unsigned NumElts = VT1.getVectorNumElements();
40025 if (Mask.size() <= NumElts &&
40026 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40027 for (unsigned i = 0; i != NumElts; ++i)
40028 IdentityMask.push_back(i);
40029 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40030 V2))
40031 return CanonicalizeShuffleInput(RootVT, V1);
40032 }
40033 }
40034
40035 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40036 if (RootVT.is512BitVector() &&
40037 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40038 // If the upper subvectors are zeroable, then an extract+insert is better
40039 // than using X86ISD::SHUF128. The insertion is free, even if it has
40040 // to zero the upper subvectors.
40041 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40042 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40043 return SDValue(); // Nothing to do!
40044 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40045 "Unexpected lane shuffle");
40046 Res = CanonicalizeShuffleInput(RootVT, V1);
40047 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40048 bool UseZero = isAnyZero(Mask);
40049 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40050 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40051 }
40052
40053 // Narrow shuffle mask to v4x128.
40054 SmallVector<int, 4> ScaledMask;
40055 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40056 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40057
40058 // Try to lower to vshuf64x2/vshuf32x4.
40059 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40060 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40061 SelectionDAG &DAG) {
40062 int PermMask[4] = {-1, -1, -1, -1};
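// Each 2-bit immediate field selects a 128-bit lane; destination lanes 0-1
// must come from Ops[0] and lanes 2-3 from Ops[1], which the OpIndex
// bookkeeping below enforces.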
40063 // Ensure elements came from the same Op.
40064 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40065 for (int i = 0; i < 4; ++i) {
40066 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40067 if (ScaledMask[i] < 0)
40068 continue;
40069
40070 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40071 unsigned OpIndex = i / 2;
40072 if (Ops[OpIndex].isUndef())
40073 Ops[OpIndex] = Op;
40074 else if (Ops[OpIndex] != Op)
40075 return SDValue();
40076
40077 PermMask[i] = ScaledMask[i] % 4;
40078 }
40079
40080 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40081 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40082 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40083 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40084 };
40085
40086 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40087 // doesn't work because our mask is for 128 bits and we don't have an MVT
40088 // to match that.
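// PreferPERMQ spots a unary lane shuffle that stays inside each 256-bit half
// and repeats the same pattern in both halves, so the later VPERMQ/VPERMPD
// immediate matching can handle it without resorting to SHUF128.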
40089 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40090 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40091 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40092 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40093 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40094 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40095 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40096 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40097 ScaledMask[1] == (ScaledMask[3] % 2));
40098
40099 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40100 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40101 return SDValue(); // Nothing to do!
40102 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40103 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40104 return DAG.getBitcast(RootVT, V);
40105 }
40106 }
40107
40108 // Handle 128-bit lane shuffles of 256-bit vectors.
40109 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40110 // If the upper half is zeroable, then an extract+insert is better
40111 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40112 // zero the upper half.
40113 if (isUndefOrZero(Mask[1])) {
40114 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40115 return SDValue(); // Nothing to do!
40116 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40117 Res = CanonicalizeShuffleInput(RootVT, V1);
40118 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40119 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40120 256);
40121 }
40122
40123 // If we're inserting the low subvector, an insert-subvector 'concat'
40124 // pattern is quicker than VPERM2X128.
40125 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40126 !Subtarget.hasAVX2()) {
40127 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40128 return SDValue(); // Nothing to do!
40129 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40130 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40131 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40132 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40133 }
40134
40135 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40136 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40137 // feature.
40138 // Prefer blends for sequential shuffles unless we are optimizing for size.
40139 if (UnaryShuffle &&
40140 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40141 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40142 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40143 return SDValue(); // Nothing to do!
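// VPERM2X128 immediate: bits [1:0] select the 128-bit source half for the low
// lane, bits [5:4] for the high lane, and setting bit 3 of a nibble zeroes
// that lane (used here for undef/zero mask elements).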
40144 unsigned PermMask = 0;
40145 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40146 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40147 return DAG.getNode(
40148 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40149 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40150 }
40151
40152 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40153 return SDValue(); // Nothing to do!
40154
40155 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40156 if (!UnaryShuffle && !IsMaskedShuffle) {
40157 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40158 "Unexpected shuffle sentinel value");
40159 // Prefer blends to X86ISD::VPERM2X128.
40160 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40161 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40162 return SDValue(); // Nothing to do!
40163 unsigned PermMask = 0;
40164 PermMask |= ((Mask[0] & 3) << 0);
40165 PermMask |= ((Mask[1] & 3) << 4);
40166 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40167 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40168 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40169 CanonicalizeShuffleInput(RootVT, LHS),
40170 CanonicalizeShuffleInput(RootVT, RHS),
40171 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40172 }
40173 }
40174 }
40175
40176 // For masks that have been widened to 128-bit elements or more,
40177 // narrow back down to 64-bit elements.
40178 if (BaseMaskEltSizeInBits > 64) {
40179 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40180 int MaskScale = BaseMaskEltSizeInBits / 64;
40181 SmallVector<int, 64> ScaledMask;
40182 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40183 Mask = std::move(ScaledMask);
40184 }
40185
40186 // For masked shuffles, we're trying to match the root width for better
40187 // writemask folding, attempt to scale the mask.
40188 // TODO - variable shuffles might need this to be widened again.
40189 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40190 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40191 int MaskScale = NumRootElts / Mask.size();
40192 SmallVector<int, 64> ScaledMask;
40193 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40194 Mask = std::move(ScaledMask);
40195 }
40196
40197 unsigned NumMaskElts = Mask.size();
40198 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40199 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40200
40201 // Determine the effective mask value type.
40202 FloatDomain &= (32 <= MaskEltSizeInBits);
40203 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40204 : MVT::getIntegerVT(MaskEltSizeInBits);
40205 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40206
40207 // Only allow legal mask types.
40208 if (!TLI.isTypeLegal(MaskVT))
40209 return SDValue();
40210
40211 // Attempt to match the mask against known shuffle patterns.
40212 MVT ShuffleSrcVT, ShuffleVT;
40213 unsigned Shuffle, PermuteImm;
40214
40215 // Which shuffle domains are permitted?
40216 // Permit domain crossing at higher combine depths.
40217 // TODO: Should we indicate which domain is preferred if both are allowed?
40218 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40219 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40220 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40221
40222 // Determine zeroable mask elements.
40223 APInt KnownUndef, KnownZero;
40224 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40225 APInt Zeroable = KnownUndef | KnownZero;
40226
40227 if (UnaryShuffle) {
40228 // Attempt to match against broadcast-from-vector.
40229 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40230 if ((Subtarget.hasAVX2() ||
40231 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40232 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40233 if (isUndefOrEqual(Mask, 0)) {
40234 if (V1.getValueType() == MaskVT &&
40235 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40236 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40237 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40238 return SDValue(); // Nothing to do!
40239 Res = V1.getOperand(0);
40240 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40241 return DAG.getBitcast(RootVT, Res);
40242 }
40243 if (Subtarget.hasAVX2()) {
40244 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40245 return SDValue(); // Nothing to do!
40246 Res = CanonicalizeShuffleInput(MaskVT, V1);
40247 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40248 return DAG.getBitcast(RootVT, Res);
40249 }
40250 }
40251 }
40252
40253 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40254 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40255 (!IsMaskedShuffle ||
40256 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40257 if (Depth == 0 && RootOpc == Shuffle)
40258 return SDValue(); // Nothing to do!
40259 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40260 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40261 return DAG.getBitcast(RootVT, Res);
40262 }
40263
40264 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40265 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40266 PermuteImm) &&
40267 (!IsMaskedShuffle ||
40268 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40269 if (Depth == 0 && RootOpc == Shuffle)
40270 return SDValue(); // Nothing to do!
40271 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40272 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40273 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40274 return DAG.getBitcast(RootVT, Res);
40275 }
40276 }
40277
40278 // Attempt to combine to INSERTPS, but only if the inserted element has come
40279 // from a scalar.
40280 // TODO: Handle other insertions here as well?
40281 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40282 Subtarget.hasSSE41() &&
40283 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40284 if (MaskEltSizeInBits == 32) {
40285 SDValue SrcV1 = V1, SrcV2 = V2;
40286 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40287 DAG) &&
40288 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40289 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40290 return SDValue(); // Nothing to do!
40291 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40292 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40293 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40294 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40295 return DAG.getBitcast(RootVT, Res);
40296 }
40297 }
40298 if (MaskEltSizeInBits == 64 &&
40299 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40300 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40301 V2.getScalarValueSizeInBits() <= 32) {
40302 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40303 return SDValue(); // Nothing to do!
40304 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40305 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40306 CanonicalizeShuffleInput(MVT::v4f32, V1),
40307 CanonicalizeShuffleInput(MVT::v4f32, V2),
40308 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40309 return DAG.getBitcast(RootVT, Res);
40310 }
40311 }
40312
40313 SDValue NewV1 = V1; // Save operands in case early exit happens.
40314 SDValue NewV2 = V2;
40315 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40316 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40317 ShuffleVT, UnaryShuffle) &&
40318 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40319 if (Depth == 0 && RootOpc == Shuffle)
40320 return SDValue(); // Nothing to do!
40321 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40322 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40323 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40324 return DAG.getBitcast(RootVT, Res);
40325 }
40326
40327 NewV1 = V1; // Save operands in case early exit happens.
40328 NewV2 = V2;
40329 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40330 AllowIntDomain, NewV1, NewV2, DL, DAG,
40331 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40332 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40333 if (Depth == 0 && RootOpc == Shuffle)
40334 return SDValue(); // Nothing to do!
40335 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40336 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40337 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40338 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40339 return DAG.getBitcast(RootVT, Res);
40340 }
40341
40342 // Typically from here on, we need an integer version of MaskVT.
40343 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40344 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40345
40346 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40347 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40348 uint64_t BitLen, BitIdx;
40349 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40350 Zeroable)) {
40351 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40352 return SDValue(); // Nothing to do!
40353 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40354 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40355 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40356 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40357 return DAG.getBitcast(RootVT, Res);
40358 }
40359
40360 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40361 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40362 return SDValue(); // Nothing to do!
40363 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40364 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40365 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40366 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40367 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40368 return DAG.getBitcast(RootVT, Res);
40369 }
40370 }
40371
40372 // Match shuffle against TRUNCATE patterns.
40373 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40374 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40375 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40376 Subtarget)) {
40377 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40378 ShuffleSrcVT.getVectorNumElements();
40379 unsigned Opc =
40380 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
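// Same element count on both sides means a plain ISD::TRUNCATE; otherwise
// X86ISD::VTRUNC truncates into the low elements and the result is
// zero-widened back to the root size below.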
40381 if (Depth == 0 && RootOpc == Opc)
40382 return SDValue(); // Nothing to do!
40383 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40384 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40385 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40386 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40387 return DAG.getBitcast(RootVT, Res);
40388 }
40389
40390 // Do we need a more general binary truncation pattern?
40391 if (RootSizeInBits < 512 &&
40392 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40393 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40394 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40395 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40396 // Bail if this was already a truncation or PACK node.
40397 // We sometimes fail to match PACK if we demand known undef elements.
40398 if (Depth == 0 &&
40399 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40400 RootOpc == X86ISD::PACKUS))
40401 return SDValue(); // Nothing to do!
40402 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40403 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40404 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40405 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40406 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40407 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40408 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40409 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40410 return DAG.getBitcast(RootVT, Res);
40411 }
40412 }
40413
40414 // Don't try to re-form single instruction chains under any circumstances now
40415 // that we've done encoding canonicalization for them.
40416 if (Depth < 1)
40417 return SDValue();
40418
40419 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40420 return isTargetShuffleVariableMask(N->getOpcode());
40421 });
40422 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40423 return (N->getOpcode() == X86ISD::VPERMV3 ||
40424 N->getOpcode() == X86ISD::VPERMV);
40425 });
40426
40427 // Depth threshold above which we can efficiently use variable mask shuffles.
40428 int VariableCrossLaneShuffleDepth =
40429 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40430 int VariablePerLaneShuffleDepth =
40431 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40432 AllowVariableCrossLaneMask &=
40433 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40434 AllowVariablePerLaneMask &=
40435 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40436 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40437 // higher depth before combining them.
40438 int BWIVPERMV3ShuffleDepth =
40439 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40440 bool AllowBWIVPERMV3 =
40441 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40442
40443 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40444 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40445 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40446
40447 bool MaskContainsZeros = isAnyZero(Mask);
40448
40449 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40450 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40451 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40452 if (Subtarget.hasAVX2() &&
40453 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40454 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40455 Res = CanonicalizeShuffleInput(MaskVT, V1);
40456 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40457 return DAG.getBitcast(RootVT, Res);
40458 }
40459 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40460 if ((Subtarget.hasAVX512() &&
40461 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40462 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40463 (Subtarget.hasBWI() &&
40464 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40465 (Subtarget.hasVBMI() &&
40466 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40467 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40468 V2 = DAG.getUNDEF(MaskVT);
40469 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40470 return DAG.getBitcast(RootVT, Res);
40471 }
40472 }
40473
40474 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40475 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40476 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40477 ((Subtarget.hasAVX512() &&
40478 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40479 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40480 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40481 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40482 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40483 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40484 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40485 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40486 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40487 for (unsigned i = 0; i != NumMaskElts; ++i)
40488 if (Mask[i] == SM_SentinelZero)
40489 Mask[i] = NumMaskElts + i;
40490 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40491 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40492 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40493 return DAG.getBitcast(RootVT, Res);
40494 }
40495
40496 // If that failed and either input is extracted then try to combine as a
40497 // shuffle with the larger type.
40498 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40499 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40500 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40501 IsMaskedShuffle, DAG, DL, Subtarget))
40502 return WideShuffle;
40503
40504 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40505 // (non-VLX will pad to 512-bit shuffles).
40506 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40507 ((Subtarget.hasAVX512() &&
40508 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40509 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40510 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40511 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40512 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40513 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40514 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40515 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40516 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40517 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40518 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40519 return DAG.getBitcast(RootVT, Res);
40520 }
40521 return SDValue();
40522 }
40523
40524 // See if we can combine a single input shuffle with zeros to a bit-mask,
40525 // which is much simpler than any shuffle.
40526 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40527 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40528 TLI.isTypeLegal(MaskVT)) {
40529 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40530 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40531 APInt UndefElts(NumMaskElts, 0);
40532 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
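// Build a constant mask: elements that stay in place get all-ones, zeroed
// elements stay all-zero, so a single AND/FAND reproduces the shuffle.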
40533 for (unsigned i = 0; i != NumMaskElts; ++i) {
40534 int M = Mask[i];
40535 if (M == SM_SentinelUndef) {
40536 UndefElts.setBit(i);
40537 continue;
40538 }
40539 if (M == SM_SentinelZero)
40540 continue;
40541 EltBits[i] = AllOnes;
40542 }
40543 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40544 Res = CanonicalizeShuffleInput(MaskVT, V1);
40545 unsigned AndOpcode =
40546 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40547 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40548 return DAG.getBitcast(RootVT, Res);
40549 }
40550
40551 // If we have a single input shuffle with different shuffle patterns in the
40552 // 128-bit lanes, use the variable mask form of VPERMILPS.
40553 // TODO: Combine other mask types at higher depths.
40554 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40555 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40556 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40557 SmallVector<SDValue, 16> VPermIdx;
40558 for (int M : Mask) {
40559 SDValue Idx =
40560 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40561 VPermIdx.push_back(Idx);
40562 }
40563 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40564 Res = CanonicalizeShuffleInput(MaskVT, V1);
40565 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40566 return DAG.getBitcast(RootVT, Res);
40567 }
40568
40569 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40570 // to VPERMIL2PD/VPERMIL2PS.
40571 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40572 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40573 MaskVT == MVT::v8f32)) {
40574 // VPERMIL2 Operation.
40575 // Bits[3] - Match Bit.
40576 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40577 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40578 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40579 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40580 SmallVector<int, 8> VPerm2Idx;
40581 unsigned M2ZImm = 0;
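// Each selector picks an element within its own 128-bit lane, offset by
// NumEltsPerLane when it comes from the second source; f64 indices are
// doubled so they land in the PD selector bits [2:1].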
40582 for (int M : Mask) {
40583 if (M == SM_SentinelUndef) {
40584 VPerm2Idx.push_back(-1);
40585 continue;
40586 }
40587 if (M == SM_SentinelZero) {
40588 M2ZImm = 2;
40589 VPerm2Idx.push_back(8);
40590 continue;
40591 }
40592 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40593 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40594 VPerm2Idx.push_back(Index);
40595 }
40596 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40597 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40598 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40599 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40600 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40601 return DAG.getBitcast(RootVT, Res);
40602 }
40603
40604 // If we have 3 or more shuffle instructions or a chain involving a variable
40605 // mask, we can replace them with a single PSHUFB instruction profitably.
40606 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40607 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40608 // more aggressive.
40609 if (UnaryShuffle && AllowVariablePerLaneMask &&
40610 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40611 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40612 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40613 SmallVector<SDValue, 16> PSHUFBMask;
40614 int NumBytes = RootVT.getSizeInBits() / 8;
40615 int Ratio = NumBytes / NumMaskElts;
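// Expand each mask element into Ratio control bytes; a control byte with the
// top bit set (0x80) makes PSHUFB zero that destination byte.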
40616 for (int i = 0; i < NumBytes; ++i) {
40617 int M = Mask[i / Ratio];
40618 if (M == SM_SentinelUndef) {
40619 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40620 continue;
40621 }
40622 if (M == SM_SentinelZero) {
40623 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40624 continue;
40625 }
40626 M = Ratio * M + i % Ratio;
40627 assert((M / 16) == (i / 16) && "Lane crossing detected");
40628 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40629 }
40630 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40631 Res = CanonicalizeShuffleInput(ByteVT, V1);
40632 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40633 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40634 return DAG.getBitcast(RootVT, Res);
40635 }
40636
40637 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40638 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40639 // slower than PSHUFB on targets that support both.
40640 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40641 Subtarget.hasXOP()) {
40642 // VPPERM Mask Operation
40643 // Bits[4:0] - Byte Index (0 - 31)
40644 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40645 SmallVector<SDValue, 16> VPPERMMask;
40646 int NumBytes = 16;
40647 int Ratio = NumBytes / NumMaskElts;
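// As with PSHUFB above, 0x80 selects the ZERO operation (bits[7:5] = 4) with
// a don't-care byte index.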
40648 for (int i = 0; i < NumBytes; ++i) {
40649 int M = Mask[i / Ratio];
40650 if (M == SM_SentinelUndef) {
40651 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40652 continue;
40653 }
40654 if (M == SM_SentinelZero) {
40655 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40656 continue;
40657 }
40658 M = Ratio * M + i % Ratio;
40659 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40660 }
40661 MVT ByteVT = MVT::v16i8;
40662 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40663 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40664 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40665 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40666 return DAG.getBitcast(RootVT, Res);
40667 }
40668
40669 // If that failed and either input is extracted then try to combine as a
40670 // shuffle with the larger type.
40671 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40672 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40673 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40674 DAG, DL, Subtarget))
40675 return WideShuffle;
40676
40677 // If we have a dual input shuffle then lower to VPERMV3,
40678 // (non-VLX will pad to 512-bit shuffles)
40679 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40680 ((Subtarget.hasAVX512() &&
40681 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40682 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40683 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40684 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40685 MaskVT == MVT::v16i32)) ||
40686 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40687 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40688 MaskVT == MVT::v32i16)) ||
40689 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40690 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40691 MaskVT == MVT::v64i8)))) {
40692 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40693 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40694 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40695 return DAG.getBitcast(RootVT, Res);
40696 }
40697
40698 // Failed to find any combines.
40699 return SDValue();
40700}
40701
40702// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40703// instruction if possible.
40704//
40705// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40706// type size to attempt to combine:
40707// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40708// -->
40709// extract_subvector(shuffle(x,y,m2),0)
40710 static SDValue combineX86ShuffleChainWithExtract(
40711 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40712 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40713 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40714 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40715 const X86Subtarget &Subtarget) {
40716 unsigned NumMaskElts = BaseMask.size();
40717 unsigned NumInputs = Inputs.size();
40718 if (NumInputs == 0)
40719 return SDValue();
40720
40721 unsigned RootSizeInBits = RootVT.getSizeInBits();
40722 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40723 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40724
40725 // Peek through subvectors to find the widest legal vector.
40726 // TODO: Handle ISD::TRUNCATE
40727 unsigned WideSizeInBits = RootSizeInBits;
40728 for (SDValue Input : Inputs) {
40729 Input = peekThroughBitcasts(Input);
40730 while (1) {
40731 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40732 Input = peekThroughBitcasts(Input.getOperand(0));
40733 continue;
40734 }
40735 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40736 Input.getOperand(0).isUndef() &&
40737 isNullConstant(Input.getOperand(2))) {
40738 Input = peekThroughBitcasts(Input.getOperand(1));
40739 continue;
40740 }
40741 break;
40742 }
40743 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40744 WideSizeInBits < Input.getValueSizeInBits())
40745 WideSizeInBits = Input.getValueSizeInBits();
40746 }
40747
40748 // Bail if we fail to find a source larger than the existing root.
40749 if (WideSizeInBits <= RootSizeInBits ||
40750 (WideSizeInBits % RootSizeInBits) != 0)
40751 return SDValue();
40752
40753 // Create new mask for larger type.
40754 SmallVector<int, 64> WideMask;
40755 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40756
40757 // Attempt to peek through inputs and adjust mask when we extract from an
40758 // upper subvector.
40759 int AdjustedMasks = 0;
40760 SmallVector<SDValue, 4> WideInputs(Inputs);
40761 for (unsigned I = 0; I != NumInputs; ++I) {
40762 SDValue &Input = WideInputs[I];
40763 Input = peekThroughBitcasts(Input);
40764 while (1) {
40765 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40766 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40767 uint64_t Idx = Input.getConstantOperandVal(1);
40768 if (Idx != 0) {
40769 ++AdjustedMasks;
40770 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40771 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40772
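// Idx is now measured in root mask elements; bias every mask entry that
// refers to this input so it indexes into the wider source vector.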
40773 int lo = I * WideMask.size();
40774 int hi = (I + 1) * WideMask.size();
40775 for (int &M : WideMask)
40776 if (lo <= M && M < hi)
40777 M += Idx;
40778 }
40779 Input = peekThroughBitcasts(Input.getOperand(0));
40780 continue;
40781 }
40782 // TODO: Handle insertions into upper subvectors.
40783 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40784 Input.getOperand(0).isUndef() &&
40785 isNullConstant(Input.getOperand(2))) {
40786 Input = peekThroughBitcasts(Input.getOperand(1));
40787 continue;
40788 }
40789 break;
40790 }
40791 }
40792
40793 // Remove unused/repeated shuffle source ops.
40794 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40795 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40796
40797 // Bail if we're always extracting from the lowest subvectors
40798 // (combineX86ShuffleChain should match this for the current width), or if
40799 // the shuffle still references too many inputs.
40800 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40801 return SDValue();
40802
40803 // Minor canonicalization of the accumulated shuffle mask to make it easier
40804 // to match below. All this does is detect masks with sequential pairs of
40805 // elements, and shrink them to the half-width mask. It does this in a loop
40806 // so it will reduce the size of the mask to the minimal width mask which
40807 // performs an equivalent shuffle.
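// E.g. <0,1,2,3,8,9,10,11> widens to <0,1,4,5> and then to <0,2>, turning an
// 8 x 16-bit mask into an equivalent 2 x 64-bit mask.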
40808 while (WideMask.size() > 1) {
40809 SmallVector<int, 64> WidenedMask;
40810 if (!canWidenShuffleElements(WideMask, WidenedMask))
40811 break;
40812 WideMask = std::move(WidenedMask);
40813 }
40814
40815 // Canonicalization of binary shuffle masks to improve pattern matching by
40816 // commuting the inputs.
40817 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40818 ShuffleVectorSDNode::commuteMask(WideMask);
40819 std::swap(WideInputs[0], WideInputs[1]);
40820 }
40821
40822 // Increase depth for every upper subvector we've peeked through.
40823 Depth += AdjustedMasks;
40824
40825 // Attempt to combine wider chain.
40826 // TODO: Can we use a better Root?
40827 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40828 WideInputs.back().getValueSizeInBits()
40829 ? WideInputs.front()
40830 : WideInputs.back();
40831 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40832 "WideRootSize mismatch");
40833
40834 if (SDValue WideShuffle = combineX86ShuffleChain(
40835 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40836 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40837 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40838 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40839 return DAG.getBitcast(RootVT, WideShuffle);
40840 }
40841
40842 return SDValue();
40843}
40844
40845// Canonicalize the combined shuffle mask chain with horizontal ops.
40846// NOTE: This may update the Ops and Mask.
40847 static SDValue canonicalizeShuffleMaskWithHorizOp(
40848 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40849 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40850 const X86Subtarget &Subtarget) {
40851 if (Mask.empty() || Ops.empty())
40852 return SDValue();
40853
40854 SmallVector<SDValue> BC;
40855 for (SDValue Op : Ops)
40856 BC.push_back(peekThroughBitcasts(Op));
40857
40858 // All ops must be the same horizop + type.
40859 SDValue BC0 = BC[0];
40860 EVT VT0 = BC0.getValueType();
40861 unsigned Opcode0 = BC0.getOpcode();
40862 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40863 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40864 }))
40865 return SDValue();
40866
40867 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40868 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40869 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40870 if (!isHoriz && !isPack)
40871 return SDValue();
40872
40873 // Do all ops have a single use?
40874 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40875 return Op.hasOneUse() &&
40876 peekThroughBitcasts(Op).hasOneUse();
40877 });
40878
40879 int NumElts = VT0.getVectorNumElements();
40880 int NumLanes = VT0.getSizeInBits() / 128;
40881 int NumEltsPerLane = NumElts / NumLanes;
40882 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40883 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40884 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40885
40886 if (NumEltsPerLane >= 4 &&
40887 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40888 SmallVector<int> LaneMask, ScaledMask;
40889 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40890 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40891 // See if we can remove the shuffle by resorting the HOP chain so that
40892 // the HOP args are pre-shuffled.
40893 // TODO: Generalize to any sized/depth chain.
40894 // TODO: Add support for PACKSS/PACKUS.
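// E.g. for a v4f32 chain HADD(HADD(a,b),HADD(c,d)), a scaled mask of <0,2,1,3>
// selects {a,c,b,d} and can be rebuilt directly as HADD(HADD(a,c),HADD(b,d)),
// removing the shuffle entirely.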
40895 if (isHoriz) {
40896 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40897 auto GetHOpSrc = [&](int M) {
40898 if (M == SM_SentinelUndef)
40899 return DAG.getUNDEF(VT0);
40900 if (M == SM_SentinelZero)
40901 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40902 SDValue Src0 = BC[M / 4];
40903 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40904 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40905 return Src1.getOperand(M % 2);
40906 return SDValue();
40907 };
40908 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40909 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40910 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40911 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40912 if (M0 && M1 && M2 && M3) {
40913 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40914 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40915 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40916 }
40917 }
40918 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40919 if (Ops.size() >= 2) {
40920 SDValue LHS, RHS;
40921 auto GetHOpSrc = [&](int M, int &OutM) {
40922 // TODO: Support SM_SentinelZero
40923 if (M < 0)
40924 return M == SM_SentinelUndef;
40925 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40926 if (!LHS || LHS == Src) {
40927 LHS = Src;
40928 OutM = (M % 2);
40929 return true;
40930 }
40931 if (!RHS || RHS == Src) {
40932 RHS = Src;
40933 OutM = (M % 2) + 2;
40934 return true;
40935 }
40936 return false;
40937 };
40938 int PostMask[4] = {-1, -1, -1, -1};
40939 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40940 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40941 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40942 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40943 LHS = DAG.getBitcast(SrcVT, LHS);
40944 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40945 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40946 // Use SHUFPS for the permute so this will work on SSE2 targets,
40947 // shuffle combining and domain handling will simplify this later on.
40948 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40949 Res = DAG.getBitcast(ShuffleVT, Res);
40950 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40951 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40952 }
40953 }
40954 }
40955 }
40956
40957 if (2 < Ops.size())
40958 return SDValue();
40959
40960 SDValue BC1 = BC[BC.size() - 1];
40961 if (Mask.size() == VT0.getVectorNumElements()) {
40962 // Canonicalize binary shuffles of horizontal ops that use the
40963 // same sources to a unary shuffle.
40964 // TODO: Try to perform this fold even if the shuffle remains.
40965 if (Ops.size() == 2) {
40966 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40967 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40968 };
40969 // Commute if all BC0's ops are contained in BC1.
40970 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40971 ContainsOps(BC1, BC0.getOperand(1))) {
40972 ShuffleVectorSDNode::commuteMask(Mask);
40973 std::swap(Ops[0], Ops[1]);
40974 std::swap(BC0, BC1);
40975 }
40976
40977 // If BC1 can be represented by BC0, then convert to unary shuffle.
40978 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40979 ContainsOps(BC0, BC1.getOperand(1))) {
40980 for (int &M : Mask) {
40981 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40982 continue;
40983 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40984 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40985 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40986 M += NumHalfEltsPerLane;
40987 }
40988 }
40989 }
40990
40991 // Canonicalize unary horizontal ops to only refer to lower halves.
40992 for (int i = 0; i != NumElts; ++i) {
40993 int &M = Mask[i];
40994 if (isUndefOrZero(M))
40995 continue;
40996 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40997 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40998 M -= NumHalfEltsPerLane;
40999 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41000 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41001 M -= NumHalfEltsPerLane;
41002 }
41003 }
41004
41005 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41006 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41007 // represents the LHS/RHS inputs for the lower/upper halves.
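// E.g. for v4f32, shuffling HADD(a,b) and HADD(c,d) with mask <0,1,4,5> takes
// the low half of each hop, which is simply HADD(a,c).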
41008 SmallVector<int, 16> TargetMask128, WideMask128;
41009 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41010 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41011 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41012 bool SingleOp = (Ops.size() == 1);
41013 if (isPack || OneUseOps ||
41014 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41015 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41016 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41017 Lo = Lo.getOperand(WideMask128[0] & 1);
41018 Hi = Hi.getOperand(WideMask128[1] & 1);
41019 if (SingleOp) {
41020 SDValue Undef = DAG.getUNDEF(SrcVT);
41021 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41022 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41023 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41024 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41025 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41026 }
41027 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41028 }
41029 }
41030
41031 // If we are post-shuffling a 256-bit hop and not requiring the upper
41032 // elements, then try to narrow to a 128-bit hop directly.
41033 SmallVector<int, 16> WideMask64;
41034 if (Ops.size() == 1 && NumLanes == 2 &&
41035 scaleShuffleElements(Mask, 4, WideMask64) &&
41036 isUndefInRange(WideMask64, 2, 2)) {
41037 int M0 = WideMask64[0];
41038 int M1 = WideMask64[1];
41039 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41040 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
41041 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41042 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41043 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41044 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41045 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41046 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41047 }
41048 }
41049
41050 return SDValue();
41051}
41052
41053// Attempt to constant fold all of the constant source ops.
41054 // Returns the folded constant vector if the entire shuffle folds to a constant.
41055// TODO: Extend this to merge multiple constant Ops and update the mask.
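// E.g. a unary shuffle <1,-2,2,0> (-1 undef, -2 zero) of the constant vector
// <10,20,30,40> folds directly to the constant <20,0,30,10>.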
41056 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41057 ArrayRef<int> Mask,
41058 ArrayRef<const SDNode *> SrcNodes,
41059 SelectionDAG &DAG, const SDLoc &DL,
41060 const X86Subtarget &Subtarget) {
41061 unsigned SizeInBits = VT.getSizeInBits();
41062 unsigned NumMaskElts = Mask.size();
41063 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41064 unsigned NumOps = Ops.size();
41065
41066 // Extract constant bits from each source op.
41067 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41068 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41069 for (unsigned I = 0; I != NumOps; ++I)
41070 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41071 RawBitsOps[I],
41072 /*AllowWholeUndefs*/ true,
41073 /*AllowPartialUndefs*/ true))
41074 return SDValue();
41075
41076 // If we're optimizing for size, only fold if at least one of the constants is
41077 // only used once or the combined shuffle has included a variable mask
41078 // shuffle; this is to avoid constant pool bloat.
41079 bool IsOptimizingSize = DAG.shouldOptForSize();
41080 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41081 return isTargetShuffleVariableMask(N->getOpcode());
41082 });
41083 if (IsOptimizingSize && !HasVariableMask &&
41084 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41085 return SDValue();
41086
41087 // Shuffle the constant bits according to the mask.
41088 APInt UndefElts(NumMaskElts, 0);
41089 APInt ZeroElts(NumMaskElts, 0);
41090 APInt ConstantElts(NumMaskElts, 0);
41091 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41092 APInt::getZero(MaskSizeInBits));
41093 for (unsigned i = 0; i != NumMaskElts; ++i) {
41094 int M = Mask[i];
41095 if (M == SM_SentinelUndef) {
41096 UndefElts.setBit(i);
41097 continue;
41098 } else if (M == SM_SentinelZero) {
41099 ZeroElts.setBit(i);
41100 continue;
41101 }
41102 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41103
41104 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41105 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41106
41107 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41108 if (SrcUndefElts[SrcMaskIdx]) {
41109 UndefElts.setBit(i);
41110 continue;
41111 }
41112
41113 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41114 APInt &Bits = SrcEltBits[SrcMaskIdx];
41115 if (!Bits) {
41116 ZeroElts.setBit(i);
41117 continue;
41118 }
41119
41120 ConstantElts.setBit(i);
41121 ConstantBitData[i] = Bits;
41122 }
41123 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41124
41125 // Attempt to create a zero vector.
41126 if ((UndefElts | ZeroElts).isAllOnes())
41127 return getZeroVector(VT, Subtarget, DAG, DL);
41128
41129 // Create the constant data.
41130 MVT MaskSVT;
41131 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41132 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41133 else
41134 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41135
41136 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41137 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41138 return SDValue();
41139
41140 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41141 return DAG.getBitcast(VT, CstOp);
41142}
41143
41144namespace llvm {
41145 namespace X86 {
41146 enum {
41147 MaxShuffleCombineDepth = 8
41148 };
41149 } // namespace X86
41150} // namespace llvm
41151
41152/// Fully generic combining of x86 shuffle instructions.
41153///
41154/// This should be the last combine run over the x86 shuffle instructions. Once
41155/// they have been fully optimized, this will recursively consider all chains
41156/// of single-use shuffle instructions, build a generic model of the cumulative
41157/// shuffle operation, and check for simpler instructions which implement this
41158/// operation. We use this primarily for two purposes:
41159///
41160/// 1) Collapse generic shuffles to specialized single instructions when
41161/// equivalent. In most cases, this is just an encoding size win, but
41162/// sometimes we will collapse multiple generic shuffles into a single
41163/// special-purpose shuffle.
41164/// 2) Look for sequences of shuffle instructions with 3 or more total
41165/// instructions, and replace them with the slightly more expensive SSSE3
41166/// PSHUFB instruction if available. We do this as the last combining step
41167/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41168/// a suitable short sequence of other instructions. The PSHUFB will either
41169/// use a register or have to read from memory and so is slightly (but only
41170/// slightly) more expensive than the other shuffle instructions.
41171///
41172/// Because this is inherently a quadratic operation (for each shuffle in
41173/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41174/// This should never be an issue in practice as the shuffle lowering doesn't
41175/// produce sequences of more than 8 instructions.
41176///
41177/// FIXME: We will currently miss some cases where the redundant shuffling
41178/// would simplify under the threshold for PSHUFB formation because of
41179/// combine-ordering. To fix this, we should do the redundant instruction
41180/// combining in this recursive walk.
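// For example, a chain such as PSHUFD(UNPCKL(x,y)) is modelled here as one
// accumulated mask over the inputs {x,y}, which may then match a single
// cheaper shuffle node instead of the original two.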
41181 static SDValue combineX86ShufflesRecursively(
41182 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41183 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41184 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41185 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41186 const SDLoc &DL, const X86Subtarget &Subtarget) {
41187 assert(!RootMask.empty() &&
41188 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41189 "Illegal shuffle root mask");
41190 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41191 unsigned RootSizeInBits = RootVT.getSizeInBits();
41192 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41193
41194 // Bound the depth of our recursive combine because this is ultimately
41195 // quadratic in nature.
41196 if (Depth >= MaxDepth)
41197 return SDValue();
41198
41199 // Directly rip through bitcasts to find the underlying operand.
41200 SDValue Op = SrcOps[SrcOpIndex];
41201 Op = peekThroughBitcasts(Op);
41202
41203 EVT VT = Op.getValueType();
41204 if (!VT.isVector() || !VT.isSimple())
41205 return SDValue(); // Bail if we hit a non-simple non-vector.
41206
41207 // FIXME: Just bail on f16 for now.
41208 if (VT.getVectorElementType() == MVT::f16)
41209 return SDValue();
41210
41211 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41212 "Can only combine shuffles upto size of the root op.");
41213
41214 // Create a demanded elts mask from the referenced elements of Op.
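// E.g. for SrcOpIndex == 1 and a 4-element root mask <0,5,1,6>, only mask
// values in [4,8) refer to this op, so OpDemandedElts gets bits 1 and 2 set.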
41215 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41216 for (int M : RootMask) {
41217 int BaseIdx = RootMask.size() * SrcOpIndex;
41218 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41219 OpDemandedElts.setBit(M - BaseIdx);
41220 }
41221 if (RootSizeInBits != VT.getSizeInBits()) {
41222 // Op is smaller than Root - extract the demanded elts for the subvector.
41223 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41224 unsigned NumOpMaskElts = RootMask.size() / Scale;
41225 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41226 assert(OpDemandedElts
41227 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41228 .isZero() &&
41229 "Out of range elements referenced in root mask");
41230 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41231 }
41232 OpDemandedElts =
41233 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41234
41235 // Extract target shuffle mask and resolve sentinels and inputs.
41236 SmallVector<int, 64> OpMask;
41237 SmallVector<SDValue, 2> OpInputs;
41238 APInt OpUndef, OpZero;
41239 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41240 OpZero, DAG, Depth, false)) {
41241 // Shuffle inputs must not be larger than the shuffle result.
41242 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41243 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41244 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41245 }))
41246 return SDValue();
41247 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41248 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41249 !isNullConstant(Op.getOperand(1))) {
41250 SDValue SrcVec = Op.getOperand(0);
41251 int ExtractIdx = Op.getConstantOperandVal(1);
41252 unsigned NumElts = VT.getVectorNumElements();
41253 OpInputs.assign({SrcVec});
41254 OpMask.assign(NumElts, SM_SentinelUndef);
41255 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41256 OpZero = OpUndef = APInt::getZero(NumElts);
41257 } else {
41258 return SDValue();
41259 }
41260
41261 // If the shuffle result was smaller than the root, we need to adjust the
41262 // mask indices and pad the mask with undefs.
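// E.g. a 128-bit two-input op under a 256-bit root: OpMask <0,4,1,5> becomes
// <0,8,1,9> (each input's index block is rescaled) and is then padded with
// four undef elements for the missing upper half.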
41263 if (RootSizeInBits > VT.getSizeInBits()) {
41264 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41265 unsigned OpMaskSize = OpMask.size();
41266 if (OpInputs.size() > 1) {
41267 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41268 for (int &M : OpMask) {
41269 if (M < 0)
41270 continue;
41271 int EltIdx = M % OpMaskSize;
41272 int OpIdx = M / OpMaskSize;
41273 M = (PaddedMaskSize * OpIdx) + EltIdx;
41274 }
41275 }
41276 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41277 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41278 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41279 }
41280
41281 SmallVector<int, 64> Mask;
41282 SmallVector<SDValue, 16> Ops;
41283
41284 // We don't need to merge masks if the root is empty.
41285 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41286 if (EmptyRoot) {
41287 // Only resolve zeros if it will remove an input, otherwise we might end
41288 // up in an infinite loop.
41289 bool ResolveKnownZeros = true;
41290 if (!OpZero.isZero()) {
41291 APInt UsedInputs = APInt::getZero(OpInputs.size());
41292 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41293 int M = OpMask[i];
41294 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41295 continue;
41296 UsedInputs.setBit(M / OpMask.size());
41297 if (UsedInputs.isAllOnes()) {
41298 ResolveKnownZeros = false;
41299 break;
41300 }
41301 }
41302 }
41303 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41304 ResolveKnownZeros);
41305
41306 Mask = OpMask;
41307 Ops.append(OpInputs.begin(), OpInputs.end());
41308 } else {
41309 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41310
41311 // Add the inputs to the Ops list, avoiding duplicates.
41312 Ops.append(SrcOps.begin(), SrcOps.end());
41313
41314 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41315 // Attempt to find an existing match.
41316 SDValue InputBC = peekThroughBitcasts(Input);
41317 for (int i = 0, e = Ops.size(); i < e; ++i)
41318 if (InputBC == peekThroughBitcasts(Ops[i]))
41319 return i;
41320 // Match failed - should we replace an existing Op?
41321 if (InsertionPoint >= 0) {
41322 Ops[InsertionPoint] = Input;
41323 return InsertionPoint;
41324 }
41325 // Add to the end of the Ops list.
41326 Ops.push_back(Input);
41327 return Ops.size() - 1;
41328 };
41329
41330 SmallVector<int, 2> OpInputIdx;
41331 for (SDValue OpInput : OpInputs)
41332 OpInputIdx.push_back(
41333 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41334
41335 assert(((RootMask.size() > OpMask.size() &&
41336 RootMask.size() % OpMask.size() == 0) ||
41337 (OpMask.size() > RootMask.size() &&
41338 OpMask.size() % RootMask.size() == 0) ||
41339 OpMask.size() == RootMask.size()) &&
41340 "The smaller number of elements must divide the larger.");
41341
41342 // This function can be performance-critical, so we rely on the power-of-2
41343 // knowledge that we have about the mask sizes to replace div/rem ops with
41344 // bit-masks and shifts.
41345 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41346 "Non-power-of-2 shuffle mask sizes");
41347 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41348 "Non-power-of-2 shuffle mask sizes");
41349 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41350 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41351
41352 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41353 unsigned RootRatio =
41354 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41355 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41356 assert((RootRatio == 1 || OpRatio == 1) &&
41357 "Must not have a ratio for both incoming and op masks!");
41358
41359 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41360 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41361 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41362 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41363 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41364
41365 Mask.resize(MaskWidth, SM_SentinelUndef);
41366
41367 // Merge this shuffle operation's mask into our accumulated mask. Note that
41368 // this shuffle's mask will be the first applied to the input, followed by
41369 // the root mask to get us all the way to the root value arrangement. The
41370 // reason for this order is that we are recursing up the operation chain.
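// E.g. with equal widths and a single input, an op mask of <2,3,0,1> followed
// by a root mask of <1,0,3,2> accumulates to <3,2,1,0>, since output lane i
// reads OpMask[RootMask[i]].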
41371 for (unsigned i = 0; i < MaskWidth; ++i) {
41372 unsigned RootIdx = i >> RootRatioLog2;
41373 if (RootMask[RootIdx] < 0) {
41374 // This is a zero or undef lane, we're done.
41375 Mask[i] = RootMask[RootIdx];
41376 continue;
41377 }
41378
41379 unsigned RootMaskedIdx =
41380 RootRatio == 1
41381 ? RootMask[RootIdx]
41382 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41383
41384 // Just insert the scaled root mask value if it references an input other
41385 // than the SrcOp we're currently inserting.
41386 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41387 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41388 Mask[i] = RootMaskedIdx;
41389 continue;
41390 }
41391
41392 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41393 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41394 if (OpMask[OpIdx] < 0) {
41395 // The incoming lanes are zero or undef, it doesn't matter which ones we
41396 // are using.
41397 Mask[i] = OpMask[OpIdx];
41398 continue;
41399 }
41400
41401 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41402 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41403 : (OpMask[OpIdx] << OpRatioLog2) +
41404 (RootMaskedIdx & (OpRatio - 1));
41405
41406 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41407 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41408 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41409 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41410
41411 Mask[i] = OpMaskedIdx;
41412 }
41413 }
41414
41415 // Peek through any free bitcasts to insert_subvector vector widenings or
41416 // extract_subvector nodes back to root size.
41417 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41418 for (auto [I, Op] : enumerate(Ops)) {
41419 SDValue BC = Op;
41420 while (1) {
41421 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41422 BC = BC.getOperand(0);
41423 continue;
41424 }
41425 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41426 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41427 // Set out of bounds mask indices to undef.
41428 Op = BC = BC.getOperand(1);
41429 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41430 int Lo = I * Mask.size();
41431 int Hi = (I + 1) * Mask.size();
41432 int NewHi = Lo + (Mask.size() / Scale);
41433 for (int &M : Mask) {
41434 if (Lo <= M && NewHi <= M && M < Hi)
41435 M = SM_SentinelUndef;
41436 }
41437 continue;
41438 }
41439 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41440 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41441 isNullConstant(BC.getOperand(1))) {
41442 Op = BC = BC.getOperand(0);
41443 continue;
41444 }
41445 break;
41446 }
41447 }
41448
41449 // Remove unused/repeated shuffle source ops.
41450 resolveTargetShuffleInputsAndMask(Ops, Mask);
41451
41452 // Handle the all undef/zero/ones cases early.
41453 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41454 return DAG.getUNDEF(RootVT);
41455 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41456 return getZeroVector(RootVT, Subtarget, DAG, DL);
41457 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41458 !llvm::is_contained(Mask, SM_SentinelZero))
41459 return getOnesVector(RootVT, DAG, DL);
41460
41461 assert(!Ops.empty() && "Shuffle with no inputs detected");
41462
41463 // Update the list of shuffle nodes that have been combined so far.
41464 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41465 CombinedNodes.push_back(Op.getNode());
41466
41467 // See if we can recurse into each shuffle source op (if it's a target
41468 // shuffle). The source op should only be generally combined if it either has
41469 // a single use (i.e. current Op) or all its users have already been combined,
41470 // if not, we can still combine but should prevent generation of variable
41471 // shuffles to avoid constant pool bloat.
41472 // Don't recurse if we already have more source ops than we can combine in
41473 // the remaining recursion depth.
41474 if (Ops.size() < (MaxDepth - Depth)) {
41475 for (int i = 0, e = Ops.size(); i < e; ++i) {
41476 // For empty roots, we need to resolve zeroable elements before combining
41477 // them with other shuffles.
41478 SmallVector<int, 64> ResolvedMask = Mask;
41479 if (EmptyRoot)
41480 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41481 bool AllowCrossLaneVar = false;
41482 bool AllowPerLaneVar = false;
41483 if (Ops[i].getNode()->hasOneUse() ||
41484 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41485 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41486 AllowPerLaneVar = AllowVariablePerLaneMask;
41487 }
41488 if (SDValue Res = combineX86ShufflesRecursively(
41489 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41490 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41491 DAG, DL, Subtarget))
41492 return Res;
41493 }
41494 }
41495
41496 // Attempt to constant fold all of the constant source ops.
41497 if (SDValue Cst = combineX86ShufflesConstants(
41498 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41499 return Cst;
41500
41501 // If constant fold failed and we only have constants - then we have
41502 // multiple uses by a single non-variable shuffle - just bail.
41503 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41504 APInt UndefElts;
41505 SmallVector<APInt> RawBits;
41506 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41507 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41508 RawBits,
41509 /*AllowWholeUndefs*/ true,
41510 /*AllowPartialUndefs*/ true);
41511 })) {
41512 return SDValue();
41513 }
41514
41515 // Canonicalize the combined shuffle mask chain with horizontal ops.
41516 // NOTE: This will update the Ops and Mask.
41517 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41518 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41519 return DAG.getBitcast(RootVT, HOp);
41520
41521 // Try to refine our inputs given our knowledge of target shuffle mask.
41522 for (auto I : enumerate(Ops)) {
41523 int OpIdx = I.index();
41524 SDValue &Op = I.value();
41525
41526 // What range of shuffle mask element values results in picking from Op?
41527 int Lo = OpIdx * Mask.size();
41528 int Hi = Lo + Mask.size();
41529
41530 // Which elements of Op do we demand, given the mask's granularity?
41531 APInt OpDemandedElts(Mask.size(), 0);
41532 for (int MaskElt : Mask) {
41533 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41534 int OpEltIdx = MaskElt - Lo;
41535 OpDemandedElts.setBit(OpEltIdx);
41536 }
41537 }
41538
41539 // Is the shuffle result smaller than the root?
41540 if (Op.getValueSizeInBits() < RootSizeInBits) {
41541 // We padded the mask with undefs. But we now need to undo that.
41542 unsigned NumExpectedVectorElts = Mask.size();
41543 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41544 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41545 assert(!OpDemandedElts.extractBits(
41546 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41547 "Demanding the virtual undef widening padding?");
41548 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41549 }
41550
41551 // The Op itself may be of different VT, so we need to scale the mask.
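// E.g. scaling demanded elements 0b0101 from 4 mask elements up to an
// 8-element op marks element pairs {0,1} and {4,5}, i.e. 0b00110011.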
41552 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41553 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41554
41555 // Can this operand be simplified any further, given its demanded elements?
41556 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41557 Op, OpScaledDemandedElts, DAG))
41558 Op = NewOp;
41559 }
41560 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41561
41562 // Widen any subvector shuffle inputs we've collected.
41563 // TODO: Remove this to avoid generating temporary nodes, we should only
41564 // widen once combineX86ShuffleChain has found a match.
41565 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41566 return Op.getValueSizeInBits() < RootSizeInBits;
41567 })) {
41568 for (SDValue &Op : Ops)
41569 if (Op.getValueSizeInBits() < RootSizeInBits)
41570 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41571 RootSizeInBits);
41572 // Reresolve - we might have repeated subvector sources.
41573 resolveTargetShuffleInputsAndMask(Ops, Mask);
41574 }
41575
41576 // We can only combine unary and binary shuffle mask cases.
41577 if (Ops.size() <= 2) {
41578 // Minor canonicalization of the accumulated shuffle mask to make it easier
41579 // to match below. All this does is detect masks with sequential pairs of
41580 // elements, and shrink them to the half-width mask. It does this in a loop
41581 // so it will reduce the size of the mask to the minimal width mask which
41582 // performs an equivalent shuffle.
41583 while (Mask.size() > 1) {
41584 SmallVector<int, 64> WidenedMask;
41585 if (!canWidenShuffleElements(Mask, WidenedMask))
41586 break;
41587 Mask = std::move(WidenedMask);
41588 }
41589
41590 // Canonicalization of binary shuffle masks to improve pattern matching by
41591 // commuting the inputs.
41592 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41593 ShuffleVectorSDNode::commuteMask(Mask);
41594 std::swap(Ops[0], Ops[1]);
41595 }
41596
41597 // Try to combine into a single shuffle instruction.
41598 if (SDValue Shuffle = combineX86ShuffleChain(
41599 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41600 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41601 IsMaskedShuffle, DAG, DL, Subtarget))
41602 return Shuffle;
41603
41604 // If all the operands come from the same larger vector, fallthrough and try
41605 // to use combineX86ShuffleChainWithExtract.
41606 SDValue LHS = peekThroughBitcasts(Ops.front());
41607 SDValue RHS = peekThroughBitcasts(Ops.back());
41608 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41609 (RootSizeInBits / Mask.size()) != 64 ||
41610 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41611 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41612 LHS.getOperand(0) != RHS.getOperand(0))
41613 return SDValue();
41614 }
41615
41616 // If that failed and any input is extracted then try to combine as a
41617 // shuffle with the larger type.
41618 return combineX86ShuffleChainWithExtract(
41619 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41620 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41621 DAG, DL, Subtarget);
41622}
41623
41624/// Helper entry wrapper to combineX86ShufflesRecursively.
41625 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41626 const X86Subtarget &Subtarget) {
41627 return combineX86ShufflesRecursively(
41628 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41629 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41630 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41631 SDLoc(Op), Subtarget);
41632}
41633
41634/// Get the PSHUF-style mask from PSHUF node.
41635///
41636 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41637/// PSHUF-style masks that can be reused with such instructions.
41638 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41639 MVT VT = N.getSimpleValueType();
41640 SmallVector<int, 4> Mask;
41641 SmallVector<SDValue, 2> Ops;
41642 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41643 (void)HaveMask;
41644 assert(HaveMask);
41645
41646 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41647 // matter. Check that the upper masks are repeats and remove them.
41648 if (VT.getSizeInBits() > 128) {
41649 int LaneElts = 128 / VT.getScalarSizeInBits();
41650#ifndef NDEBUG
41651 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41652 for (int j = 0; j < LaneElts; ++j)
41653 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41654 "Mask doesn't repeat in high 128-bit lanes!");
41655#endif
41656 Mask.resize(LaneElts);
41657 }
41658
41659 switch (N.getOpcode()) {
41660 case X86ISD::PSHUFD:
41661 return Mask;
41662 case X86ISD::PSHUFLW:
41663 Mask.resize(4);
41664 return Mask;
41665 case X86ISD::PSHUFHW:
41666 Mask.erase(Mask.begin(), Mask.begin() + 4);
41667 for (int &M : Mask)
41668 M -= 4;
41669 return Mask;
41670 default:
41671 llvm_unreachable("No valid shuffle instruction found!");
41672 }
41673}
41674
41675/// Get the expanded blend mask from a BLENDI node.
41676/// For v16i16 nodes, this will splat the repeated i8 mask.
41678 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41679 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41680 APInt Mask = V.getConstantOperandAPInt(2);
41681 if (Mask.getBitWidth() > NumElts)
41682 Mask = Mask.trunc(NumElts);
41683 if (NumElts == 16) {
41684 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41685 Mask = APInt::getSplat(16, Mask);
41686 }
41687 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41688 return Mask;
41689}
41690
41691/// Search for a combinable shuffle across a chain ending in pshufd.
41692///
41693/// We walk up the chain and look for a combinable shuffle, skipping over
41694/// shuffles that we could hoist this shuffle's transformation past without
41695/// altering anything.
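// E.g. PSHUFD<1,0,3,2>(PSHUFD<2,3,0,1>(x)) merges to PSHUFD<3,2,1,0>(x), since
// the outer mask indexes into the inner one.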
41696 static SDValue combineRedundantDwordShuffle(SDValue N,
41697 MutableArrayRef<int> Mask,
41698 const SDLoc &DL,
41699 SelectionDAG &DAG) {
41700 assert(N.getOpcode() == X86ISD::PSHUFD &&
41701 "Called with something other than an x86 128-bit half shuffle!");
41702
41703 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41704 // of the shuffles in the chain so that we can form a fresh chain to replace
41705 // this one.
41706 SmallVector<SDValue, 8> Chain;
41707 SDValue V = N.getOperand(0);
41708 for (; V.hasOneUse(); V = V.getOperand(0)) {
41709 switch (V.getOpcode()) {
41710 default:
41711 return SDValue(); // Nothing combined!
41712
41713 case ISD::BITCAST:
41714 // Skip bitcasts as we always know the type for the target specific
41715 // instructions.
41716 continue;
41717
41718 case X86ISD::PSHUFD:
41719 // Found another dword shuffle.
41720 break;
41721
41722 case X86ISD::PSHUFLW:
41723 // Check that the low words (being shuffled) are the identity in the
41724 // dword shuffle, and the high words are self-contained.
41725 if (Mask[0] != 0 || Mask[1] != 1 ||
41726 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41727 return SDValue();
41728
41729 Chain.push_back(V);
41730 continue;
41731
41732 case X86ISD::PSHUFHW:
41733 // Check that the high words (being shuffled) are the identity in the
41734 // dword shuffle, and the low words are self-contained.
41735 if (Mask[2] != 2 || Mask[3] != 3 ||
41736 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41737 return SDValue();
41738
41739 Chain.push_back(V);
41740 continue;
41741
41742 case X86ISD::UNPCKL:
41743 case X86ISD::UNPCKH:
41744 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41745 // shuffle into a preceding word shuffle.
41746 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41747 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41748 return SDValue();
41749
41750 // Search for a half-shuffle which we can combine with.
41751 unsigned CombineOp =
41752 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41753 if (V.getOperand(0) != V.getOperand(1) ||
41754 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41755 return SDValue();
41756 Chain.push_back(V);
41757 V = V.getOperand(0);
41758 do {
41759 switch (V.getOpcode()) {
41760 default:
41761 return SDValue(); // Nothing to combine.
41762
41763 case X86ISD::PSHUFLW:
41764 case X86ISD::PSHUFHW:
41765 if (V.getOpcode() == CombineOp)
41766 break;
41767
41768 Chain.push_back(V);
41769
41770 [[fallthrough]];
41771 case ISD::BITCAST:
41772 V = V.getOperand(0);
41773 continue;
41774 }
41775 break;
41776 } while (V.hasOneUse());
41777 break;
41778 }
41779 // Break out of the loop if we break out of the switch.
41780 break;
41781 }
41782
41783 if (!V.hasOneUse())
41784 // We fell out of the loop without finding a viable combining instruction.
41785 return SDValue();
41786
41787 // Merge this node's mask and our incoming mask.
41788 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41789 for (int &M : Mask)
41790 M = VMask[M];
41791 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41792 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41793
41794 // Rebuild the chain around this new shuffle.
41795 while (!Chain.empty()) {
41796 SDValue W = Chain.pop_back_val();
41797
41798 if (V.getValueType() != W.getOperand(0).getValueType())
41799 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41800
41801 switch (W.getOpcode()) {
41802 default:
41803 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41804
41805 case X86ISD::UNPCKL:
41806 case X86ISD::UNPCKH:
41807 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41808 break;
41809
41810 case X86ISD::PSHUFD:
41811 case X86ISD::PSHUFLW:
41812 case X86ISD::PSHUFHW:
41813 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41814 break;
41815 }
41816 }
41817 if (V.getValueType() != N.getValueType())
41818 V = DAG.getBitcast(N.getValueType(), V);
41819
41820 // Return the new chain to replace N.
41821 return V;
41822}
41823
41824// Attempt to commute shufps LHS loads:
41825// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41826 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41827 SelectionDAG &DAG) {
41828 // TODO: Add vXf64 support.
41829 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41830 return SDValue();
41831
41832 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41833 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41834 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41835 return SDValue();
41836 SDValue N0 = V.getOperand(0);
41837 SDValue N1 = V.getOperand(1);
41838 unsigned Imm = V.getConstantOperandVal(2);
41839 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41840 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41841 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41842 return SDValue();
41843 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41844 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41845 DAG.getTargetConstant(Imm, DL, MVT::i8));
41846 };
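// Swapping the SHUFP operands swaps which 64-bit half of the result each
// source feeds, hence the nibble swap above; the callers below compensate by
// flipping the lane-selection bits of their own immediates (e.g. ^ 0xAA flips
// bit 1 of every 2-bit v4f32 index).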
41847
41848 switch (N.getOpcode()) {
41849 case X86ISD::VPERMILPI:
41850 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41851 unsigned Imm = N.getConstantOperandVal(1);
41852 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41853 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41854 }
41855 break;
41856 case X86ISD::SHUFP: {
41857 SDValue N0 = N.getOperand(0);
41858 SDValue N1 = N.getOperand(1);
41859 unsigned Imm = N.getConstantOperandVal(2);
41860 if (N0 == N1) {
41861 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41862 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41863 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41864 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41865 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41866 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41867 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41868 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41869 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41870 }
41871 break;
41872 }
41873 }
41874
41875 return SDValue();
41876}
41877
41878// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41879// iff we don't demand the same element index for both X and Y.
41880static SDValue
41881 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41882 const APInt &DemandedElts, SelectionDAG &DAG,
41883 const X86Subtarget &Subtarget, const SDLoc &DL) {
41884 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41885 if (!N0.hasOneUse() || !N1.hasOneUse())
41886 return SDValue();
41887
41888 unsigned NumElts = VT.getVectorNumElements();
41889 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41890 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41891
41892 // See if both operands are shuffles, and that we can scale the shuffle masks
41893 // to the same width as the blend mask.
41894 // TODO: Support SM_SentinelZero?
41895 SmallVector<SDValue, 2> Ops0, Ops1;
41896 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41897 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41898 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41899 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41900 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41901 return SDValue();
41902
41903 // Determine the demanded elts from both permutes.
41904 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41905 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41906 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41907 Demanded1,
41908 /*AllowUndefElts=*/true) ||
41909 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41910 DemandedRHS0, /*AllowUndefElts=*/true) ||
41911 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41912 DemandedRHS1, /*AllowUndefElts=*/true))
41913 return SDValue();
41914
41915 // Confirm that we only use a single operand from both permutes and that we
41916 // don't demand the same index from both.
41917 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41918 DemandedLHS0.intersects(DemandedLHS1))
41919 return SDValue();
41920
41921 // Use the permute demanded elts masks as the new blend mask.
41922 // Create the new permute mask as a blend of the 2 original permute masks.
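// E.g. for v4i32: blend<0,5,2,7>(permute<1,0,3,2>(X), permute<1,0,3,2>(Y))
// becomes permute<1,0,3,2>(blend<4,1,6,3>(X, Y)), i.e. one permute instead of
// two.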
41923 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41924 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41925 for (unsigned I = 0; I != NumElts; ++I) {
41926 if (Demanded0[I]) {
41927 int M = ScaledMask0[I];
41928 if (0 <= M) {
41929 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41930 "BlendMask demands LHS AND RHS");
41931 NewBlendMask[M] = M;
41932 NewPermuteMask[I] = M;
41933 }
41934 } else if (Demanded1[I]) {
41935 int M = ScaledMask1[I];
41936 if (0 <= M) {
41937 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41938 "BlendMask demands LHS AND RHS");
41939 NewBlendMask[M] = M + NumElts;
41940 NewPermuteMask[I] = M;
41941 }
41942 }
41943 }
41944 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41945 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41946
41947 // v16i16 shuffles can explode in complexity very easily; only accept them if
41948 // the blend mask is the same in the 128-bit subvectors (or can widen to
41949 // v8i32) and the permute can be widened as well.
41950 if (VT == MVT::v16i16) {
41951 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41952 !canWidenShuffleElements(NewBlendMask))
41953 return SDValue();
41954 if (!canWidenShuffleElements(NewPermuteMask))
41955 return SDValue();
41956 }
41957
41958 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41959 // widened to a lane permute (vperm2f128).
41960 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41961 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
41962 NewPermuteMask) &&
41963 !canScaleShuffleElements(NewPermuteMask, 2))
41964 return SDValue();
41965
41966 SDValue NewBlend =
41967 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
41968 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
41969 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
41970 NewPermuteMask);
41971}
41972
41973// TODO - move this to TLI like isBinOp?
41974static bool isUnaryOp(unsigned Opcode) {
41975 switch (Opcode) {
41976 case ISD::CTLZ:
41977 case ISD::CTTZ:
41978 case ISD::CTPOP:
41979 return true;
41980 }
41981 return false;
41982}
41983
41984// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41985// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
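// E.g. PSHUFD(XOR(x, y)) -> XOR(PSHUFD(x), PSHUFD(y)) when at least one of the
// operands is already shuffle-friendly (constant, splat, or another shuffle),
// so the total shuffle count does not grow.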
41986 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
41987 const SDLoc &DL) {
41988 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41989 EVT ShuffleVT = N.getValueType();
41990 unsigned Opc = N.getOpcode();
41991
41992 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
41993 // AllZeros/AllOnes constants are freely shuffled and will peek through
41994 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41995 // merge with target shuffles if it has one use so shuffle combining is
41996 // likely to kick in. Shuffles of splats are expected to be removed.
41997 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41998 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41999 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
42000 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
42001 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
42002 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42003 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42004 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42005 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42006 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42007 };
42008 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42009 // Ensure we only shuffle whole vector src elements, unless its a logical
42010 // binops where we can more aggressively move shuffles from dst to src.
42011 return isLogicOp(BinOp) ||
42012 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42013 };
42014
42015 switch (Opc) {
42016 // Unary and Unary+Permute Shuffles.
42017 case X86ISD::PSHUFB: {
42018 // Don't merge PSHUFB if it contains zero'd elements.
42019 SmallVector<int> Mask;
42020 SmallVector<SDValue> Ops;
42021 if (!getTargetShuffleMask(N, false, Ops, Mask))
42022 break;
42023 [[fallthrough]];
42024 }
42025 case X86ISD::VBROADCAST:
42026 case X86ISD::MOVDDUP:
42027 case X86ISD::PSHUFD:
42028 case X86ISD::PSHUFHW:
42029 case X86ISD::PSHUFLW:
42030 case X86ISD::VPERMV:
42031 case X86ISD::VPERMI:
42032 case X86ISD::VPERMILPI: {
42033 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42034 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42035 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42036 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42037 unsigned SrcOpcode = N0.getOpcode();
42038 EVT OpVT = N0.getValueType();
42039 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42040 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42041 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42042 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42043 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42044 IsMergeableWithShuffle(Op01, FoldShuf)) {
42045 SDValue LHS, RHS;
42046 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42047 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42048 if (Opc == X86ISD::VPERMV) {
42049 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42050 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42051 } else if (N.getNumOperands() == 2) {
42052 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42053 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42054 } else {
42055 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42056 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42057 }
42058 return DAG.getBitcast(ShuffleVT,
42059 DAG.getNode(SrcOpcode, DL, OpVT,
42060 DAG.getBitcast(OpVT, LHS),
42061 DAG.getBitcast(OpVT, RHS)));
42062 }
42063 }
42064 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42065 OpVT.getScalarSizeInBits() ==
42066 N0.getOperand(0).getScalarValueSizeInBits()) {
42067 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42068 if (Opc == X86ISD::VPERMV)
42069 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42070 else if (N.getNumOperands() == 2)
42071 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42072 else
42073 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42074 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42075 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42076 }
42077 }
42078 break;
42079 }
42080 // Binary and Binary+Permute Shuffles.
42081 case X86ISD::INSERTPS: {
42082 // Don't merge INSERTPS if it contains zero'd elements.
42083 unsigned InsertPSMask = N.getConstantOperandVal(2);
42084 unsigned ZeroMask = InsertPSMask & 0xF;
42085 if (ZeroMask != 0)
42086 break;
42087 [[fallthrough]];
42088 }
42089 case X86ISD::MOVSD:
42090 case X86ISD::MOVSS:
42091 case X86ISD::BLENDI:
42092 case X86ISD::SHUFP:
42093 case X86ISD::UNPCKH:
42094 case X86ISD::UNPCKL: {
42095 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42096 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42097 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42098 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42099 unsigned SrcOpcode = N0.getOpcode();
42100 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42101 N0.getValueType() == N1.getValueType() &&
42102 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42103 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42104 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42105 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42106 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42107 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42108 // Ensure the total number of shuffles doesn't increase by folding this
42109 // shuffle through to the source ops.
42110 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42111 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42112 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42113 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42114 SDValue LHS, RHS;
42115 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42116 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42117 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42118 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42119 if (N.getNumOperands() == 3) {
42120 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42121 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42122 } else {
42123 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42124 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42125 }
42126 EVT OpVT = N0.getValueType();
42127 return DAG.getBitcast(ShuffleVT,
42128 DAG.getNode(SrcOpcode, DL, OpVT,
42129 DAG.getBitcast(OpVT, LHS),
42130 DAG.getBitcast(OpVT, RHS)));
42131 }
42132 }
42133 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42134 N0.getValueType() == N1.getValueType() &&
42135 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42136 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42137 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42138 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42139 SDValue Res;
42140 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42141 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42142 if (N.getNumOperands() == 3) {
42143 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42144 } else {
42145 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42146 }
42147 EVT OpVT = N0.getValueType();
42148 return DAG.getBitcast(
42149 ShuffleVT,
42150 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42151 }
42152 // TODO: We can generalize this for other shuffles/conversions.
42153 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42154 N1.getOpcode() == SrcOpcode &&
42155 N0.getValueType() == N1.getValueType() &&
42156 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42157 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42158 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42159 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42160 EVT OpSrcVT = N0.getOperand(0).getValueType();
42161 EVT OpDstVT = N0.getValueType();
42162 SDValue Res =
42163 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42164 return DAG.getBitcast(ShuffleVT,
42165 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42166 }
42167 }
42168 break;
42169 }
42170 }
42171 return SDValue();
42172}
42173
42174/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
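// E.g. VPERM2X128(MOVDDUP(x), MOVDDUP(y), imm) -> MOVDDUP(VPERM2X128(x, y, imm)),
// so the lane shuffle sees the original operands and can combine further.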
42175 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42176 SelectionDAG &DAG,
42177 const SDLoc &DL) {
42178 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42179
42180 MVT VT = V.getSimpleValueType();
42181 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42182 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42183 unsigned SrcOpc0 = Src0.getOpcode();
42184 unsigned SrcOpc1 = Src1.getOpcode();
42185 EVT SrcVT0 = Src0.getValueType();
42186 EVT SrcVT1 = Src1.getValueType();
42187
42188 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42189 return SDValue();
42190
42191 switch (SrcOpc0) {
42192 case X86ISD::MOVDDUP: {
42193 SDValue LHS = Src0.getOperand(0);
42194 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42195 SDValue Res =
42196 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42197 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42198 return DAG.getBitcast(VT, Res);
42199 }
42200 case X86ISD::VPERMILPI:
42201 // TODO: Handle v4f64 permutes with different low/high lane masks.
42202 if (SrcVT0 == MVT::v4f64) {
42203 uint64_t Mask = Src0.getConstantOperandVal(1);
42204 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42205 break;
42206 }
42207 [[fallthrough]];
42208 case X86ISD::VSHLI:
42209 case X86ISD::VSRLI:
42210 case X86ISD::VSRAI:
42211 case X86ISD::PSHUFD:
42212 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42213 SDValue LHS = Src0.getOperand(0);
42214 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42215 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42216 V.getOperand(2));
42217 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42218 return DAG.getBitcast(VT, Res);
42219 }
42220 break;
42221 }
42222
42223 return SDValue();
42224}
42225
42226/// Try to combine x86 target specific shuffles.
42227 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42228 SelectionDAG &DAG,
42229 TargetLowering::DAGCombinerInfo &DCI,
42230 const X86Subtarget &Subtarget) {
42231 using namespace SDPatternMatch;
42232
42233 MVT VT = N.getSimpleValueType();
42234 unsigned NumElts = VT.getVectorNumElements();
42236 unsigned Opcode = N.getOpcode();
42237 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42238
42239 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42240 return R;
42241
42242 // Handle specific target shuffles.
42243 switch (Opcode) {
42244 case X86ISD::MOVDDUP: {
42245 SDValue Src = N.getOperand(0);
42246 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42247 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42248 ISD::isNormalLoad(Src.getNode())) {
42249 LoadSDNode *LN = cast<LoadSDNode>(Src);
42250 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42251 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42252 DCI.CombineTo(N.getNode(), Movddup);
42253 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42254 DCI.recursivelyDeleteUnusedNodes(LN);
42255 return N; // Return N so it doesn't get rechecked!
42256 }
42257 }
42258
42259 return SDValue();
42260 }
42261 case X86ISD::VBROADCAST: {
42262 SDValue Src = N.getOperand(0);
42263 SDValue BC = peekThroughBitcasts(Src);
42264 EVT SrcVT = Src.getValueType();
42265 EVT BCVT = BC.getValueType();
42266
42267 // If broadcasting from another shuffle, attempt to simplify it.
42268 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42269 if (isTargetShuffle(BC.getOpcode()) &&
42270 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42271 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42272 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42273 SM_SentinelUndef);
42274 for (unsigned i = 0; i != Scale; ++i)
42275 DemandedMask[i] = i;
42276 if (SDValue Res = combineX86ShufflesRecursively(
42277 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42278 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42279 /*AllowVariableCrossLaneMask=*/true,
42280 /*AllowVariablePerLaneMask=*/true,
42281 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42282 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42283 DAG.getBitcast(SrcVT, Res));
42284 }
42285
42286 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42287 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42288 if (Src.getOpcode() == ISD::BITCAST &&
42289 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42290 TLI.isTypeLegal(BCVT) &&
42291 FixedVectorType::isValidElementType(
42292 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42293 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42294 VT.getVectorNumElements());
42295 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42296 }
42297
42298 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42299 // If we're re-broadcasting a smaller type then broadcast with that type and
42300 // bitcast.
42301 // TODO: Do this for any splat?
42302 if (Src.getOpcode() == ISD::BITCAST &&
42303 (BC.getOpcode() == X86ISD::VBROADCAST ||
42304 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42305 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42306 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42307 MVT NewVT =
42308 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42309 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42310 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42311 }
42312
42313 // Reduce broadcast source vector to lowest 128-bits.
42314 if (SrcVT.getSizeInBits() > 128)
42315 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42316 extract128BitVector(Src, 0, DAG, DL));
42317
42318 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42319 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42320 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42321 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42322
42323 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42324 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42325 isNullConstant(Src.getOperand(1)) &&
42326 Src.getValueType() ==
42327 Src.getOperand(0).getValueType().getScalarType() &&
42328 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42329 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42330
42331 // Share broadcast with the longest vector and extract low subvector (free).
42332 // Ensure the same SDValue from the SDNode use is being used.
42333 for (SDNode *User : Src->users())
42334 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42335 Src == User->getOperand(0) &&
42336 User->getValueSizeInBits(0).getFixedValue() >
42337 VT.getFixedSizeInBits()) {
42338 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42339 VT.getSizeInBits());
42340 }
42341
42342 // vbroadcast(scalarload X) -> vbroadcast_load X
42343 // For float loads, extract other uses of the scalar from the broadcast.
42344 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42345 ISD::isNormalLoad(Src.getNode())) {
42346 LoadSDNode *LN = cast<LoadSDNode>(Src);
42347 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42348 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42349 SDValue BcastLd =
42350 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42351 LN->getMemoryVT(), LN->getMemOperand());
42352 // If the load value is used only by N, replace it via CombineTo N.
42353 bool NoReplaceExtract = Src.hasOneUse();
42354 DCI.CombineTo(N.getNode(), BcastLd);
42355 if (NoReplaceExtract) {
42356 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42357 DCI.recursivelyDeleteUnusedNodes(LN);
42358 } else {
42359 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42360 DAG.getVectorIdxConstant(0, DL));
42361 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42362 }
42363 return N; // Return N so it doesn't get rechecked!
42364 }
42365
42366 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42367 // i16. So shrink it ourselves if we can make a broadcast_load.
42368 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42369 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42370 assert(Subtarget.hasAVX2() && "Expected AVX2");
42371 SDValue TruncIn = Src.getOperand(0);
42372
42373 // If this is a truncate of a non-extending load, we can just narrow it to
42374 // use a broadcast_load.
42375 if (ISD::isNormalLoad(TruncIn.getNode())) {
42376 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42377 // Unless it's volatile or atomic.
42378 if (LN->isSimple()) {
42379 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42380 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42381 SDValue BcastLd = DAG.getMemIntrinsicNode(
42382 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42383 LN->getPointerInfo(), LN->getBaseAlign(),
42384 LN->getMemOperand()->getFlags());
42385 DCI.CombineTo(N.getNode(), BcastLd);
42386 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42387 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42388 return N; // Return N so it doesn't get rechecked!
42389 }
42390 }
42391
42392 // If this is a truncate of an i16 extload, we can directly replace it.
42393 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42394 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42395 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42396 if (LN->getMemoryVT().getSizeInBits() == 16) {
42397 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42398 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42399 SDValue BcastLd =
42400 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42401 LN->getMemoryVT(), LN->getMemOperand());
42402 DCI.CombineTo(N.getNode(), BcastLd);
42403 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42404 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42405 return N; // Return N so it doesn't get rechecked!
42406 }
42407 }
42408
42409 // If this is a truncate of a load that has been shifted right, we can
42410 // offset the pointer and use a narrower load.
42411 if (TruncIn.getOpcode() == ISD::SRL &&
42412 TruncIn.getOperand(0).hasOneUse() &&
42413 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42414 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42415 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42416 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42417 // Make sure the shift amount and the load size are divisible by 16.
42418 // Don't do this if the load is volatile or atomic.
42419 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42420 LN->isSimple()) {
42421 unsigned Offset = ShiftAmt / 8;
42422 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42423 SDValue Ptr = DAG.getMemBasePlusOffset(
42424 LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
42425 SDValue Ops[] = { LN->getChain(), Ptr };
42426 SDValue BcastLd = DAG.getMemIntrinsicNode(
42427 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42428 LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
42429 LN->getMemOperand()->getFlags());
42430 DCI.CombineTo(N.getNode(), BcastLd);
42431 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42432 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42433 return N; // Return N so it doesn't get rechecked!
42434 }
42435 }
42436 }
42437
42438 // vbroadcast(vzload X) -> vbroadcast_load X
42439 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42440 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
42441 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42442 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42443 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42444 SDValue BcastLd =
42445 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42446 LN->getMemoryVT(), LN->getMemOperand());
42447 DCI.CombineTo(N.getNode(), BcastLd);
42448 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42449 DCI.recursivelyDeleteUnusedNodes(LN);
42450 return N; // Return N so it doesn't get rechecked!
42451 }
42452 }
42453
42454 // vbroadcast(vector load X) -> vbroadcast_load
42455 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42456 LoadSDNode *LN = cast<LoadSDNode>(Src);
42457 // Unless the load is volatile or atomic.
42458 if (LN->isSimple()) {
42459 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42460 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42461 SDValue BcastLd = DAG.getMemIntrinsicNode(
42462 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42463 LN->getPointerInfo(), LN->getBaseAlign(),
42464 LN->getMemOperand()->getFlags());
42465 DCI.CombineTo(N.getNode(), BcastLd);
42466 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42467 DCI.recursivelyDeleteUnusedNodes(LN);
42468 return N; // Return N so it doesn't get rechecked!
42469 }
42470 }
42471
42472 return SDValue();
42473 }
42474 case X86ISD::VZEXT_MOVL: {
42475 SDValue N0 = N.getOperand(0);
42476
42477 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42478 // Zeroing out the upper elements means we're just shifting a zero value.
42479 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42480 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42481 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42482 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42483 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42484 if (N0.hasOneUse())
42485 return DAG.getNode(
42486 N0.getOpcode(), DL, VT,
42487 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42488 N0.getOperand(1));
42489 }
42490
42491 // If this is a vzmovl of a full vector load, replace it with a vzload,
42492 // unless the load is volatile.
42493 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42494 auto *LN = cast<LoadSDNode>(N0);
42495 if (SDValue VZLoad =
42496 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42497 DCI.CombineTo(N.getNode(), VZLoad);
42498 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42499 DCI.recursivelyDeleteUnusedNodes(LN);
42500 return N;
42501 }
42502 }
42503
42504 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42505 // and can just use a VZEXT_LOAD.
42506 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42507 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42508 auto *LN = cast<MemSDNode>(N0);
42509 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42510 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42511 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42512 SDValue VZLoad =
42513 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42514 LN->getMemoryVT(), LN->getMemOperand());
42515 DCI.CombineTo(N.getNode(), VZLoad);
42516 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42517 DCI.recursivelyDeleteUnusedNodes(LN);
42518 return N;
42519 }
42520 }
42521
42522 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42523 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42524 // if the upper bits of the i64 are zero.
42525 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42526 N0.getOperand(0).hasOneUse() &&
42527 N0.getOperand(0).getValueType() == MVT::i64) {
42528 SDValue In = N0.getOperand(0);
42529 APInt Mask = APInt::getHighBitsSet(64, 32);
42530 if (DAG.MaskedValueIsZero(In, Mask)) {
42531 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42532 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42533 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42534 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42535 return DAG.getBitcast(VT, Movl);
42536 }
42537 }
42538
42539 // Load a scalar integer constant directly to XMM instead of transferring an
42540 // immediate value from a GPR.
42541 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42542 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42543 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42544 // Create a vector constant - scalar constant followed by zeros.
42545 EVT ScalarVT = N0.getOperand(0).getValueType();
42546 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42547 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42548 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42549 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42550
42551 // Load the vector constant from constant pool.
42552 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42553 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42554 MachinePointerInfo MPI =
42555 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42556 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42557 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42558 MachineMemOperand::MOLoad);
42559 }
42560 }
42561
42562 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42563 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42564 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42565 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42566 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42567 SDValue V = peekThroughOneUseBitcasts(N0);
42568
42569 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42570 isNullConstant(V.getOperand(2))) {
42571 SDValue In = V.getOperand(1);
42572 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42573 In.getValueSizeInBits() /
42574 VT.getScalarSizeInBits());
42575 In = DAG.getBitcast(SubVT, In);
42576 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42577 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42578 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42579 V.getOperand(2));
42580 }
42581 }
42582
42583 return SDValue();
42584 }
42585 case X86ISD::BLENDI: {
42586 SDValue N0 = N.getOperand(0);
42587 SDValue N1 = N.getOperand(1);
42588 unsigned EltBits = VT.getScalarSizeInBits();
42589
42590 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42591 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42592 // TODO: Handle MVT::v16i16 repeated blend mask.
42593 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42594 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42595 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42596 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42597 unsigned NewSize = SrcVT.getVectorNumElements();
42598 APInt BlendMask = getBLENDIBlendMask(N);
42599 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
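          // Each blend-mask bit is replicated to cover the narrower elements,
          // e.g. a v4f64 blend mask 0b0101 becomes the v8f32 mask 0b00110011.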
42600 return DAG.getBitcast(
42601 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42602 N1.getOperand(0),
42603 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42604 DL, MVT::i8)));
42605 }
42606 }
42607 // Share PSHUFB masks:
42608 // blend(pshufb(x,m1),pshufb(y,m2))
42609 // --> m3 = blend(m1,m2)
42610 // blend(pshufb(x,m3),pshufb(y,m3))
42611 if (N0.hasOneUse() && N1.hasOneUse()) {
42612 SmallVector<int> Mask, ByteMask;
42613 SmallVector<SDValue> Ops;
42614 SDValue LHS = peekThroughOneUseBitcasts(N0);
42615 SDValue RHS = peekThroughOneUseBitcasts(N1);
42616 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42617 RHS.getOpcode() == X86ISD::PSHUFB &&
42618 LHS.getOperand(1) != RHS.getOperand(1) &&
42619 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42620 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42621 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42622 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42623 "BLENDI decode mismatch");
42624 MVT ShufVT = LHS.getSimpleValueType();
42625 SDValue MaskLHS = LHS.getOperand(1);
42626 SDValue MaskRHS = RHS.getOperand(1);
42627 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42628 if (SDValue NewMask = combineX86ShufflesConstants(
42629 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42630 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42631 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42632 LHS.getOperand(0), NewMask);
42633 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42634 RHS.getOperand(0), NewMask);
42635 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42636 DAG.getBitcast(VT, NewLHS),
42637 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42638 }
42639 }
42640 }
42641 }
42642 return SDValue();
42643 }
42644 case X86ISD::SHUFP: {
42645 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42646 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42647 // TODO: Support types other than v4f32.
42648 if (VT == MVT::v4f32) {
42649 bool Updated = false;
42650 SmallVector<int> Mask;
42651 SmallVector<SDValue> Ops;
42652 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42653 for (int i = 0; i != 2; ++i) {
42654 SmallVector<SDValue> SubOps;
42655 SmallVector<int> SubMask, SubScaledMask;
42656 SDValue Sub = peekThroughBitcasts(Ops[i]);
42657 // TODO: Scaling might be easier if we specify the demanded elts.
42658 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42659 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42660 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42661 int Ofs = i * 2;
42662 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42663 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42664 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42665 Updated = true;
42666 }
42667 }
42668 }
42669 if (Updated) {
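        // The remapped indices cover the concatenated [0,8) range, but the
        // SHUFP immediate only encodes indices 0-3 per operand, so reduce
        // them modulo 4 before rebuilding the immediate.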
42670 for (int &M : Mask)
42671 M %= 4;
42672 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42673 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42674 }
42675 }
42676 return SDValue();
42677 }
42678 case X86ISD::VPERMI: {
42679 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42680 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42681 SDValue N0 = N.getOperand(0);
42682 SDValue N1 = N.getOperand(1);
42683 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42684 if (N0.getOpcode() == ISD::BITCAST &&
42685 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42686 SDValue Src = N0.getOperand(0);
42687 EVT SrcVT = Src.getValueType();
42688 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42689 return DAG.getBitcast(VT, Res);
42690 }
42691 return SDValue();
42692 }
42693 case X86ISD::SHUF128: {
42694 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42695 // see if we can peek through and access the subvector directly.
42696 if (VT.is512BitVector()) {
42697 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42698 // the upper subvector is used.
42699 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42700 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42701 uint64_t Mask = N->getConstantOperandVal(2);
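      // Result lanes 0-1 come from LHS (imm bits 0-3) and lanes 2-3 from RHS
      // (imm bits 4-7); 0x0A and 0xA0 test the msb of each 2-bit index in the
      // corresponding half of the immediate.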
42702 SmallVector<SDValue> LHSOps, RHSOps;
42703 SDValue NewLHS, NewRHS;
42704 if ((Mask & 0x0A) == 0x0A &&
42705 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42706 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42707 Mask &= ~0x0A;
42708 }
42709 if ((Mask & 0xA0) == 0xA0 &&
42710 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42711 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42712 Mask &= ~0xA0;
42713 }
42714 if (NewLHS || NewRHS)
42715 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42716 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42717 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42718 DAG.getTargetConstant(Mask, DL, MVT::i8));
42719 }
42720 return SDValue();
42721 }
42722 case X86ISD::VPERM2X128: {
42723 SDValue LHS = N->getOperand(0);
42724 SDValue RHS = N->getOperand(1);
42725 unsigned Imm = N.getConstantOperandVal(2) & 255;
42726
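    // The VPERM2X128 imm8 uses bits [1:0] to pick the 128-bit source lane for
    // the low result lane and bits [5:4] for the high result lane (0/1 = LHS
    // lo/hi, 2/3 = RHS lo/hi); bits 3 and 7 zero the corresponding lane.
    // XORing with 0x22 therefore flips LHS/RHS references, and clearing the
    // 0x22 bits redirects RHS references back to LHS.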
42727 // Canonicalize unary/repeated operands to LHS.
42728 if (LHS.isUndef() && !RHS.isUndef())
42729 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42730 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42731 if (LHS == RHS)
42732 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42733 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42734
42735 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42736 if (LHS.getOpcode() == ISD::BITCAST &&
42737 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42738 EVT SrcVT = LHS.getOperand(0).getValueType();
42739 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42740 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42741 DAG.getBitcast(SrcVT, LHS),
42742 DAG.getBitcast(SrcVT, RHS),
42743 N->getOperand(2)));
42744 }
42745 }
42746
42747 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42748 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42749 return Res;
42750
42751 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42752 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42753 auto FindSubVector128 = [&](unsigned Idx) {
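      // Idx is a VPERM2X128 lane selector: 0/1 pick the low/high 128 bits of
      // the first operand, 2/3 of the second; anything above 3 means a
      // zeroing bit was set.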
42754 if (Idx > 3)
42755 return SDValue();
42756 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42757 SmallVector<SDValue> SubOps;
42758 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42759 return SubOps[Idx & 1];
42760 unsigned NumElts = Src.getValueType().getVectorNumElements();
42761 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42762 Src.getOperand(1).getValueSizeInBits() == 128 &&
42763 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42764 return Src.getOperand(1);
42765 }
42766 return SDValue();
42767 };
42768 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42769 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42770 MVT SubVT = VT.getHalfNumVectorElementsVT();
42771 SubLo = DAG.getBitcast(SubVT, SubLo);
42772 SubHi = DAG.getBitcast(SubVT, SubHi);
42773 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42774 }
42775 }
42776
42777 // Attempt to match VBROADCAST*128 subvector broadcast load.
42778 if (RHS.isUndef()) {
42779 SmallVector<int, 4> Mask;
42780 DecodeVPERM2X128Mask(4, Imm, Mask);
42781 if (isUndefOrInRange(Mask, 0, 4)) {
42782 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42783 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42784 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42785 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42786 MVT MemVT = VT.getHalfNumVectorElementsVT();
42787 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42788 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42789 cast<LoadSDNode>(LHS), Ofs, DAG);
42790 }
42791 }
42792 }
42793
42794 return SDValue();
42795 }
42796 case X86ISD::PSHUFD:
42797 case X86ISD::PSHUFLW:
42798 case X86ISD::PSHUFHW: {
42799 SDValue N0 = N.getOperand(0);
42800 SDValue N1 = N.getOperand(1);
42801 if (N0->hasOneUse()) {
42802 SDValue V = peekThroughOneUseBitcasts(N0);
42803 switch (V.getOpcode()) {
42804 case X86ISD::VSHL:
42805 case X86ISD::VSRL:
42806 case X86ISD::VSRA:
42807 case X86ISD::VSHLI:
42808 case X86ISD::VSRLI:
42809 case X86ISD::VSRAI:
42810 case X86ISD::VROTLI:
42811 case X86ISD::VROTRI: {
42812 MVT InnerVT = V.getSimpleValueType();
42813 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42814 SDValue Res = DAG.getNode(Opcode, DL, VT,
42815 DAG.getBitcast(VT, V.getOperand(0)), N1);
42816 Res = DAG.getBitcast(InnerVT, Res);
42817 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42818 return DAG.getBitcast(VT, Res);
42819 }
42820 break;
42821 }
42822 }
42823 }
42824
42825 Mask = getPSHUFShuffleMask(N);
42826 assert(Mask.size() == 4);
42827 break;
42828 }
42829 case X86ISD::MOVSD:
42830 case X86ISD::MOVSH:
42831 case X86ISD::MOVSS: {
42832 SDValue N0 = N.getOperand(0);
42833 SDValue N1 = N.getOperand(1);
42834
42835 // Canonicalize scalar FPOps:
42836 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42837 // If commutable, allow OP(N1[0], N0[0]).
42838 unsigned Opcode1 = N1.getOpcode();
42839 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42840 Opcode1 == ISD::FDIV) {
42841 SDValue N10 = N1.getOperand(0);
42842 SDValue N11 = N1.getOperand(1);
42843 if (N10 == N0 ||
42844 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42845 if (N10 != N0)
42846 std::swap(N10, N11);
42847 MVT SVT = VT.getVectorElementType();
42848 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42849 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42850 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42851 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42852 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42853 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42854 }
42855 }
42856
42857 return SDValue();
42858 }
42859 case X86ISD::INSERTPS: {
42860 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42861 SDValue Op0 = N.getOperand(0);
42862 SDValue Op1 = N.getOperand(1);
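    // INSERTPS imm8 layout: bits [7:6] = source element of Op1 (CountS),
    // bits [5:4] = destination element (CountD), bits [3:0] = zero mask.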
42863 unsigned InsertPSMask = N.getConstantOperandVal(2);
42864 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42865 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42866 unsigned ZeroMask = InsertPSMask & 0xF;
42867
42868 // If we zero out all elements from Op0 then we don't need to reference it.
42869 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42870 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42871 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42872
42873 // If we zero out the element from Op1 then we don't need to reference it.
42874 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42875 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42876 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42877
42878 // Attempt to merge insertps Op1 with an inner target shuffle node.
42879 SmallVector<int, 8> TargetMask1;
42880 SmallVector<SDValue, 2> Ops1;
42881 APInt KnownUndef1, KnownZero1;
42882 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42883 KnownZero1)) {
42884 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42885 // Zero/UNDEF insertion - zero out element and remove dependency.
42886 InsertPSMask |= (1u << DstIdx);
42887 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42888 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42889 }
42890 // Update insertps mask srcidx and reference the source input directly.
42891 int M = TargetMask1[SrcIdx];
42892 assert(0 <= M && M < 8 && "Shuffle index out of range");
42893 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42894 Op1 = Ops1[M < 4 ? 0 : 1];
42895 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42896 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42897 }
42898
42899 // Attempt to merge insertps Op0 with an inner target shuffle node.
42900 SmallVector<int, 8> TargetMask0;
42901 SmallVector<SDValue, 2> Ops0;
42902 APInt KnownUndef0, KnownZero0;
42903 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42904 KnownZero0)) {
42905 bool Updated = false;
42906 bool UseInput00 = false;
42907 bool UseInput01 = false;
42908 for (int i = 0; i != 4; ++i) {
42909 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42910 // No change if element is already zero or the inserted element.
42911 continue;
42912 }
42913
42914 if (KnownUndef0[i] || KnownZero0[i]) {
42915 // If the target mask is undef/zero then we must zero the element.
42916 InsertPSMask |= (1u << i);
42917 Updated = true;
42918 continue;
42919 }
42920
42921 // The input vector element must be inline.
42922 int M = TargetMask0[i];
42923 if (M != i && M != (i + 4))
42924 return SDValue();
42925
42926 // Determine which inputs of the target shuffle we're using.
42927 UseInput00 |= (0 <= M && M < 4);
42928 UseInput01 |= (4 <= M);
42929 }
42930
42931 // If we're not using both inputs of the target shuffle then use the
42932 // referenced input directly.
42933 if (UseInput00 && !UseInput01) {
42934 Updated = true;
42935 Op0 = Ops0[0];
42936 } else if (!UseInput00 && UseInput01) {
42937 Updated = true;
42938 Op0 = Ops0[1];
42939 }
42940
42941 if (Updated)
42942 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42943 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42944 }
42945
42946 // If we're inserting an element from a vbroadcast load, fold the
42947 // load into the X86insertps instruction. We need to convert the scalar
42948 // load to a vector and clear the source lane of the INSERTPS control.
42949 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42950 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42951 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42952 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42953 MemIntr->getBasePtr(),
42954 MemIntr->getMemOperand());
42955 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42956 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42957 Load),
42958 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42959 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42960 return Insert;
42961 }
42962 }
42963
42964 return SDValue();
42965 }
42966 case X86ISD::VPERMV: {
42967 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
42968 SmallVector<int, 32> Mask;
42969 SmallVector<SDValue, 2> SrcOps, SubOps;
42970 SDValue Src = peekThroughBitcasts(N.getOperand(1));
42971 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
42972 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
42973 collectConcatOps(Src.getNode(), SubOps, DAG)) {
42974 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42975 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
42976 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
42977 "Unexpected split ops");
42978 // Bail if we were permuting a widened vector.
42979 if (SubOps[1].isUndef() &&
42980 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
42981 return SDValue();
42982 // Bail if any subops would have folded into the concat.
42983 if (any_of(SubOps, isShuffleFoldableLoad))
42984 return SDValue();
42985 // Concat 4x128 back to 2x256.
42986 if (SubOps.size() == 4) {
42987 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
42988 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
42989 }
42990 // Convert mask to 2 operand shuffle.
42991 int HalfElts = NumElts / 2;
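      // Element HalfElts+j of the original source is element j of Hi, which
      // sits at index NumElts+j in the two-operand mask space, so bump any
      // upper-half index by HalfElts.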
42992 for (int &M : Mask)
42993 M += M >= HalfElts ? HalfElts : 0;
42994 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
42995 VT.getSizeInBits());
42996 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
42997 VT.getSizeInBits());
42998 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
42999 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43000 }
43001 return SDValue();
43002 }
43003 case X86ISD::VPERMV3: {
43004 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43005 bool CanConcat = VT.is128BitVector() ||
43006 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43007 SmallVector<SDValue, 2> SrcOps;
43008 SmallVector<int, 64> Mask;
43009 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43010 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43011 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43012 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43013 // Canonicalize to VPERMV if both sources are the same.
43014 if (V1 == V2) {
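        // With identical sources the bit selecting between the two inputs is
        // redundant, so clear it (NumElts is a power of two) and use a
        // single-source VPERMV.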
43015 for (int &M : Mask)
43016 M = (M < 0 ? M : (M & (NumElts - 1)));
43017 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43018 DAG.getUNDEF(VT), Subtarget, DAG);
43019 }
43020 // If sources are half width, then concat and use VPERMV with adjusted
43021 // mask.
43022 SDValue Ops[2];
43023 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43024 if (sd_match(V1,
43025 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
43026 sd_match(V2,
43027 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
43028 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43029 if (SDValue ConcatSrc =
43030 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43031 for (int &M : Mask)
43032 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43033 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43034 DAG.getUNDEF(VT), Subtarget, DAG);
43035 }
43036 }
43037 // Commute foldable source to the RHS.
43038 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43039 !isShuffleFoldableLoad(N.getOperand(2))) {
43040 ShuffleVectorSDNode::commuteMask(Mask);
43041 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43042 N.getOperand(0), Subtarget, DAG);
43043 }
43044 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43045 // freely concatenated, with a commuted shuffle mask.
43046 if (CanConcat) {
43047 if (SDValue ConcatSrc = combineConcatVectorOps(
43048 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43049 Subtarget)) {
43050 ShuffleVectorSDNode::commuteMask(Mask);
43051 Mask.append(NumElts, SM_SentinelUndef);
43052 SDValue Perm =
43053 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43054 DAG.getUNDEF(WideVT), Subtarget, DAG);
43055 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43056 DAG.getVectorIdxConstant(0, DL));
43057 }
43058 }
43059 }
43060 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43061 // freely concatenated.
43062 if (CanConcat) {
43063 if (SDValue ConcatSrc = combineConcatVectorOps(
43064 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43065 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43066 DL, WideVT.getSizeInBits());
43067 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43068 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43069 DAG.getVectorIdxConstant(0, DL));
43070 }
43071 }
43072 return SDValue();
43073 }
43074 default:
43075 return SDValue();
43076 }
43077
43078 // Nuke no-op shuffles that show up after combining.
43079 if (isNoopShuffleMask(Mask))
43080 return N.getOperand(0);
43081
43082 // Look for simplifications involving one or two shuffle instructions.
43083 SDValue V = N.getOperand(0);
43084 switch (N.getOpcode()) {
43085 default:
43086 break;
43087 case X86ISD::PSHUFLW:
43088 case X86ISD::PSHUFHW:
43089 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43090
43091 // See if this reduces to a PSHUFD which is no more expensive and can
43092 // combine with more operations. Note that it has to at least flip the
43093 // dwords as otherwise it would have been removed as a no-op.
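    // e.g. pshuflw <2,3,0,1> on v8i16 swaps the two low dwords, which is
    // pshufd <1,0,2,3> on the value bitcast to v4i32.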
43094 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43095 int DMask[] = {0, 1, 2, 3};
43096 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43097 DMask[DOffset + 0] = DOffset + 1;
43098 DMask[DOffset + 1] = DOffset + 0;
43099 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43100 V = DAG.getBitcast(DVT, V);
43101 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43102 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43103 return DAG.getBitcast(VT, V);
43104 }
43105
43106 // Look for shuffle patterns which can be implemented as a single unpack.
43107 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43108 // only works when we have a PSHUFD followed by two half-shuffles.
43109 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43110 (V.getOpcode() == X86ISD::PSHUFLW ||
43111 V.getOpcode() == X86ISD::PSHUFHW) &&
43112 V.getOpcode() != N.getOpcode() &&
43113 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43114 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43115 if (D.getOpcode() == X86ISD::PSHUFD) {
43116 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43117 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43118 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43119 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43120 int WordMask[8];
43121 for (int i = 0; i < 4; ++i) {
43122 WordMask[i + NOffset] = Mask[i] + NOffset;
43123 WordMask[i + VOffset] = VMask[i] + VOffset;
43124 }
43125 // Map the word mask through the DWord mask.
43126 int MappedMask[8];
43127 for (int i = 0; i < 8; ++i)
43128 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43129 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43130 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43131 // We can replace all three shuffles with an unpack.
43132 V = DAG.getBitcast(VT, D.getOperand(0));
43133 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43134 : X86ISD::UNPCKH,
43135 DL, VT, V, V);
43136 }
43137 }
43138 }
43139
43140 break;
43141
43142 case X86ISD::PSHUFD:
43143 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43144 return NewN;
43145
43146 break;
43147 }
43148
43149 return SDValue();
43150}
43151
43152/// Checks if the shuffle mask takes subsequent elements
43153/// alternately from two vectors.
43154/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43155static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43156
43157 int ParitySrc[2] = {-1, -1};
43158 unsigned Size = Mask.size();
43159 for (unsigned i = 0; i != Size; ++i) {
43160 int M = Mask[i];
43161 if (M < 0)
43162 continue;
43163
43164 // Make sure we are using the matching element from the input.
43165 if ((M % Size) != i)
43166 return false;
43167
43168 // Make sure we use the same input for all elements of the same parity.
43169 int Src = M / Size;
43170 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43171 return false;
43172 ParitySrc[i % 2] = Src;
43173 }
43174
43175 // Make sure each input is used.
43176 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43177 return false;
43178
43179 Op0Even = ParitySrc[0] == 0;
43180 return true;
43181}
43182
43183 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
43184 /// (SUBADD) operation. If true is returned then the operands of that operation
43185/// are written to the parameters \p Opnd0 and \p Opnd1.
43186///
43187 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
43188 /// shuffle nodes so they are easier to match generically. We also insert dummy
43189 /// vector shuffle nodes for the operands which explicitly discard the lanes
43190 /// unused by this operation, so that the rest of the combiner can see that
43191 /// they're unused.
43192static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43193 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43194 bool &IsSubAdd, bool &HasAllowContract) {
43195
43196 EVT VT = N->getValueType(0);
43197 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43198 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43199 (VT.is512BitVector() && !Subtarget.hasAVX512()))
43200 return false;
43201
43202 // We only handle target-independent shuffles.
43203 // FIXME: It would be easy and harmless to use the target shuffle mask
43204 // extraction tool to support more.
43205 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43206 return false;
43207
43208 SDValue V1 = N->getOperand(0);
43209 SDValue V2 = N->getOperand(1);
43210
43211 // Make sure we have an FADD and an FSUB.
43212 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43213 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43214 V1.getOpcode() == V2.getOpcode())
43215 return false;
43216
43217 // If there are other uses of these operations we can't fold them.
43218 if (!V1->hasOneUse() || !V2->hasOneUse())
43219 return false;
43220
43221 // Ensure that both operations have the same operands. Note that we can
43222 // commute the FADD operands.
43223 SDValue LHS, RHS;
43224 if (V1.getOpcode() == ISD::FSUB) {
43225 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43226 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43227 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43228 return false;
43229 } else {
43230 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43231 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43232 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43233 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43234 return false;
43235 }
43236
43237 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43238 bool Op0Even;
43239 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43240 return false;
43241
43242 // It's a subadd if the vector in the even parity is an FADD.
43243 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43244 : V2->getOpcode() == ISD::FADD;
43245 HasAllowContract =
43246 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43247
43248 Opnd0 = LHS;
43249 Opnd1 = RHS;
43250 return true;
43251}
43252
43253/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43254static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43255 const X86Subtarget &Subtarget,
43256 SelectionDAG &DAG) {
43257 // We only handle target-independent shuffles.
43258 // FIXME: It would be easy and harmless to use the target shuffle mask
43259 // extraction tool to support more.
43260 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43261 return SDValue();
43262
43263 MVT VT = N->getSimpleValueType(0);
43264 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43265 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43266 return SDValue();
43267
43268 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
43269 SDValue Op0 = N->getOperand(0);
43270 SDValue Op1 = N->getOperand(1);
43271 SDValue FMAdd = Op0, FMSub = Op1;
43272 if (FMSub.getOpcode() != X86ISD::FMSUB)
43273 std::swap(FMAdd, FMSub);
43274
43275 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43276 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43277 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43278 FMAdd.getOperand(2) != FMSub.getOperand(2))
43279 return SDValue();
43280
43281 // Check for correct shuffle mask.
43282 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43283 bool Op0Even;
43284 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43285 return SDValue();
43286
43287 // FMAddSub takes zeroth operand from FMSub node.
43288 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43289 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43290 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43291 FMAdd.getOperand(2));
43292}
43293
43294/// Try to combine a shuffle into a target-specific add-sub or
43295 /// mul-add-sub node.
43296static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43297 const X86Subtarget &Subtarget,
43298 SelectionDAG &DAG) {
43299 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43300 return V;
43301
43302 SDValue Opnd0, Opnd1;
43303 bool IsSubAdd;
43304 bool HasAllowContract;
43305 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43306 HasAllowContract))
43307 return SDValue();
43308
43309 MVT VT = N->getSimpleValueType(0);
43310
43311 // Try to generate X86ISD::FMADDSUB node here.
43312 SDValue Opnd2;
43313 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43314 HasAllowContract)) {
43315 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43316 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43317 }
43318
43319 if (IsSubAdd)
43320 return SDValue();
43321
43322 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43323 // the ADDSUB idiom has been successfully recognized. There are no known
43324 // X86 targets with 512-bit ADDSUB instructions!
43325 if (VT.is512BitVector())
43326 return SDValue();
43327
43328 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43329 // the ADDSUB idiom has been successfully recognized. There are no known
43330 // X86 targets with FP16 ADDSUB instructions!
43331 if (VT.getVectorElementType() == MVT::f16)
43332 return SDValue();
43333
43334 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43335}
43336
43337/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43338/// low half of each source vector and does not set any high half elements in
43339 /// the destination vector, narrow the shuffle to half its original size.
43340static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43341 EVT VT = Shuf->getValueType(0);
43342 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43343 return SDValue();
43344 if (!VT.is256BitVector() && !VT.is512BitVector())
43345 return SDValue();
43346
43347 // See if we can ignore all of the high elements of the shuffle.
43348 ArrayRef<int> Mask = Shuf->getMask();
43349 if (!isUndefUpperHalf(Mask))
43350 return SDValue();
43351
43352 // Check if the shuffle mask accesses only the low half of each input vector
43353 // (half-index output is 0 or 2).
43354 int HalfIdx1, HalfIdx2;
43355 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43356 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43357 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43358 return SDValue();
43359
43360 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43361 // The trick is knowing that all of the insert/extract are actually free
43362 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43363 // of narrow inputs into a narrow output, and that is always cheaper than
43364 // the wide shuffle that we started with.
43365 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43366 Shuf->getOperand(1), HalfMask, HalfIdx1,
43367 HalfIdx2, false, DAG, /*UseConcat*/ true);
43368}
43369
43370static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43371 TargetLowering::DAGCombinerInfo &DCI,
43372 const X86Subtarget &Subtarget) {
43373 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43374 if (SDValue V = narrowShuffle(Shuf, DAG))
43375 return V;
43376
43377 // If we have legalized the vector types, look for blends of FADD and FSUB
43378 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43379 SDLoc dl(N);
43380 EVT VT = N->getValueType(0);
43381 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43382 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43383 if (SDValue AddSub =
43384 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43385 return AddSub;
43386
43387 // Attempt to combine into a vector load/broadcast.
43388 if (SDValue LD = combineToConsecutiveLoads(
43389 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43390 return LD;
43391
43392 if (isTargetShuffle(N->getOpcode())) {
43393 SDValue Op(N, 0);
43394 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43395 return Shuffle;
43396
43397 // Try recursively combining arbitrary sequences of x86 shuffle
43398 // instructions into higher-order shuffles. We do this after combining
43399 // specific PSHUF instruction sequences into their minimal form so that we
43400 // can evaluate how many specialized shuffle instructions are involved in
43401 // a particular chain.
43402 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43403 return Res;
43404
43405 // Simplify source operands based on shuffle mask.
43406 // TODO - merge this into combineX86ShufflesRecursively.
43407 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43408 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43409 return SDValue(N, 0);
43410
43411 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43412 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43413 // Perform this after other shuffle combines to allow inner shuffles to be
43414 // combined away first.
43415 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43416 return BinOp;
43417 }
43418
43419 return SDValue();
43420}
43421
43422// Simplify variable target shuffle masks based on the demanded elements.
43423 // TODO: Handle DemandedBits in mask indices as well?
43424bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43425 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43426 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43427 // If we're demanding all elements don't bother trying to simplify the mask.
43428 unsigned NumElts = DemandedElts.getBitWidth();
43429 if (DemandedElts.isAllOnes())
43430 return false;
43431
43432 SDValue Mask = Op.getOperand(MaskIndex);
43433 if (!Mask.hasOneUse())
43434 return false;
43435
43436 // Attempt to generically simplify the variable shuffle mask.
43437 APInt MaskUndef, MaskZero;
43438 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43439 Depth + 1))
43440 return true;
43441
43442 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43443 // TODO: Support other types from getTargetShuffleMaskIndices?
43444 SDValue BC = peekThroughOneUseBitcasts(Mask);
43445 EVT BCVT = BC.getValueType();
43446 auto *Load = dyn_cast<LoadSDNode>(BC);
43447 if (!Load || !Load->getBasePtr().hasOneUse())
43448 return false;
43449
43450 const Constant *C = getTargetConstantFromNode(Load);
43451 if (!C)
43452 return false;
43453
43454 Type *CTy = C->getType();
43455 if (!CTy->isVectorTy() ||
43456 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43457 return false;
43458
43459 // Handle scaling for i64 elements on 32-bit targets.
43460 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43461 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43462 return false;
43463 unsigned Scale = NumCstElts / NumElts;
43464
43465 // Simplify mask if we have an undemanded element that is not undef.
43466 bool Simplified = false;
43467 SmallVector<Constant *, 32> ConstVecOps;
43468 for (unsigned i = 0; i != NumCstElts; ++i) {
43469 Constant *Elt = C->getAggregateElement(i);
43470 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43471 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43472 Simplified = true;
43473 continue;
43474 }
43475 ConstVecOps.push_back(Elt);
43476 }
43477 if (!Simplified)
43478 return false;
43479
43480 // Generate new constant pool entry + legalize immediately for the load.
43481 SDLoc DL(Op);
43482 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43483 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43484 SDValue NewMask = TLO.DAG.getLoad(
43485 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43486 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43487 Load->getAlign());
43488 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43489}
43490
43491bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43492 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43493 TargetLoweringOpt &TLO, unsigned Depth) const {
43494 int NumElts = DemandedElts.getBitWidth();
43495 unsigned Opc = Op.getOpcode();
43496 EVT VT = Op.getValueType();
43497
43498 // Handle special case opcodes.
43499 switch (Opc) {
43500 case X86ISD::PMULDQ:
43501 case X86ISD::PMULUDQ: {
43502 APInt LHSUndef, LHSZero;
43503 APInt RHSUndef, RHSZero;
43504 SDValue LHS = Op.getOperand(0);
43505 SDValue RHS = Op.getOperand(1);
43506 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43507 Depth + 1))
43508 return true;
43509 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43510 Depth + 1))
43511 return true;
43512 // Multiply by zero.
43513 KnownZero = LHSZero | RHSZero;
43514 break;
43515 }
43516 case X86ISD::VPMADDUBSW:
43517 case X86ISD::VPMADDWD: {
43518 APInt LHSUndef, LHSZero;
43519 APInt RHSUndef, RHSZero;
43520 SDValue LHS = Op.getOperand(0);
43521 SDValue RHS = Op.getOperand(1);
43522 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
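    // Each result element is formed from a pair of adjacent source elements,
    // so the demanded-elements mask is scaled up by a factor of 2.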
43523
43524 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43525 Depth + 1))
43526 return true;
43527 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43528 Depth + 1))
43529 return true;
43530
43531 // TODO: Multiply by zero.
43532
43533 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43534 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43535 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43536 Depth + 1))
43537 return true;
43538 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43539 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43540 Depth + 1))
43541 return true;
43542 break;
43543 }
43544 case X86ISD::PSADBW: {
43545 SDValue LHS = Op.getOperand(0);
43546 SDValue RHS = Op.getOperand(1);
43547 assert(VT.getScalarType() == MVT::i64 &&
43548 LHS.getValueType() == RHS.getValueType() &&
43549 LHS.getValueType().getScalarType() == MVT::i8 &&
43550 "Unexpected PSADBW types");
43551
43552 // Aggressively peek through ops to get at the demanded elts.
43553 if (!DemandedElts.isAllOnes()) {
43554 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43555 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43556 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43557 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43558 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43559 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43560 if (NewLHS || NewRHS) {
43561 NewLHS = NewLHS ? NewLHS : LHS;
43562 NewRHS = NewRHS ? NewRHS : RHS;
43563 return TLO.CombineTo(
43564 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43565 }
43566 }
43567 break;
43568 }
43569 case X86ISD::VSHL:
43570 case X86ISD::VSRL:
43571 case X86ISD::VSRA: {
43572 // We only need the bottom 64-bits of the (128-bit) shift amount.
43573 SDValue Amt = Op.getOperand(1);
43574 MVT AmtVT = Amt.getSimpleValueType();
43575 assert(AmtVT.is128BitVector() && "Unexpected value type");
43576
43577 // If the shift amount is only ever reused as an SSE shift amount then we
43578 // know that only the bottom 64 bits are ever used.
43579 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43580 unsigned UseOpc = Use->getOpcode();
43581 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43582 UseOpc == X86ISD::VSRA) &&
43583 Use->getOperand(0) != Amt;
43584 });
43585
43586 APInt AmtUndef, AmtZero;
43587 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43588 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43589 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43590 Depth + 1, AssumeSingleUse))
43591 return true;
43592 [[fallthrough]];
43593 }
43594 case X86ISD::VSHLI:
43595 case X86ISD::VSRLI:
43596 case X86ISD::VSRAI: {
43597 SDValue Src = Op.getOperand(0);
43598 APInt SrcUndef;
43599 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43600 Depth + 1))
43601 return true;
43602
43603 // Fold shift(0,x) -> 0
43604 if (DemandedElts.isSubsetOf(KnownZero))
43605 return TLO.CombineTo(
43606 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43607
43608 // Aggressively peek through ops to get at the demanded elts.
43609 if (!DemandedElts.isAllOnes())
43610 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43611 Src, DemandedElts, TLO.DAG, Depth + 1))
43612 return TLO.CombineTo(
43613 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43614 break;
43615 }
43616 case X86ISD::VPSHA:
43617 case X86ISD::VPSHL:
43618 case X86ISD::VSHLV:
43619 case X86ISD::VSRLV:
43620 case X86ISD::VSRAV: {
43621 APInt LHSUndef, LHSZero;
43622 APInt RHSUndef, RHSZero;
43623 SDValue LHS = Op.getOperand(0);
43624 SDValue RHS = Op.getOperand(1);
43625 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43626 Depth + 1))
43627 return true;
43628
43629 // Fold shift(0,x) -> 0
43630 if (DemandedElts.isSubsetOf(LHSZero))
43631 return TLO.CombineTo(
43632 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43633
43634 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43635 Depth + 1))
43636 return true;
43637
43638 KnownZero = LHSZero;
43639 break;
43640 }
43641 case X86ISD::CMPM:
43642 case X86ISD::CMPP: {
43643 // Scalarize packed fp comparison if we only require element 0.
43644 if (DemandedElts == 1) {
43645 SDLoc dl(Op);
43646 MVT VT = Op.getSimpleValueType();
43647 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43648 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43649 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43650 SDValue CC = Op.getOperand(2);
43651 if (Opc == X86ISD::CMPM) {
43652 SDValue Cmp =
43653 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43654 return TLO.CombineTo(
43655 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43656 }
43657 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43658 return TLO.CombineTo(Op,
43659 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43660 }
43661 break;
43662 }
43663 case X86ISD::PCMPEQ:
43664 case X86ISD::PCMPGT: {
43665 APInt LHSUndef, LHSZero;
43666 APInt RHSUndef, RHSZero;
43667 SDValue LHS = Op.getOperand(0);
43668 SDValue RHS = Op.getOperand(1);
43669 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43670 Depth + 1))
43671 return true;
43672 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43673 Depth + 1))
43674 return true;
43675 break;
43676 }
43677 case X86ISD::KSHIFTL: {
43678 SDValue Src = Op.getOperand(0);
43679 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43680 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43681 unsigned ShiftAmt = Amt->getZExtValue();
43682
43683 if (ShiftAmt == 0)
43684 return TLO.CombineTo(Op, Src);
43685
43686 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43687 // single shift. We can do this if the bottom bits (which are shifted
43688 // out) are never demanded.
43689 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43690 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43691 unsigned C1 = Src.getConstantOperandVal(1);
43692 unsigned NewOpc = X86ISD::KSHIFTL;
43693 int Diff = ShiftAmt - C1;
43694 if (Diff < 0) {
43695 Diff = -Diff;
43696 NewOpc = X86ISD::KSHIFTR;
43697 }
43698
43699 SDLoc dl(Op);
43700 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43701 return TLO.CombineTo(
43702 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43703 }
43704 }
43705
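    // KSHIFTL moves source element i to result element i + ShiftAmt, so the
    // demanded source elements are the demanded results shifted down.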
43706 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43707 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43708 Depth + 1))
43709 return true;
43710
43711 KnownUndef <<= ShiftAmt;
43712 KnownZero <<= ShiftAmt;
43713 KnownZero.setLowBits(ShiftAmt);
43714 break;
43715 }
43716 case X86ISD::KSHIFTR: {
43717 SDValue Src = Op.getOperand(0);
43718 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43719 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43720 unsigned ShiftAmt = Amt->getZExtValue();
43721
43722 if (ShiftAmt == 0)
43723 return TLO.CombineTo(Op, Src);
43724
43725 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43726 // single shift. We can do this if the top bits (which are shifted
43727 // out) are never demanded.
43728 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43729 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43730 unsigned C1 = Src.getConstantOperandVal(1);
43731 unsigned NewOpc = X86ISD::KSHIFTR;
43732 int Diff = ShiftAmt - C1;
43733 if (Diff < 0) {
43734 Diff = -Diff;
43735 NewOpc = X86ISD::KSHIFTL;
43736 }
43737
43738 SDLoc dl(Op);
43739 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43740 return TLO.CombineTo(
43741 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43742 }
43743 }
43744
43745 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43746 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43747 Depth + 1))
43748 return true;
43749
43750 KnownUndef.lshrInPlace(ShiftAmt);
43751 KnownZero.lshrInPlace(ShiftAmt);
43752 KnownZero.setHighBits(ShiftAmt);
43753 break;
43754 }
43755 case X86ISD::ANDNP: {
43756 // ANDNP = (~LHS & RHS);
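// Editorial example: if RHS is a constant whose element I is known zero, the
// corresponding LHS element is never demanded, since that lane is zero anyway.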
43757 SDValue LHS = Op.getOperand(0);
43758 SDValue RHS = Op.getOperand(1);
43759
43760 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43761 APInt UndefElts;
43762 SmallVector<APInt> EltBits;
43763 int NumElts = VT.getVectorNumElements();
43764 int EltSizeInBits = VT.getScalarSizeInBits();
43765 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43766 APInt OpElts = DemandedElts;
43767 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43768 EltBits)) {
43769 OpBits.clearAllBits();
43770 OpElts.clearAllBits();
43771 for (int I = 0; I != NumElts; ++I) {
43772 if (!DemandedElts[I])
43773 continue;
43774 if (UndefElts[I]) {
43775 // We can't assume an undef src element gives an undef dst - the
43776 // other src might be zero.
43777 OpBits.setAllBits();
43778 OpElts.setBit(I);
43779 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43780 (!Invert && !EltBits[I].isZero())) {
43781 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43782 OpElts.setBit(I);
43783 }
43784 }
43785 }
43786 return std::make_pair(OpBits, OpElts);
43787 };
43788 APInt BitsLHS, EltsLHS;
43789 APInt BitsRHS, EltsRHS;
43790 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43791 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43792
43793 APInt LHSUndef, LHSZero;
43794 APInt RHSUndef, RHSZero;
43795 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43796 Depth + 1))
43797 return true;
43798 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43799 Depth + 1))
43800 return true;
43801
43802 if (!DemandedElts.isAllOnes()) {
43803 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43804 TLO.DAG, Depth + 1);
43805 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43806 TLO.DAG, Depth + 1);
43807 if (NewLHS || NewRHS) {
43808 NewLHS = NewLHS ? NewLHS : LHS;
43809 NewRHS = NewRHS ? NewRHS : RHS;
43810 return TLO.CombineTo(
43811 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43812 }
43813 }
43814 break;
43815 }
43816 case X86ISD::CVTSI2P:
43817 case X86ISD::CVTUI2P:
43818 case X86ISD::CVTPH2PS:
43819 case X86ISD::CVTPS2PH: {
43820 SDValue Src = Op.getOperand(0);
43821 EVT SrcVT = Src.getValueType();
43822 APInt SrcUndef, SrcZero;
43823 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43824 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43825 Depth + 1))
43826 return true;
43827 break;
43828 }
43829 case X86ISD::PACKSS:
43830 case X86ISD::PACKUS: {
43831 SDValue N0 = Op.getOperand(0);
43832 SDValue N1 = Op.getOperand(1);
43833
43834 APInt DemandedLHS, DemandedRHS;
43835 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43836
43837 APInt LHSUndef, LHSZero;
43838 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43839 Depth + 1))
43840 return true;
43841 APInt RHSUndef, RHSZero;
43842 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43843 Depth + 1))
43844 return true;
43845
43846 // TODO - pass on known zero/undef.
43847
43848 // Aggressively peek through ops to get at the demanded elts.
43849 // TODO - we should do this for all target/faux shuffle ops.
43850 if (!DemandedElts.isAllOnes()) {
43851 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43852 TLO.DAG, Depth + 1);
43853 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43854 TLO.DAG, Depth + 1);
43855 if (NewN0 || NewN1) {
43856 NewN0 = NewN0 ? NewN0 : N0;
43857 NewN1 = NewN1 ? NewN1 : N1;
43858 return TLO.CombineTo(Op,
43859 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43860 }
43861 }
43862 break;
43863 }
43864 case X86ISD::HADD:
43865 case X86ISD::HSUB:
43866 case X86ISD::FHADD:
43867 case X86ISD::FHSUB: {
43868 SDValue N0 = Op.getOperand(0);
43869 SDValue N1 = Op.getOperand(1);
43870
43871 APInt DemandedLHS, DemandedRHS;
43872 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43873
43874 APInt LHSUndef, LHSZero;
43875 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43876 Depth + 1))
43877 return true;
43878 APInt RHSUndef, RHSZero;
43879 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43880 Depth + 1))
43881 return true;
43882
43883 // TODO - pass on known zero/undef.
43884
43885 // Aggressively peek through ops to get at the demanded elts.
43886 // TODO: Handle repeated operands.
43887 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43888 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43889 TLO.DAG, Depth + 1);
43890 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43891 TLO.DAG, Depth + 1);
43892 if (NewN0 || NewN1) {
43893 NewN0 = NewN0 ? NewN0 : N0;
43894 NewN1 = NewN1 ? NewN1 : N1;
43895 return TLO.CombineTo(Op,
43896 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43897 }
43898 }
43899 break;
43900 }
43901 case X86ISD::VTRUNC:
43902 case X86ISD::VTRUNCS:
43903 case X86ISD::VTRUNCUS: {
43904 SDValue Src = Op.getOperand(0);
43905 MVT SrcVT = Src.getSimpleValueType();
43906 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43907 APInt SrcUndef, SrcZero;
43908 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43909 Depth + 1))
43910 return true;
43911 KnownZero = SrcZero.zextOrTrunc(NumElts);
43912 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43913 break;
43914 }
43915 case X86ISD::BLENDI: {
43916 SmallVector<int, 16> BlendMask;
43917 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43918 if (SDValue R = combineBlendOfPermutes(
43919 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43920 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43921 return TLO.CombineTo(Op, R);
43922 break;
43923 }
43924 case X86ISD::BLENDV: {
43925 APInt SelUndef, SelZero;
43926 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43927 SelZero, TLO, Depth + 1))
43928 return true;
43929
43930 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43931 APInt LHSUndef, LHSZero;
43932 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43933 LHSZero, TLO, Depth + 1))
43934 return true;
43935
43936 APInt RHSUndef, RHSZero;
43937 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43938 RHSZero, TLO, Depth + 1))
43939 return true;
43940
43941 KnownZero = LHSZero & RHSZero;
43942 KnownUndef = LHSUndef & RHSUndef;
43943 break;
43944 }
43945 case X86ISD::VZEXT_MOVL: {
43946 // If the demanded upper elements are already zero then we have nothing to do.
43947 SDValue Src = Op.getOperand(0);
43948 APInt DemandedUpperElts = DemandedElts;
43949 DemandedUpperElts.clearLowBits(1);
43950 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43951 return TLO.CombineTo(Op, Src);
43952 break;
43953 }
43954 case X86ISD::VZEXT_LOAD: {
43955 // If the upper elements are not demanded then simplify to a
43956 // scalar_to_vector(load()).
43957 MVT SVT = VT.getSimpleVT().getVectorElementType();
43958 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43959 SDLoc DL(Op);
43960 auto *Mem = cast<MemSDNode>(Op);
43961 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43962 Mem->getMemOperand());
43963 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43964 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43965 }
43966 break;
43967 }
43968 case X86ISD::VBROADCAST: {
43969 SDValue Src = Op.getOperand(0);
43970 MVT SrcVT = Src.getSimpleValueType();
43971 // Don't bother broadcasting if we just need the 0'th element.
43972 if (DemandedElts == 1) {
43973 if (!SrcVT.isVector())
43974 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
43975 else if (Src.getValueType() != VT)
43976 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43977 SDLoc(Op));
43978 return TLO.CombineTo(Op, Src);
43979 }
43980 if (!SrcVT.isVector())
43981 break;
43982 APInt SrcUndef, SrcZero;
43983 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43984 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43985 Depth + 1))
43986 return true;
43987 // Aggressively peek through src to get at the demanded elt.
43988 // TODO - we should do this for all target/faux shuffle ops.
43989 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43990 Src, SrcElts, TLO.DAG, Depth + 1))
43991 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43992 break;
43993 }
43994 case X86ISD::VPERMV:
43995 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43996 Depth))
43997 return true;
43998 break;
43999 case X86ISD::PSHUFB:
44000 case X86ISD::VPERMV3:
44001 case X86ISD::VPERMILPV:
44002 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44003 Depth))
44004 return true;
44005 break;
44006 case X86ISD::VPPERM:
44007 case X86ISD::VPERMIL2:
44008 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44009 Depth))
44010 return true;
44011 break;
44012 }
44013
44014 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44015 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44016 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44017 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44018 DemandedElts.lshr(NumElts / 2) == 0) {
44019 unsigned SizeInBits = VT.getSizeInBits();
44020 unsigned ExtSizeInBits = SizeInBits / 2;
44021
44022 // See if 512-bit ops only use the bottom 128-bits.
44023 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44024 ExtSizeInBits = SizeInBits / 4;
44025
44026 switch (Opc) {
44027 // Scalar broadcast.
44028 case X86ISD::VBROADCAST: {
44029 SDLoc DL(Op);
44030 SDValue Src = Op.getOperand(0);
44031 if (Src.getValueSizeInBits() > ExtSizeInBits)
44032 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44033 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44034 ExtSizeInBits / VT.getScalarSizeInBits());
44035 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44036 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44037 TLO.DAG, DL, ExtSizeInBits));
44038 }
44039 case X86ISD::VBROADCAST_LOAD: {
44040 SDLoc DL(Op);
44041 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44042 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44043 ExtSizeInBits / VT.getScalarSizeInBits());
44044 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44045 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44046 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44047 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44048 MemIntr->getMemOperand());
44049 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44050 Bcst.getValue(1));
44051 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44052 TLO.DAG, DL, ExtSizeInBits));
44053 }
44054 // Subvector broadcast.
44055 case X86ISD::SUBV_BROADCAST_LOAD: {
44056 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44057 EVT MemVT = MemIntr->getMemoryVT();
44058 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44059 SDLoc DL(Op);
44060 SDValue Ld =
44061 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44062 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44063 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44064 Ld.getValue(1));
44065 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44066 TLO.DAG, DL, ExtSizeInBits));
44067 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44068 SDLoc DL(Op);
44069 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44070 ExtSizeInBits / VT.getScalarSizeInBits());
44071 if (SDValue BcstLd =
44072 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44073 return TLO.CombineTo(Op,
44074 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44075 TLO.DAG, DL, ExtSizeInBits));
44076 }
44077 break;
44078 }
44079 // Byte shifts by immediate.
44080 case X86ISD::VSHLDQ:
44081 case X86ISD::VSRLDQ:
44082 // Shift by uniform.
44083 case X86ISD::VSHL:
44084 case X86ISD::VSRL:
44085 case X86ISD::VSRA:
44086 // Shift by immediate.
44087 case X86ISD::VSHLI:
44088 case X86ISD::VSRLI:
44089 case X86ISD::VSRAI: {
44090 SDLoc DL(Op);
44091 SDValue Ext0 =
44092 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44093 SDValue ExtOp =
44094 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44095 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44096 SDValue Insert =
44097 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44098 return TLO.CombineTo(Op, Insert);
44099 }
44100 case X86ISD::VPERMI: {
44101 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44102 // TODO: This should be done in shuffle combining.
44103 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44104 SmallVector<int, 4> Mask;
44105 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44106 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44107 SDLoc DL(Op);
44108 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44109 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44110 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44111 return TLO.CombineTo(Op, Insert);
44112 }
44113 }
44114 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44115 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44116 SDLoc DL(Op);
44117 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44118 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44119 Op.getOperand(1));
44120 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44121 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44122 return TLO.CombineTo(Op, Insert);
44123 }
44124 break;
44125 }
44126 case X86ISD::VPERMV: {
44127 SmallVector<int, 16> Mask;
44128 SmallVector<SDValue, 2> Ops;
44129 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44130 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44131 VT == MVT::v16f32) &&
44132 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44133 // For lane-crossing shuffles, only split in half in case we're still
44134 // referencing higher elements.
44135 unsigned HalfElts = NumElts / 2;
44136 unsigned HalfSize = SizeInBits / 2;
44137 Mask.resize(HalfElts);
44138 if (all_of(Mask,
44139 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44140 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44141 SDLoc DL(Op);
44142 SDValue Ext;
44143 SDValue M =
44144 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44145 SDValue V =
44146 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44147 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44148 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44149 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44150 else {
44151 MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
44152 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44153 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44154 TLO.DAG.getBitcast(ShufVT, V), M);
44155 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44156 }
44157 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44158 Subtarget, TLO.DAG, DL, SizeInBits);
44159 return TLO.CombineTo(Op, Insert);
44160 }
44161 }
44162 break;
44163 }
44164 case X86ISD::VPERMV3: {
44165 SmallVector<int, 16> Mask;
44166 SmallVector<SDValue, 2> Ops;
44167 if (Subtarget.hasVLX() &&
44168 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44169 // For lane-crossing shuffles, only split in half in case we're still
44170 // referencing higher elements.
44171 unsigned HalfElts = NumElts / 2;
44172 unsigned HalfSize = SizeInBits / 2;
44173 Mask.resize(HalfElts);
44174 if (all_of(Mask, [&](int M) {
44175 return isUndefOrInRange(M, 0, HalfElts) ||
44176 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44177 })) {
44178 // Adjust mask elements for 2nd operand to point to half width.
44179 for (int &M : Mask)
44180 M = (M < NumElts) ? M : (M - HalfElts);
44181 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44182 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44183 SDLoc DL(Op);
44184 SDValue Ext = TLO.DAG.getNode(
44185 Opc, DL, HalfVT,
44186 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44187 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44188 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44189 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44190 Subtarget, TLO.DAG, DL, SizeInBits);
44191 return TLO.CombineTo(Op, Insert);
44192 }
44193 }
44194 break;
44195 }
44196 case X86ISD::VPERM2X128: {
44197 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
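// Worked example (editorial): with only the low half demanded, a low nibble of
// 0x3 extracts the upper 128 bits of operand 1; bit 3 (0x8) set folds to zero.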
44198 SDLoc DL(Op);
44199 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44200 if (LoMask & 0x8)
44201 return TLO.CombineTo(
44202 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44203 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44204 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44205 SDValue ExtOp =
44206 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44207 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44208 SDValue Insert =
44209 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44210 return TLO.CombineTo(Op, Insert);
44211 }
44212 // Conversions.
44213 // TODO: Add more CVT opcodes when we have test coverage.
44214 case X86ISD::CVTTP2UI: {
44215 if (!Subtarget.hasVLX())
44216 break;
44217 [[fallthrough]];
44218 }
44219 case X86ISD::CVTTP2SI: {
44220 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44221 !Subtarget.hasVLX())
44222 break;
44223 [[fallthrough]];
44224 }
44225 case X86ISD::CVTPH2PS: {
44226 SDLoc DL(Op);
44227 unsigned Scale = SizeInBits / ExtSizeInBits;
44228 SDValue SrcOp = Op.getOperand(0);
44229 MVT SrcVT = SrcOp.getSimpleValueType();
44230 unsigned SrcExtSize =
44231 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44232 MVT ExtVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
44233 ExtSizeInBits / VT.getScalarSizeInBits());
44234 SDValue ExtOp = TLO.DAG.getNode(
44235 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44236 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44237 SDValue Insert =
44238 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44239 return TLO.CombineTo(Op, Insert);
44240 }
44241 // Zero upper elements.
44242 case X86ISD::VZEXT_MOVL:
44243 // Variable blend.
44244 case X86ISD::BLENDV:
44245 // Target unary shuffles:
44246 case X86ISD::MOVDDUP:
44247 // Target unary shuffles by immediate:
44248 case X86ISD::PSHUFD:
44249 case X86ISD::PSHUFLW:
44250 case X86ISD::PSHUFHW:
44251 case X86ISD::VPERMILPI:
44252 // (Non-Lane Crossing) Target Shuffles.
44253 case X86ISD::VPERMILPV:
44254 case X86ISD::VPERMIL2:
44255 case X86ISD::PSHUFB:
44256 case X86ISD::UNPCKL:
44257 case X86ISD::UNPCKH:
44258 case X86ISD::BLENDI:
44259 // Integer ops.
44260 case X86ISD::PACKSS:
44261 case X86ISD::PACKUS:
44262 case X86ISD::PCMPEQ:
44263 case X86ISD::PCMPGT:
44264 case X86ISD::PMULUDQ:
44265 case X86ISD::PMULDQ:
44266 case X86ISD::VSHLV:
44267 case X86ISD::VSRLV:
44268 case X86ISD::VSRAV:
44269 // Float ops.
44270 case X86ISD::FMAX:
44271 case X86ISD::FMIN:
44272 case X86ISD::FMAXC:
44273 case X86ISD::FMINC:
44274 case X86ISD::FRSQRT:
44275 case X86ISD::FRCP:
44276 // Horizontal Ops.
44277 case X86ISD::HADD:
44278 case X86ISD::HSUB:
44279 case X86ISD::FHADD:
44280 case X86ISD::FHSUB: {
44281 SDLoc DL(Op);
44282 SmallVector<SDValue, 4> Ops;
44283 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44284 SDValue SrcOp = Op.getOperand(i);
44285 EVT SrcVT = SrcOp.getValueType();
44286 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44287 "Unsupported vector size");
44288 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44289 ExtSizeInBits)
44290 : SrcOp);
44291 }
44292 MVT ExtVT = VT.getSimpleVT();
44293 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44294 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44295 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44296 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44297 SDValue Insert =
44298 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44299 return TLO.CombineTo(Op, Insert);
44300 }
44301 }
44302 }
44303
44304 // For splats, unless we *only* demand the 0'th element,
44305 // stop attempts at simplification here - we aren't going to improve things,
44306 // and this is better than any potential shuffle.
44307 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44308 return false;
44309
44310 // Get target/faux shuffle mask.
44311 APInt OpUndef, OpZero;
44312 SmallVector<int, 64> OpMask;
44313 SmallVector<SDValue, 2> OpInputs;
44314 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44315 OpZero, TLO.DAG, Depth, false))
44316 return false;
44317
44318 // Shuffle inputs must be the same size as the result.
44319 if (OpMask.size() != (unsigned)NumElts ||
44320 llvm::any_of(OpInputs, [VT](SDValue V) {
44321 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44322 !V.getValueType().isVector();
44323 }))
44324 return false;
44325
44326 KnownZero = OpZero;
44327 KnownUndef = OpUndef;
44328
44329 // Check if shuffle mask can be simplified to undef/zero/identity.
44330 int NumSrcs = OpInputs.size();
44331 for (int i = 0; i != NumElts; ++i)
44332 if (!DemandedElts[i])
44333 OpMask[i] = SM_SentinelUndef;
44334
44335 if (isUndefInRange(OpMask, 0, NumElts)) {
44336 KnownUndef.setAllBits();
44337 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44338 }
44339 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44340 KnownZero.setAllBits();
44341 return TLO.CombineTo(
44342 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44343 }
44344 for (int Src = 0; Src != NumSrcs; ++Src)
44345 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44346 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44347
44348 // Attempt to simplify inputs.
44349 for (int Src = 0; Src != NumSrcs; ++Src) {
44350 // TODO: Support inputs of different types.
44351 if (OpInputs[Src].getValueType() != VT)
44352 continue;
44353
44354 int Lo = Src * NumElts;
44355 APInt SrcElts = APInt::getZero(NumElts);
44356 for (int i = 0; i != NumElts; ++i)
44357 if (DemandedElts[i]) {
44358 int M = OpMask[i] - Lo;
44359 if (0 <= M && M < NumElts)
44360 SrcElts.setBit(M);
44361 }
44362
44363 // TODO - Propagate input undef/zero elts.
44364 APInt SrcUndef, SrcZero;
44365 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44366 TLO, Depth + 1))
44367 return true;
44368 }
44369
44370 // If we don't demand all elements, then attempt to combine to a simpler
44371 // shuffle.
44372 // We need to convert the depth to something combineX86ShufflesRecursively
44373 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44374 // to match. This prevents combineX86ShuffleChain from returning a
44375 // combined shuffle that's the same as the original root, causing an
44376 // infinite loop.
44377 if (!DemandedElts.isAllOnes()) {
44378 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44379
44380 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44381 for (int i = 0; i != NumElts; ++i)
44382 if (DemandedElts[i])
44383 DemandedMask[i] = i;
44384
44385 SDValue NewShuffle = combineX86ShufflesRecursively(
44386 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44387 X86::MaxShuffleCombineDepth - Depth,
44388 /*AllowVariableCrossLaneMask=*/true,
44389 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44390 TLO.DAG, SDLoc(Op), Subtarget);
44391 if (NewShuffle)
44392 return TLO.CombineTo(Op, NewShuffle);
44393 }
44394
44395 return false;
44396}
44397
44398 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
44399 SDValue Op, const APInt &OriginalDemandedBits,
44400 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44401 unsigned Depth) const {
44402 EVT VT = Op.getValueType();
44403 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44404 unsigned Opc = Op.getOpcode();
44405 switch(Opc) {
44406 case X86ISD::VTRUNC: {
44407 KnownBits KnownOp;
44408 SDValue Src = Op.getOperand(0);
44409 MVT SrcVT = Src.getSimpleValueType();
44410
44411 // Simplify the input, using demanded bit information.
44412 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44413 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44414 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44415 return true;
44416 break;
44417 }
44418 case X86ISD::PMULDQ:
44419 case X86ISD::PMULUDQ: {
44420 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44421 KnownBits KnownLHS, KnownRHS;
44422 SDValue LHS = Op.getOperand(0);
44423 SDValue RHS = Op.getOperand(1);
44424
44425 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44426 // FIXME: Can we bound this better?
44427 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44428 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44429 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44430
44431 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44432 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44433 DemandedMaskLHS = DemandedMask;
44434 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44435 DemandedMaskRHS = DemandedMask;
44436
44437 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44438 KnownLHS, TLO, Depth + 1))
44439 return true;
44440 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44441 KnownRHS, TLO, Depth + 1))
44442 return true;
44443
44444 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44445 KnownRHS = KnownRHS.trunc(32);
44446 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44447 KnownRHS.getConstant().isOne()) {
44448 SDLoc DL(Op);
44449 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44450 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44451 }
44452
44453 // Aggressively peek through ops to get at the demanded low bits.
44454 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44455 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44456 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44457 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44458 if (DemandedLHS || DemandedRHS) {
44459 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44460 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44461 return TLO.CombineTo(
44462 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44463 }
44464 break;
44465 }
44466 case X86ISD::ANDNP: {
44467 KnownBits Known2;
44468 SDValue Op0 = Op.getOperand(0);
44469 SDValue Op1 = Op.getOperand(1);
44470
44471 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44472 Known, TLO, Depth + 1))
44473 return true;
44474
44475 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44476 OriginalDemandedElts, Known2, TLO, Depth + 1))
44477 return true;
44478
44479 // If the RHS is a constant, see if we can simplify it.
44480 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44481 OriginalDemandedElts, TLO))
44482 return true;
44483
44484 // ANDNP = (~Op0 & Op1);
44485 Known.One &= Known2.Zero;
44486 Known.Zero |= Known2.One;
44487 break;
44488 }
44489 case X86ISD::VSHLI: {
44490 SDValue Op0 = Op.getOperand(0);
44491 SDValue Op1 = Op.getOperand(1);
44492
44493 unsigned ShAmt = Op1->getAsZExtVal();
44494 if (ShAmt >= BitWidth)
44495 break;
44496
44497 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44498
44499 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44500 // single shift. We can do this if the bottom bits (which are shifted
44501 // out) are never demanded.
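// e.g. (editorial) if the low 5 demanded bits are unused:
// vshli(vsrli(X, 3), 5) -> vshli(X, 2); a negative Diff would use VSRLI instead.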
44502 if (Op0.getOpcode() == X86ISD::VSRLI &&
44503 OriginalDemandedBits.countr_zero() >= ShAmt) {
44504 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44505 if (Shift2Amt < BitWidth) {
44506 int Diff = ShAmt - Shift2Amt;
44507 if (Diff == 0)
44508 return TLO.CombineTo(Op, Op0.getOperand(0));
44509
44510 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44511 SDValue NewShift = TLO.DAG.getNode(
44512 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44513 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44514 return TLO.CombineTo(Op, NewShift);
44515 }
44516 }
44517
44518 // If we are only demanding sign bits then we can use the shift source directly.
44519 unsigned NumSignBits =
44520 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44521 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44522 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44523 return TLO.CombineTo(Op, Op0);
44524
44525 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44526 TLO, Depth + 1))
44527 return true;
44528
44529 Known <<= ShAmt;
44530
44531 // Low bits known zero.
44532 Known.Zero.setLowBits(ShAmt);
44533
44534 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44535 // Attempt to avoid multi-use ops if we don't need anything from them.
44536 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44537 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44538 SDValue NewOp =
44539 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44540 return TLO.CombineTo(Op, NewOp);
44541 }
44542 }
44543 return false;
44544 }
44545 case X86ISD::VSRLI: {
44546 SDValue Op0 = Op.getOperand(0);
44547 SDValue Op1 = Op.getOperand(1);
44548
44549 unsigned ShAmt = Op1->getAsZExtVal();
44550 if (ShAmt >= BitWidth)
44551 break;
44552
44553 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44554
44555 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44556 TLO, Depth + 1))
44557 return true;
44558
44559 Known >>= ShAmt;
44560
44561 // High bits known zero.
44562 Known.Zero.setHighBits(ShAmt);
44563
44564 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44565 // Attempt to avoid multi-use ops if we don't need anything from them.
44566 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44567 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44568 SDValue NewOp =
44569 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44570 return TLO.CombineTo(Op, NewOp);
44571 }
44572 }
44573 return false;
44574 }
44575 case X86ISD::VSRAI: {
44576 SDValue Op0 = Op.getOperand(0);
44577 SDValue Op1 = Op.getOperand(1);
44578
44579 unsigned ShAmt = Op1->getAsZExtVal();
44580 if (ShAmt >= BitWidth)
44581 break;
44582
44583 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44584
44585 // If we just want the sign bit then we don't need to shift it.
44586 if (OriginalDemandedBits.isSignMask())
44587 return TLO.CombineTo(Op, Op0);
44588
44589 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44590 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44591 SDValue Op00 = Op0.getOperand(0);
44592 unsigned NumSignBits =
44593 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44594 if (ShAmt < NumSignBits)
44595 return TLO.CombineTo(Op, Op00);
44596 }
44597
44598 // If any of the demanded bits are produced by the sign extension, we also
44599 // demand the input sign bit.
44600 if (OriginalDemandedBits.countl_zero() < ShAmt)
44601 DemandedMask.setSignBit();
44602
44603 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44604 TLO, Depth + 1))
44605 return true;
44606
44607 Known >>= ShAmt;
44608
44609 // If the input sign bit is known to be zero, or if none of the top bits
44610 // are demanded, turn this into an unsigned shift right.
44611 if (Known.Zero[BitWidth - ShAmt - 1] ||
44612 OriginalDemandedBits.countl_zero() >= ShAmt)
44613 return TLO.CombineTo(
44614 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44615
44616 // High bits are known one.
44617 if (Known.One[BitWidth - ShAmt - 1])
44618 Known.One.setHighBits(ShAmt);
44619
44620 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44621 // Attempt to avoid multi-use ops if we don't need anything from them.
44622 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44623 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44624 SDValue NewOp =
44625 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44626 return TLO.CombineTo(Op, NewOp);
44627 }
44628 }
44629 return false;
44630 }
44631 case X86ISD::BLENDI: {
44632 SDValue LHS = Op.getOperand(0);
44633 SDValue RHS = Op.getOperand(1);
44634 APInt Mask = getBLENDIBlendMask(Op);
44635
44636 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44637 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44638 TLO, Depth + 1))
44639 return true;
44640
44641 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44642 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44643 TLO, Depth + 1))
44644 return true;
44645
44646 // Attempt to avoid multi-use ops if we don't need anything from them.
44647 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44648 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44649 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44650 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44651 if (NewLHS || NewRHS) {
44652 NewLHS = NewLHS ? NewLHS : LHS;
44653 NewRHS = NewRHS ? NewRHS : RHS;
44654 return TLO.CombineTo(Op,
44655 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44656 NewLHS, NewRHS, Op.getOperand(2)));
44657 }
44658 break;
44659 }
44660 case X86ISD::BLENDV: {
44661 SDValue Sel = Op.getOperand(0);
44662 SDValue LHS = Op.getOperand(1);
44663 SDValue RHS = Op.getOperand(2);
44664
44665 APInt SignMask = APInt::getSignMask(BitWidth);
44666 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44667 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44668 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44669 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44670 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44671 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44672
44673 if (NewSel || NewLHS || NewRHS) {
44674 NewSel = NewSel ? NewSel : Sel;
44675 NewLHS = NewLHS ? NewLHS : LHS;
44676 NewRHS = NewRHS ? NewRHS : RHS;
44677 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44678 NewSel, NewLHS, NewRHS));
44679 }
44680 break;
44681 }
44682 case X86ISD::PEXTRB:
44683 case X86ISD::PEXTRW: {
44684 SDValue Vec = Op.getOperand(0);
44685 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44686 MVT VecVT = Vec.getSimpleValueType();
44687 unsigned NumVecElts = VecVT.getVectorNumElements();
44688
44689 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44690 unsigned Idx = CIdx->getZExtValue();
44691 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44692
44693 // If we demand no bits from the vector then we must have demanded
44694 // bits from the implicit zext - simplify to zero.
44695 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44696 if (DemandedVecBits == 0)
44697 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44698
44699 APInt KnownUndef, KnownZero;
44700 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44701 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44702 KnownZero, TLO, Depth + 1))
44703 return true;
44704
44705 KnownBits KnownVec;
44706 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44707 KnownVec, TLO, Depth + 1))
44708 return true;
44709
44710 if (SDValue V = SimplifyMultipleUseDemandedBits(
44711 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44712 return TLO.CombineTo(
44713 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44714
44715 Known = KnownVec.zext(BitWidth);
44716 return false;
44717 }
44718 break;
44719 }
44720 case X86ISD::PINSRB:
44721 case X86ISD::PINSRW: {
44722 SDValue Vec = Op.getOperand(0);
44723 SDValue Scl = Op.getOperand(1);
44724 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44725 MVT VecVT = Vec.getSimpleValueType();
44726
44727 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44728 unsigned Idx = CIdx->getZExtValue();
44729 if (!OriginalDemandedElts[Idx])
44730 return TLO.CombineTo(Op, Vec);
44731
44732 KnownBits KnownVec;
44733 APInt DemandedVecElts(OriginalDemandedElts);
44734 DemandedVecElts.clearBit(Idx);
44735 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44736 KnownVec, TLO, Depth + 1))
44737 return true;
44738
44739 KnownBits KnownScl;
44740 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44741 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44742 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44743 return true;
44744
44745 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44746 Known = KnownVec.intersectWith(KnownScl);
44747 return false;
44748 }
44749 break;
44750 }
44751 case X86ISD::PACKSS:
44752 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44753 // sign bit then we can just ask for the source operands sign bit.
44754 // TODO - add known bits handling.
44755 if (OriginalDemandedBits.isSignMask()) {
44756 APInt DemandedLHS, DemandedRHS;
44757 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44758
44759 KnownBits KnownLHS, KnownRHS;
44760 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44761 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44762 KnownLHS, TLO, Depth + 1))
44763 return true;
44764 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44765 KnownRHS, TLO, Depth + 1))
44766 return true;
44767
44768 // Attempt to avoid multi-use ops if we don't need anything from them.
44769 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44770 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44771 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44772 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44773 if (DemandedOp0 || DemandedOp1) {
44774 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44775 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44776 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44777 }
44778 }
44779 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44780 break;
44781 case X86ISD::VBROADCAST: {
44782 SDValue Src = Op.getOperand(0);
44783 MVT SrcVT = Src.getSimpleValueType();
44784 APInt DemandedElts = APInt::getOneBitSet(
44785 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44786 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44787 TLO, Depth + 1))
44788 return true;
44789 // If we don't need the upper bits, attempt to narrow the broadcast source.
44790 // Don't attempt this on AVX512 as it might affect broadcast folding.
44791 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44792 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44793 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44794 Src->hasOneUse()) {
44795 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44796 SDValue NewSrc =
44797 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44798 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44799 SDValue NewBcst =
44800 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44801 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44802 }
44803 break;
44804 }
44805 case X86ISD::PCMPGT:
44806 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44807 // iff we only need the sign bit then we can use R directly.
44808 if (OriginalDemandedBits.isSignMask() &&
44809 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44810 return TLO.CombineTo(Op, Op.getOperand(1));
44811 break;
44812 case X86ISD::MOVMSK: {
44813 SDValue Src = Op.getOperand(0);
44814 MVT SrcVT = Src.getSimpleValueType();
44815 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44816 unsigned NumElts = SrcVT.getVectorNumElements();
44817
44818 // If we don't need the sign bits at all just return zero.
44819 if (OriginalDemandedBits.countr_zero() >= NumElts)
44820 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44821
44822 // See if we only demand bits from the lower 128-bit vector.
44823 if (SrcVT.is256BitVector() &&
44824 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44825 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44826 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44827 }
44828
44829 // Only demand the vector elements of the sign bits we need.
44830 APInt KnownUndef, KnownZero;
44831 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44832 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44833 TLO, Depth + 1))
44834 return true;
44835
44836 Known.Zero = KnownZero.zext(BitWidth);
44837 Known.Zero.setHighBits(BitWidth - NumElts);
44838
44839 // MOVMSK only uses the MSB from each vector element.
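// e.g. (editorial) for a v4i32 source only bit 31 of each element is demanded,
// and only the low 4 bits of the scalar result can be non-zero.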
44840 KnownBits KnownSrc;
44841 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44842 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44843 Depth + 1))
44844 return true;
44845
44846 if (KnownSrc.One[SrcBits - 1])
44847 Known.One.setLowBits(NumElts);
44848 else if (KnownSrc.Zero[SrcBits - 1])
44849 Known.Zero.setLowBits(NumElts);
44850
44851 // Attempt to avoid multi-use ops if we don't need anything from it.
44852 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44853 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44854 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44855 return false;
44856 }
44857 case X86ISD::TESTP: {
44858 SDValue Op0 = Op.getOperand(0);
44859 SDValue Op1 = Op.getOperand(1);
44860 MVT OpVT = Op0.getSimpleValueType();
44861 assert((OpVT.getVectorElementType() == MVT::f32 ||
44862 OpVT.getVectorElementType() == MVT::f64) &&
44863 "Illegal vector type for X86ISD::TESTP");
44864
44865 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44866 KnownBits KnownSrc;
44867 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44868 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44869 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44870 AssumeSingleUse) ||
44871 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44872 AssumeSingleUse);
44873 }
44874 case X86ISD::CMOV: {
44875 KnownBits Known2;
44876 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44877 OriginalDemandedElts, Known2, TLO, Depth + 1))
44878 return true;
44879 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44880 OriginalDemandedElts, Known, TLO, Depth + 1))
44881 return true;
44882
44883 // Only known if known in both the LHS and RHS.
44884 Known = Known.intersectWith(Known2);
44885 return false;
44886 }
44887 case X86ISD::BEXTR:
44888 case X86ISD::BEXTRI: {
44889 SDValue Op0 = Op.getOperand(0);
44890 SDValue Op1 = Op.getOperand(1);
44891
44892 // Only bottom 16-bits of the control bits are required.
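// BEXTR control layout: bits[7:0] = start bit index, bits[15:8] = extract
// length; the remaining control bits are ignored by the instruction.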
44893 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44894 // NOTE: SimplifyDemandedBits won't do this for constants.
44895 uint64_t Val1 = Cst1->getZExtValue();
44896 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44897 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44898 SDLoc DL(Op);
44899 return TLO.CombineTo(
44900 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44901 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44902 }
44903
44904 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44905 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44906
44907 // If the length is 0, the result is 0.
44908 if (Length == 0) {
44909 Known.setAllZero();
44910 return false;
44911 }
44912
44913 if ((Shift + Length) <= BitWidth) {
44914 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44915 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44916 return true;
44917
44918 Known = Known.extractBits(Length, Shift);
44919 Known = Known.zextOrTrunc(BitWidth);
44920 return false;
44921 }
44922 } else {
44923 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44924 KnownBits Known1;
44925 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44926 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44927 return true;
44928
44929 // If the length is 0, replace with 0.
44930 KnownBits LengthBits = Known1.extractBits(8, 8);
44931 if (LengthBits.isZero())
44932 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44933 }
44934
44935 break;
44936 }
44937 case X86ISD::PDEP: {
44938 SDValue Op0 = Op.getOperand(0);
44939 SDValue Op1 = Op.getOperand(1);
44940
44941 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44942 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44943
44944 // If the demanded bits have leading zeroes, we don't demand those from the
44945 // mask.
44946 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44947 return true;
44948
44949 // The number of possible 1s in the mask determines the number of LSBs of
44950 // operand 0 used. Undemanded bits from the mask don't matter so filter
44951 // them before counting.
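// e.g. (editorial) with a mask known to be 0b0101, only the low 2 bits of
// operand 0 can be deposited, so only those bits are demanded.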
44952 KnownBits Known2;
44953 uint64_t Count = (~Known.Zero & LoMask).popcount();
44954 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44955 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44956 return true;
44957
44958 // Zeroes are retained from the mask, but not ones.
44959 Known.One.clearAllBits();
44960 // The result will have at least as many trailing zeros as the non-mask
44961 // operand since bits can only map to the same or higher bit position.
44962 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44963 return false;
44964 }
44965 case X86ISD::VPMADD52L:
44966 case X86ISD::VPMADD52H: {
44967 KnownBits KnownOp0, KnownOp1;
44968 SDValue Op0 = Op.getOperand(0);
44969 SDValue Op1 = Op.getOperand(1);
44970 SDValue Op2 = Op.getOperand(2);
44971 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
44972 // operand 2).
44973 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
44974 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
44975 TLO, Depth + 1))
44976 return true;
44977
44978 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
44979 TLO, Depth + 1))
44980 return true;
44981
44982 KnownBits KnownMul;
44983 KnownOp0 = KnownOp0.trunc(52);
44984 KnownOp1 = KnownOp1.trunc(52);
44985 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
44986 : KnownBits::mulhu(KnownOp0, KnownOp1);
44987 KnownMul = KnownMul.zext(64);
44988
44989 // lo/hi(X * Y) + Z --> C + Z
44990 if (KnownMul.isConstant()) {
44991 SDLoc DL(Op);
44992 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
44993 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
44994 }
44995
44996 // TODO: Compute the known bits for VPMADD52L/VPMADD52H.
44997 break;
44998 }
44999 }
45000
45001 return TargetLowering::SimplifyDemandedBitsForTargetNode(
45002 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45003}
45004
45005 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45006 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45007 SelectionDAG &DAG, unsigned Depth) const {
45008 int NumElts = DemandedElts.getBitWidth();
45009 unsigned Opc = Op.getOpcode();
45010 EVT VT = Op.getValueType();
45011
45012 switch (Opc) {
45013 case X86ISD::PINSRB:
45014 case X86ISD::PINSRW: {
45015 // If we don't demand the inserted element, return the base vector.
45016 SDValue Vec = Op.getOperand(0);
45017 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45018 MVT VecVT = Vec.getSimpleValueType();
45019 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45020 !DemandedElts[CIdx->getZExtValue()])
45021 return Vec;
45022 break;
45023 }
45024 case X86ISD::VSHLI: {
45025 // If we are only demanding sign bits then we can use the shift source
45026 // directly.
45027 SDValue Op0 = Op.getOperand(0);
45028 unsigned ShAmt = Op.getConstantOperandVal(1);
45029 unsigned BitWidth = DemandedBits.getBitWidth();
45030 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45031 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45032 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45033 return Op0;
45034 break;
45035 }
45036 case X86ISD::VSRAI:
45037 // iff we only need the sign bit then we can use the source directly.
45038 // TODO: generalize where we only demand extended signbits.
45039 if (DemandedBits.isSignMask())
45040 return Op.getOperand(0);
45041 break;
45042 case X86ISD::PCMPGT:
45043 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45044 // iff we only need the sign bit then we can use R directly.
45045 if (DemandedBits.isSignMask() &&
45046 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45047 return Op.getOperand(1);
45048 break;
45049 case X86ISD::BLENDV: {
45050 // BLENDV: Cond (MSB) ? LHS : RHS
45051 SDValue Cond = Op.getOperand(0);
45052 SDValue LHS = Op.getOperand(1);
45053 SDValue RHS = Op.getOperand(2);
45054
45055 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45056 if (CondKnown.isNegative())
45057 return LHS;
45058 if (CondKnown.isNonNegative())
45059 return RHS;
45060 break;
45061 }
45062 case X86ISD::ANDNP: {
45063 // ANDNP = (~LHS & RHS);
45064 SDValue LHS = Op.getOperand(0);
45065 SDValue RHS = Op.getOperand(1);
45066
45067 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45068 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45069
45070 // If every demanded bit is known 0 on either the LHS or the RHS, then
45071 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45072 // this context, so return RHS.
45073 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45074 return RHS;
45075 break;
45076 }
45077 }
45078
45079 APInt ShuffleUndef, ShuffleZero;
45080 SmallVector<int, 16> ShuffleMask;
45081 SmallVector<SDValue, 2> ShuffleOps;
45082 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45083 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45084 // If all the demanded elts are from one operand and are inline,
45085 // then we can use the operand directly.
45086 int NumOps = ShuffleOps.size();
45087 if (ShuffleMask.size() == (unsigned)NumElts &&
45088 llvm::all_of(ShuffleOps, [VT](SDValue V) {
45089 return VT.getSizeInBits() == V.getValueSizeInBits();
45090 })) {
45091
45092 if (DemandedElts.isSubsetOf(ShuffleUndef))
45093 return DAG.getUNDEF(VT);
45094 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45095 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45096
45097 // Bitmask that indicates which ops have only been accessed 'inline'.
45098 APInt IdentityOp = APInt::getAllOnes(NumOps);
45099 for (int i = 0; i != NumElts; ++i) {
45100 int M = ShuffleMask[i];
45101 if (!DemandedElts[i] || ShuffleUndef[i])
45102 continue;
45103 int OpIdx = M / NumElts;
45104 int EltIdx = M % NumElts;
45105 if (M < 0 || EltIdx != i) {
45106 IdentityOp.clearAllBits();
45107 break;
45108 }
45109 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45110 if (IdentityOp == 0)
45111 break;
45112 }
45113 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45114 "Multiple identity shuffles detected");
45115
45116 if (IdentityOp != 0)
45117 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45118 }
45119 }
45120
45121 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45122 Op, DemandedBits, DemandedElts, DAG, Depth);
45123}
45124
45125 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45126 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45127 bool PoisonOnly, unsigned Depth) const {
45128 unsigned NumElts = DemandedElts.getBitWidth();
45129
45130 switch (Op.getOpcode()) {
45132 case X86ISD::Wrapper:
45133 case X86ISD::WrapperRIP:
45134 return true;
45135 case X86ISD::BLENDI:
45136 case X86ISD::PSHUFD:
45137 case X86ISD::UNPCKL:
45138 case X86ISD::UNPCKH:
45139 case X86ISD::VPERMILPI:
45140 case X86ISD::VPERMV3: {
45141 SmallVector<int, 8> Mask;
45142 SmallVector<SDValue, 2> Ops;
45143 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45144 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45145 APInt::getZero(NumElts));
45146 for (auto M : enumerate(Mask)) {
45147 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45148 continue;
45149 if (M.value() == SM_SentinelUndef)
45150 return false;
45151 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45152 "Shuffle mask index out of range");
45153 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45154 }
45155 for (auto Op : enumerate(Ops))
45156 if (!DemandedSrcElts[Op.index()].isZero() &&
45157 !DAG.isGuaranteedNotToBeUndefOrPoison(
45158 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45159 return false;
45160 return true;
45161 }
45162 break;
45163 }
45164 }
45165 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45166 Op, DemandedElts, DAG, PoisonOnly, Depth);
45167}
45168
45169 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45170 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45171 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45172
45173 switch (Op.getOpcode()) {
45174 // SSE bit logic.
45175 case X86ISD::FAND:
45176 case X86ISD::FOR:
45177 case X86ISD::FXOR:
45178 case X86ISD::FANDN:
45179 case X86ISD::ANDNP:
45180 case X86ISD::VPTERNLOG:
45181 return false;
45182 // SSE vector insert/extracts use modulo indices.
45183 case X86ISD::PINSRB:
45184 case X86ISD::PINSRW:
45185 case X86ISD::PEXTRB:
45186 case X86ISD::PEXTRW:
45187 return false;
45188 // SSE vector multiplies are either inbounds or saturate.
45189 case X86ISD::VPMADDUBSW:
45190 case X86ISD::VPMADDWD:
45191 return false;
45192 // SSE vector shifts handle out of bounds shift amounts.
45193 case X86ISD::VSHLI:
45194 case X86ISD::VSRLI:
45195 case X86ISD::VSRAI:
45196 return false;
45197 // SSE blends.
45198 case X86ISD::BLENDI:
45199 case X86ISD::BLENDV:
45200 return false;
45201 // SSE target shuffles.
45202 case X86ISD::PSHUFD:
45203 case X86ISD::UNPCKL:
45204 case X86ISD::UNPCKH:
45205 case X86ISD::VPERMILPI:
45206 case X86ISD::VPERMV3:
45207 return false;
45208 // SSE comparisons handle all icmp/fcmp cases.
45209 // TODO: Add CMPM/MM with test coverage.
45210 case X86ISD::CMPP:
45211 case X86ISD::PCMPEQ:
45212 case X86ISD::PCMPGT:
45213 return false;
45214 // SSE signbit extraction.
45215 case X86ISD::MOVMSK:
45216 return false;
45217 // GFNI instructions.
45218 case X86ISD::GF2P8AFFINEINVQB:
45219 case X86ISD::GF2P8AFFINEQB:
45220 case X86ISD::GF2P8MULB:
45221 return false;
45222 case ISD::INTRINSIC_WO_CHAIN:
45223 switch (Op->getConstantOperandVal(0)) {
45224 case Intrinsic::x86_sse2_pmadd_wd:
45225 case Intrinsic::x86_avx2_pmadd_wd:
45226 case Intrinsic::x86_avx512_pmaddw_d_512:
45227 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45228 case Intrinsic::x86_avx2_pmadd_ub_sw:
45229 case Intrinsic::x86_avx512_pmaddubs_w_512:
45230 return false;
45231 case Intrinsic::x86_avx512_vpermi2var_d_128:
45232 case Intrinsic::x86_avx512_vpermi2var_d_256:
45233 case Intrinsic::x86_avx512_vpermi2var_d_512:
45234 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45235 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45236 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45237 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45238 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45239 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45240 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45241 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45242 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45243 case Intrinsic::x86_avx512_vpermi2var_q_128:
45244 case Intrinsic::x86_avx512_vpermi2var_q_256:
45245 case Intrinsic::x86_avx512_vpermi2var_q_512:
45246 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45247 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45248 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45249 return false;
45250 }
45251 }
45252 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45253 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45254}
45255
45256 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45257 const APInt &DemandedElts,
45258 APInt &UndefElts,
45259 const SelectionDAG &DAG,
45260 unsigned Depth) const {
45261 unsigned NumElts = DemandedElts.getBitWidth();
45262 unsigned Opc = Op.getOpcode();
45263
45264 switch (Opc) {
45265 case X86ISD::VBROADCAST:
45266 case X86ISD::VBROADCAST_LOAD:
45267 UndefElts = APInt::getZero(NumElts);
45268 return true;
45269 }
45270
45271 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45272 DAG, Depth);
45273}
45274
45275// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45276// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
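// e.g. (editorial) a v8i1 setcc result whose compared operands are v8i32
// reports a source size of 256 bits.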
45277static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45278 bool AllowTruncate, unsigned Depth) {
45279 // Limit recursion.
45280 if (Depth >= SelectionDAG::MaxRecursionDepth)
45281 return false;
45282 switch (Src.getOpcode()) {
45283 case ISD::TRUNCATE:
45284 if (!AllowTruncate)
45285 return false;
45286 [[fallthrough]];
45287 case ISD::SETCC:
45288 return Src.getOperand(0).getValueSizeInBits() == Size;
45289 case ISD::FREEZE:
45290 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45291 Depth + 1);
45292 case ISD::AND:
45293 case ISD::XOR:
45294 case ISD::OR:
45295 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45296 Depth + 1) &&
45297 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45298 Depth + 1);
45299 case ISD::SELECT:
45300 case ISD::VSELECT:
45301 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45302 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45303 Depth + 1) &&
45304 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45305 Depth + 1);
45306 case ISD::BUILD_VECTOR:
45307 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45308 ISD::isBuildVectorAllOnes(Src.getNode());
45309 }
45310 return false;
45311}
45312
45313// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45314static unsigned getAltBitOpcode(unsigned Opcode) {
45315 switch(Opcode) {
45316 // clang-format off
45317 case ISD::AND: return X86ISD::FAND;
45318 case ISD::OR: return X86ISD::FOR;
45319 case ISD::XOR: return X86ISD::FXOR;
45320 case X86ISD::ANDNP: return X86ISD::FANDN;
45321 // clang-format on
45322 }
45323 llvm_unreachable("Unknown bitwise opcode");
45324}
45325
45326// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
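// Illustrative example (not part of the original source): with SSE1 but no SSE2,
//   (v4i1 setcc (v4i32 load p), zero, setlt)
// is rebuilt as (v4f32 load p) so the sign bits can be read directly with
// MOVMSKPS, avoiding an illegal v4i32 integer compare.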
45327 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45328 const SDLoc &DL) {
45329 EVT SrcVT = Src.getValueType();
45330 if (SrcVT != MVT::v4i1)
45331 return SDValue();
45332
45333 switch (Src.getOpcode()) {
45334 case ISD::SETCC:
45335 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45336 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45337 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45338 SDValue Op0 = Src.getOperand(0);
45339 if (ISD::isNormalLoad(Op0.getNode()))
45340 return DAG.getBitcast(MVT::v4f32, Op0);
45341 if (Op0.getOpcode() == ISD::BITCAST &&
45342 Op0.getOperand(0).getValueType() == MVT::v4f32)
45343 return Op0.getOperand(0);
45344 }
45345 break;
45346 case ISD::AND:
45347 case ISD::XOR:
45348 case ISD::OR: {
45349 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45350 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45351 if (Op0 && Op1)
45352 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45353 Op1);
45354 break;
45355 }
45356 }
45357 return SDValue();
45358}
45359
45360// Helper to push sign extension of vXi1 SETCC result through bitops.
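// Illustrative example (not part of the original source):
//   (v8i1 and (setcc A), (setcc B)) sign-extended to v8i32 becomes
//   (v8i32 and (sext (setcc A)), (sext (setcc B)))
// so each compare can later be matched at its full 256-bit width.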
45361 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45362 SDValue Src, const SDLoc &DL) {
45363 switch (Src.getOpcode()) {
45364 case ISD::SETCC:
45365 case ISD::FREEZE:
45366 case ISD::TRUNCATE:
45367 case ISD::BUILD_VECTOR:
45368 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45369 case ISD::AND:
45370 case ISD::XOR:
45371 case ISD::OR:
45372 return DAG.getNode(
45373 Src.getOpcode(), DL, SExtVT,
45374 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45375 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45376 case ISD::SELECT:
45377 case ISD::VSELECT:
45378 return DAG.getSelect(
45379 DL, SExtVT, Src.getOperand(0),
45380 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45381 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45382 }
45383 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45384}
45385
45386// Try to match patterns such as
45387// (i16 bitcast (v16i1 x))
45388// ->
45389// (i16 movmsk (16i8 sext (v16i1 x)))
45390// before the illegal vector is scalarized on subtargets that don't have legal
45391// vxi1 types.
45392 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45393 const SDLoc &DL,
45394 const X86Subtarget &Subtarget) {
45395 EVT SrcVT = Src.getValueType();
45396 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45397 return SDValue();
45398
45399 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45400 // legalization destroys the v4i32 type.
45401 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45402 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45403 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45404 DAG.getBitcast(MVT::v4f32, V));
45405 return DAG.getZExtOrTrunc(V, DL, VT);
45406 }
45407 }
45408
45409 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45410 // movmskb even with avx512. This will be better than truncating to vXi1 and
45411 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45412 // vpcmpeqb/vpcmpgtb.
45413 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45414 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45415 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45416 Src.getOperand(0).getValueType() == MVT::v64i8);
45417
45418 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45419 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45420 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45421 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45422 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45423 EVT CmpVT = Src.getOperand(0).getValueType();
45424 EVT EltVT = CmpVT.getVectorElementType();
45425 if (CmpVT.getSizeInBits() <= 256 &&
45426 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45427 PreferMovMsk = true;
45428 }
45429
45430 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45431 // MOVMSK is supported in SSE2 or later.
45432 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45433 return SDValue();
45434
45435 // If the upper ops of a concatenation are undef, then try to bitcast the
45436 // lower op and extend.
45437 SmallVector<SDValue, 4> SubSrcOps;
45438 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45439 SubSrcOps.size() >= 2) {
45440 SDValue LowerOp = SubSrcOps[0];
45441 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45442 if (LowerOp.getOpcode() == ISD::SETCC &&
45443 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45444 EVT SubVT = VT.getIntegerVT(
45445 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45446 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45447 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45448 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45449 }
45450 }
45451 }
45452
45453 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
45454 // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
45455 // v8i16 and v16i16.
45456 // For these two cases, we can shuffle the upper element bytes to a
45457 // consecutive sequence at the start of the vector and treat the results as
45458 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45459 // for v16i16 this is not the case, because the shuffle is expensive, so we
45460 // avoid sign-extending to this type entirely.
45461 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45462 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45463 MVT SExtVT;
45464 bool PropagateSExt = false;
45465 switch (SrcVT.getSimpleVT().SimpleTy) {
45466 default:
45467 return SDValue();
45468 case MVT::v2i1:
45469 SExtVT = MVT::v2i64;
45470 break;
45471 case MVT::v4i1:
45472 SExtVT = MVT::v4i32;
45473 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45474 // sign-extend to a 256-bit operation to avoid truncation.
45475 if (Subtarget.hasAVX() &&
45476 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45477 SExtVT = MVT::v4i64;
45478 PropagateSExt = true;
45479 }
45480 break;
45481 case MVT::v8i1:
45482 SExtVT = MVT::v8i16;
45483 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45484 // sign-extend to a 256-bit operation to match the compare.
45485 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45486 // 256-bit because the shuffle is cheaper than sign extending the result of
45487 // the compare.
45488 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45489 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45490 SExtVT = MVT::v8i32;
45491 PropagateSExt = true;
45492 }
45493 break;
45494 case MVT::v16i1:
45495 SExtVT = MVT::v16i8;
45496 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45497 // it is not profitable to sign-extend to 256-bit because this will
45498 // require an extra cross-lane shuffle which is more expensive than
45499 // truncating the result of the compare to 128-bits.
45500 break;
45501 case MVT::v32i1:
45502 SExtVT = MVT::v32i8;
45503 break;
45504 case MVT::v64i1:
45505 // If we have AVX512F but not AVX512BW, and the input is a truncate from
45506 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
45507 if (Subtarget.hasAVX512()) {
45508 if (Subtarget.hasBWI())
45509 return SDValue();
45510 SExtVT = MVT::v64i8;
45511 break;
45512 }
45513 // Split if this is a <64 x i8> comparison result.
45514 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45515 SExtVT = MVT::v64i8;
45516 break;
45517 }
45518 return SDValue();
45519 };
45520
45521 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45522 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45523
45524 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45525 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45526 } else {
45527 if (SExtVT == MVT::v8i16) {
45528 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45529 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45530 }
45531 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45532 }
45533
45534 EVT IntVT =
45535 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45536 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45537 return DAG.getBitcast(VT, V);
45538}
45539
45540// Convert a vXi1 constant build vector to the same width scalar integer.
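// Illustrative example (not part of the original source):
//   (v4i1 build_vector 1, 0, undef, 1) -> (i4 0b1001)
// i.e. undef lanes are treated as 0 and each set lane contributes bit (1 << Idx).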
45541 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45542 EVT SrcVT = Op.getValueType();
45543 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45544 "Expected a vXi1 vector");
45546 "Expected a constant build vector");
45547
45548 APInt Imm(SrcVT.getVectorNumElements(), 0);
45549 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45550 SDValue In = Op.getOperand(Idx);
45551 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45552 Imm.setBit(Idx);
45553 }
45554 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45555 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45556}
45557
45558 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45559 TargetLowering::DAGCombinerInfo &DCI,
45560 const X86Subtarget &Subtarget) {
45561 using namespace SDPatternMatch;
45562 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45563
45564 if (!DCI.isBeforeLegalizeOps())
45565 return SDValue();
45566
45567 // Only do this if we have k-registers.
45568 if (!Subtarget.hasAVX512())
45569 return SDValue();
45570
45571 EVT DstVT = N->getValueType(0);
45572 SDValue Op = N->getOperand(0);
45573 EVT SrcVT = Op.getValueType();
45574
45575 // Make sure we have a bitcast between mask registers and a scalar type.
45576 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45577 DstVT.isScalarInteger()) &&
45578 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45579 SrcVT.isScalarInteger()))
45580 return SDValue();
45581
45582 SDValue LHS, RHS;
45583
45584 // Look for logic ops.
45585 if (!sd_match(Op, m_OneUse(m_BitwiseLogic(m_Value(LHS), m_Value(RHS)))))
45586 return SDValue();
45587
45588 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45589 // least one of the getBitcast() will fold away).
45590 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45591 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45592 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45593 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45594
45595 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45596 // Most of these have to move a constant from the scalar domain anyway.
45597 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45598 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45599 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45600 DAG.getBitcast(DstVT, LHS), RHS);
45601 }
45602
45603 return SDValue();
45604}
45605
45606 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45607 const X86Subtarget &Subtarget) {
45608 SDLoc DL(BV);
45609 unsigned NumElts = BV->getNumOperands();
45610 SDValue Splat = BV->getSplatValue();
45611
45612 // Build MMX element from integer GPR or SSE float values.
45613 auto CreateMMXElement = [&](SDValue V) {
45614 if (V.isUndef())
45615 return DAG.getUNDEF(MVT::x86mmx);
45616 if (V.getValueType().isFloatingPoint()) {
45617 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45618 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45619 V = DAG.getBitcast(MVT::v2i64, V);
45620 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45621 }
45622 V = DAG.getBitcast(MVT::i32, V);
45623 } else {
45624 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45625 }
45626 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45627 };
45628
45629 // Convert build vector ops to MMX data in the bottom elements.
45630 SmallVector<SDValue, 8> Ops;
45631
45632 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45633
45634 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45635 if (Splat) {
45636 if (Splat.isUndef())
45637 return DAG.getUNDEF(MVT::x86mmx);
45638
45639 Splat = CreateMMXElement(Splat);
45640
45641 if (Subtarget.hasSSE1()) {
45642 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45643 if (NumElts == 8)
45644 Splat = DAG.getNode(
45645 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45646 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45647 TLI.getPointerTy(DAG.getDataLayout())),
45648 Splat, Splat);
45649
45650 // Use PSHUFW to repeat 16-bit elements.
45651 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45652 return DAG.getNode(
45653 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45654 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45655 TLI.getPointerTy(DAG.getDataLayout())),
45656 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45657 }
45658 Ops.append(NumElts, Splat);
45659 } else {
45660 for (unsigned i = 0; i != NumElts; ++i)
45661 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45662 }
45663
45664 // Use tree of PUNPCKLs to build up general MMX vector.
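// Illustrative example (not part of the original source): a non-splat v8i8 build
// vector is reduced 8 -> 4 -> 2 -> 1 elements using punpcklbw, then punpcklwd,
// then punpckldq on the single-element MMX values created above.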
45665 while (Ops.size() > 1) {
45666 unsigned NumOps = Ops.size();
45667 unsigned IntrinOp =
45668 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45669 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45670 : Intrinsic::x86_mmx_punpcklbw));
45671 SDValue Intrin = DAG.getTargetConstant(
45672 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45673 for (unsigned i = 0; i != NumOps; i += 2)
45674 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45675 Ops[i], Ops[i + 1]);
45676 Ops.resize(NumOps / 2);
45677 }
45678
45679 return Ops[0];
45680}
45681
45682 // Recursive function that attempts to determine whether a scalar integer was
45683 // originally a bool vector (or float/double) that got truncated/extended/bitcast
45684 // to a scalar integer. If so, replace the scalar ops with bool vector equivalents back down
45685// the chain.
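// Illustrative example (not part of the original source):
//   (v16i1 bitcast (i16 shl (i16 zext (i8 bitcast (v8i1 X))), 8))
// can be rebuilt as
//   (v16i1 KSHIFTL (insert_subvector zero, X, 0), 8)
// keeping the whole chain in k-registers instead of bouncing through a GPR.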
45686 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45687 SelectionDAG &DAG,
45688 const X86Subtarget &Subtarget,
45689 unsigned Depth = 0) {
45690 if (Depth >= SelectionDAG::MaxRecursionDepth)
45691 return SDValue(); // Limit search depth.
45692
45693 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45694 unsigned Opc = V.getOpcode();
45695 switch (Opc) {
45696 case ISD::BITCAST: {
45697 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45698 SDValue Src = V.getOperand(0);
45699 EVT SrcVT = Src.getValueType();
45700 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45701 return DAG.getBitcast(VT, Src);
45702 break;
45703 }
45704 case ISD::Constant: {
45705 auto *C = cast<ConstantSDNode>(V);
45706 if (C->isZero())
45707 return DAG.getConstant(0, DL, VT);
45708 if (C->isAllOnes())
45709 return DAG.getAllOnesConstant(DL, VT);
45710 break;
45711 }
45712 case ISD::TRUNCATE: {
45713 // If we find a suitable source, a truncated scalar becomes a subvector.
45714 SDValue Src = V.getOperand(0);
45715 EVT NewSrcVT =
45716 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45717 if (TLI.isTypeLegal(NewSrcVT))
45718 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45719 Subtarget, Depth + 1))
45720 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45721 DAG.getVectorIdxConstant(0, DL));
45722 break;
45723 }
45724 case ISD::ANY_EXTEND:
45725 case ISD::ZERO_EXTEND: {
45726 // If we find a suitable source, an extended scalar becomes a subvector.
45727 SDValue Src = V.getOperand(0);
45728 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45729 Src.getScalarValueSizeInBits());
45730 if (TLI.isTypeLegal(NewSrcVT))
45731 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45732 Subtarget, Depth + 1))
45733 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45734 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45735 : DAG.getConstant(0, DL, VT),
45736 N0, DAG.getVectorIdxConstant(0, DL));
45737 break;
45738 }
45739 case ISD::OR:
45740 case ISD::XOR: {
45741 // If we find suitable sources, we can just move the op to the vector
45742 // domain.
45743 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45744 Subtarget, Depth + 1))
45745 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45746 Subtarget, Depth + 1))
45747 return DAG.getNode(Opc, DL, VT, N0, N1);
45748 break;
45749 }
45750 case ISD::SHL: {
45751 // If we find a suitable source, a SHL becomes a KSHIFTL.
45752 SDValue Src0 = V.getOperand(0);
45753 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45754 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45755 break;
45756
45757 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45758 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45759 Depth + 1))
45760 return DAG.getNode(
45761 X86ISD::KSHIFTL, DL, VT, N0,
45762 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45763 break;
45764 }
45765 }
45766
45767 // Does the inner bitcast already exist?
45768 if (Depth > 0)
45769 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45770 return SDValue(Alt, 0);
45771
45772 return SDValue();
45773}
45774
45775 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45776 TargetLowering::DAGCombinerInfo &DCI,
45777 const X86Subtarget &Subtarget) {
45778 SDValue N0 = N->getOperand(0);
45779 EVT VT = N->getValueType(0);
45780 EVT SrcVT = N0.getValueType();
45781 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45782
45783 // Try to match patterns such as
45784 // (i16 bitcast (v16i1 x))
45785 // ->
45786 // (i16 movmsk (16i8 sext (v16i1 x)))
45787 // before the setcc result is scalarized on subtargets that don't have legal
45788 // vxi1 types.
45789 if (DCI.isBeforeLegalize()) {
45790 SDLoc dl(N);
45791 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45792 return V;
45793
45794 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45795 // type, widen both sides to avoid a trip through memory.
45796 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45797 Subtarget.hasAVX512()) {
45798 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45799 N0 = DAG.getBitcast(MVT::v8i1, N0);
45800 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45801 DAG.getVectorIdxConstant(0, dl));
45802 }
45803
45804 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45805 // type, widen both sides to avoid a trip through memory.
45806 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45807 Subtarget.hasAVX512()) {
45808 // Use zeros for the widening if we already have some zeroes. This can
45809 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45810 // stream of this.
45811 // FIXME: It might make sense to detect a concat_vectors with a mix of
45812 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45813 // a separate combine. What we can't do is canonicalize the operands of
45814 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45815 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45816 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45817 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45818 SrcVT = LastOp.getValueType();
45819 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45820 SmallVector<SDValue, 4> Ops(N0->ops());
45821 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45822 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45823 N0 = DAG.getBitcast(MVT::i8, N0);
45824 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45825 }
45826 }
45827
45828 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45829 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45830 Ops[0] = N0;
45831 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45832 N0 = DAG.getBitcast(MVT::i8, N0);
45833 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45834 }
45835 } else if (DCI.isAfterLegalizeDAG()) {
45836 // If we're bitcasting from iX to vXi1, see if the integer originally
45837 // began as a vXi1 and whether we can remove the bitcast entirely.
45838 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45839 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45840 if (SDValue V =
45841 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45842 return V;
45843 }
45844 }
45845
45846 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45847 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45848 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45849 // we can help with known bits propagation from the vXi1 domain to the
45850 // scalar domain.
45851 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45852 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45853 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45854 isNullConstant(N0.getOperand(1)))
45855 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45856 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45857
45858 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45859 // and the vbroadcast_load are both integer or both fp. In some cases this
45860 // will remove the bitcast entirely.
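// Illustrative example (not part of the original source): assuming a 32-bit memory
// element, (v4f32 bitcast (v4i32 vbroadcast_load p)) is re-created as
// (v4f32 vbroadcast_load p) so the bitcast folds away.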
45861 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45862 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45863 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45864 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45865 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45866 // Don't swap i8/i16 since we don't have fp types of that size.
45867 if (MemSize >= 32) {
45868 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45869 : MVT::getIntegerVT(MemSize);
45870 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45871 : MVT::getIntegerVT(SrcVTSize);
45872 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45873
45874 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45875 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45876 SDValue ResNode =
45877 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45878 MemVT, BCast->getMemOperand());
45879 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45880 return DAG.getBitcast(VT, ResNode);
45881 }
45882 }
45883
45884 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45885 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45886 SDValue Src = peekThroughTruncates(N0);
45887 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45888 Src.getOperand(0).getValueSizeInBits() == 128 &&
45889 isNullConstant(Src.getOperand(1))) {
45890 SDLoc DL(N);
45891 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45892 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45893 DAG.getVectorIdxConstant(0, DL));
45894 }
45895 }
45896
45897 // Since MMX types are special and don't usually play with other vector types,
45898 // it's better to handle them early to be sure we emit efficient code by
45899 // avoiding store-load conversions.
45900 if (VT == MVT::x86mmx) {
45901 // Detect MMX constant vectors.
45902 APInt UndefElts;
45903 SmallVector<APInt, 1> EltBits;
45904 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45905 /*AllowWholeUndefs*/ true,
45906 /*AllowPartialUndefs*/ true)) {
45907 SDLoc DL(N0);
45908 // Handle zero-extension of i32 with MOVD.
45909 if (EltBits[0].countl_zero() >= 32)
45910 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45911 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45912 // Else, bitcast to a double.
45913 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45914 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45915 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45916 }
45917
45918 // Detect bitcasts to x86mmx low word.
45919 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45920 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45921 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45922 bool LowUndef = true, AllUndefOrZero = true;
45923 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45924 SDValue Op = N0.getOperand(i);
45925 LowUndef &= Op.isUndef() || (i >= e/2);
45926 AllUndefOrZero &= isNullConstantOrUndef(Op);
45927 }
45928 if (AllUndefOrZero) {
45929 SDValue N00 = N0.getOperand(0);
45930 SDLoc dl(N00);
45931 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45932 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45933 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45934 }
45935 }
45936
45937 // Detect bitcasts of 64-bit build vectors and convert to a
45938 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45939 // lowest element.
45940 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45941 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45942 SrcVT == MVT::v8i8))
45943 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45944
45945 // Detect bitcasts between element or subvector extraction to x86mmx.
45946 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45947 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45948 isNullConstant(N0.getOperand(1))) {
45949 SDValue N00 = N0.getOperand(0);
45950 if (N00.getValueType().is128BitVector())
45951 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45952 DAG.getBitcast(MVT::v2i64, N00));
45953 }
45954
45955 // Detect bitcasts from FP_TO_SINT to x86mmx.
45956 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
45957 SDLoc DL(N0);
45958 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
45959 DAG.getUNDEF(MVT::v2i32));
45960 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
45961 DAG.getBitcast(MVT::v2i64, Res));
45962 }
45963 }
45964
45965 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
45966 // most of these to scalar anyway.
45967 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
45968 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45969 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
45970 return combinevXi1ConstantToInteger(N0, DAG);
45971 }
45972
45973 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
45974 VT.getVectorElementType() == MVT::i1) {
45975 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
45976 if (C->isAllOnes())
45977 return DAG.getConstant(1, SDLoc(N0), VT);
45978 if (C->isZero())
45979 return DAG.getConstant(0, SDLoc(N0), VT);
45980 }
45981 }
45982
45983 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
45984 // Turn it into a sign bit compare that produces a k-register. This avoids
45985 // a trip through a GPR.
45986 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
45987 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
45988 isPowerOf2_32(VT.getVectorNumElements())) {
45989 unsigned NumElts = VT.getVectorNumElements();
45990 SDValue Src = N0;
45991
45992 // Peek through truncate.
45993 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
45994 Src = N0.getOperand(0);
45995
45996 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
45997 SDValue MovmskIn = Src.getOperand(0);
45998 MVT MovmskVT = MovmskIn.getSimpleValueType();
45999 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46000
46001 // We allow extra bits of the movmsk to be used since they are known zero.
46002 // We can't convert a VPMOVMSKB without avx512bw.
46003 if (MovMskElts <= NumElts &&
46004 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46005 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46006 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46007 SDLoc dl(N);
46008 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46009 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46010 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46011 if (EVT(CmpVT) == VT)
46012 return Cmp;
46013
46014 // Pad with zeroes up to original VT to replace the zeroes that were
46015 // being used from the MOVMSK.
46016 unsigned NumConcats = NumElts / MovMskElts;
46017 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46018 Ops[0] = Cmp;
46019 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46020 }
46021 }
46022 }
46023
46024 // Try to remove bitcasts from input and output of mask arithmetic to
46025 // remove GPR<->K-register crossings.
46026 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46027 return V;
46028
46029 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46030 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46031 SrcVT.getVectorNumElements() == 1)
46032 return N0.getOperand(1);
46033
46034 // Convert a bitcasted integer logic operation that has one bitcasted
46035 // floating-point operand into a floating-point logic operation. This may
46036 // create a load of a constant, but that is cheaper than materializing the
46037 // constant in an integer register and transferring it to an SSE register or
46038 // transferring the SSE operand to integer register and back.
46039 unsigned FPOpcode;
46040 switch (N0.getOpcode()) {
46041 // clang-format off
46042 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46043 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46044 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46045 default: return SDValue();
46046 // clang-format on
46047 }
46048
46049 // Check if we have a bitcast from another integer type as well.
46050 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46051 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46052 (Subtarget.hasFP16() && VT == MVT::f16) ||
46053 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46054 TLI.isTypeLegal(VT))))
46055 return SDValue();
46056
46057 SDValue LogicOp0 = N0.getOperand(0);
46058 SDValue LogicOp1 = N0.getOperand(1);
46059 SDLoc DL0(N0);
46060
46061 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46062 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46063 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46064 LogicOp0.getOperand(0).getValueType() == VT &&
46065 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46066 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46067 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46068 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46069 }
46070 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46071 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46072 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46073 LogicOp1.getOperand(0).getValueType() == VT &&
46074 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46075 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46076 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46077 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46078 }
46079
46080 return SDValue();
46081}
46082
46083 // (mul (zext a), (sext b))
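// Illustrative example (not part of the original source): detectExtMul accepts
//   (mul (zext (v16i8 A)), (sext (v16i8 B)))
// in either operand order and returns the unsigned/signed narrow operands so the
// reduction can be rebuilt with VPDPBUSD.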
46084static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46085 SDValue &Op1) {
46086 Op0 = Mul.getOperand(0);
46087 Op1 = Mul.getOperand(1);
46088
46089 // Canonicalize so that Op1 holds the sign-extended operand.
46090 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46091 std::swap(Op0, Op1);
46092
46093 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46094 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46095 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46096 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46097 return true;
46098
46099 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46100 return (BV && BV->isConstant());
46101 };
46102
46103 // (dpbusd (zext a), (sext b)). The first operand must be an unsigned value, so
46104 // we check that Op0 is a zero-extended value. Op1 must be a signed value, so we
46105 // just check its significant sign bits.
46106 if ((IsFreeTruncation(Op0) &&
46107 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46108 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46109 return true;
46110
46111 return false;
46112}
46113
46114 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46115 unsigned &LogBias, const SDLoc &DL,
46116 const X86Subtarget &Subtarget) {
46117 // Extend or truncate to MVT::i8 first.
46118 MVT Vi8VT =
46119 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46120 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46121 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46122
46123 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46124 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46125 // The src A, B element type is i8, but the dst C element type is i32.
46126 // When we calculate the reduction stages we use the vXi8 source vector type,
46127 // so we need a log-bias of 2 to avoid 2 extra stages.
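// Illustrative example (not part of the original source): a v16i8 reduction needs
// log2(16) = 4 add stages, but VPDPBUSD already sums groups of 4 products into each
// i32 lane, covering log2(4) = 2 of those stages, hence the LogBias of 2 below.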
46128 LogBias = 2;
46129
46130 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46131 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46132 RegSize = std::max(512u, RegSize);
46133
46134 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46135 // fill in the missing vector elements with 0.
46136 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46137 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46138 Ops[0] = LHS;
46139 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46140 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46141 Ops[0] = RHS;
46142 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46143
46144 // Actually build the DotProduct, split as 256/512 bits for
46145 // AVXVNNI/AVX512VNNI.
46146 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46147 ArrayRef<SDValue> Ops) {
46148 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46149 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46150 };
46151 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46152 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46153
46154 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46155 DpBuilder, false);
46156}
46157
46158// Create a PSADBW given two sources representable as zexts of vXi8.
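// Illustrative example (not part of the original source): PSADBW on two v16i8 inputs
// yields a v2i64 result where each i64 lane holds the sum of absolute differences of
// the corresponding group of 8 byte pairs.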
46159 static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
46160 const SDLoc &DL, const X86Subtarget &Subtarget) {
46161 // Find the appropriate width for the PSADBW.
46162 EVT DstVT = N0.getValueType();
46163 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46164 DstVT.getVectorElementCount());
46165 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46166
46167 // Widen the vXi8 vectors, padding with zero vector elements.
46168 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46169 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46170 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46171 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46172 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46173 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46174 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46175
46176 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46177 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46178 ArrayRef<SDValue> Ops) {
46179 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46180 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46181 };
46182 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46183 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46184 PSADBWBuilder);
46185}
46186
46187 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46188// PHMINPOSUW.
46189 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46190 const X86Subtarget &Subtarget) {
46191 // Bail without SSE41.
46192 if (!Subtarget.hasSSE41())
46193 return SDValue();
46194
46195 EVT ExtractVT = Extract->getValueType(0);
46196 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46197 return SDValue();
46198
46199 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46200 ISD::NodeType BinOp;
46201 SDValue Src = DAG.matchBinOpReduction(
46202 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46203 if (!Src)
46204 return SDValue();
46205
46206 EVT SrcVT = Src.getValueType();
46207 EVT SrcSVT = SrcVT.getScalarType();
46208 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46209 return SDValue();
46210
46211 SDLoc DL(Extract);
46212 SDValue MinPos = Src;
46213
46214 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46215 while (SrcVT.getSizeInBits() > 128) {
46216 SDValue Lo, Hi;
46217 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46218 SrcVT = Lo.getValueType();
46219 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46220 }
46221 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46222 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46223 "Unexpected value type");
46224
46225 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46226 // to flip the value accordingly.
46227 SDValue Mask;
46228 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46229 if (BinOp == ISD::SMAX)
46230 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46231 else if (BinOp == ISD::SMIN)
46232 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46233 else if (BinOp == ISD::UMAX)
46234 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46235
46236 if (Mask)
46237 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46238
46239 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46240 // shuffling each upper element down and inserting zeros. This means that the
46241 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46242 // ready for the PHMINPOS.
46243 if (ExtractVT == MVT::i8) {
46244 SDValue Upper = DAG.getVectorShuffle(
46245 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46246 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46247 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46248 }
46249
46250 // Perform the PHMINPOS on a v8i16 vector,
46251 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46252 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46253 MinPos = DAG.getBitcast(SrcVT, MinPos);
46254
46255 if (Mask)
46256 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46257
46258 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46259 DAG.getVectorIdxConstant(0, DL));
46260}
46261
46262// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
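// Illustrative examples (not part of the original source):
//   all_of: (i1 and-reduce (v4i32 setcc ...)) -> (MOVMSKPS(...) == 0xf)
//   any_of: (i1 or-reduce (v16i8 setcc ...))  -> (PMOVMSKB(...) != 0)
//   parity: (i1 xor-reduce ...)               -> (PARITY (MOVMSK ...))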
46263 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46264 const X86Subtarget &Subtarget) {
46265 // Bail without SSE2.
46266 if (!Subtarget.hasSSE2())
46267 return SDValue();
46268
46269 EVT ExtractVT = Extract->getValueType(0);
46270 unsigned BitWidth = ExtractVT.getSizeInBits();
46271 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46272 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46273 return SDValue();
46274
46275 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46276 ISD::NodeType BinOp;
46277 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46278 if (!Match && ExtractVT == MVT::i1)
46279 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46280 if (!Match)
46281 return SDValue();
46282
46283 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46284 // which we can't support here for now.
46285 if (Match.getScalarValueSizeInBits() != BitWidth)
46286 return SDValue();
46287
46288 SDValue Movmsk;
46289 SDLoc DL(Extract);
46290 EVT MatchVT = Match.getValueType();
46291 unsigned NumElts = MatchVT.getVectorNumElements();
46292 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46293 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46294 LLVMContext &Ctx = *DAG.getContext();
46295
46296 if (ExtractVT == MVT::i1) {
46297 // Special case for (pre-legalization) vXi1 reductions.
46298 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46299 return SDValue();
46300 if (Match.getOpcode() == ISD::SETCC) {
46301 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46302 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46303 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46304 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46305 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46306 X86::CondCode X86CC;
46307 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46308 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46309 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46310 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46311 DAG, X86CC))
46312 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46313 getSETCC(X86CC, V, DL, DAG));
46314 }
46315 }
46316 if (TLI.isTypeLegal(MatchVT)) {
46317 // If this is a legal AVX512 predicate type then we can just bitcast.
46318 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46319 Movmsk = DAG.getBitcast(MovmskVT, Match);
46320 } else {
46321 // Use combineBitcastvxi1 to create the MOVMSK.
46322 while (NumElts > MaxElts) {
46323 SDValue Lo, Hi;
46324 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46325 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46326 NumElts /= 2;
46327 }
46328 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46329 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46330 }
46331 if (!Movmsk)
46332 return SDValue();
46333 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46334 } else {
46335 // FIXME: Better handling of k-registers or 512-bit vectors?
46336 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46337 if (!(MatchSizeInBits == 128 ||
46338 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46339 return SDValue();
46340
46341 // Make sure this isn't a vector of 1 element. The perf win from using
46342 // MOVMSK diminishes with fewer elements in the reduction, but it is
46343 // generally better to get the comparison over to the GPRs as soon as
46344 // possible to reduce the number of vector ops.
46345 if (Match.getValueType().getVectorNumElements() < 2)
46346 return SDValue();
46347
46348 // Check that we are extracting a reduction of all sign bits.
46349 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46350 return SDValue();
46351
46352 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46353 SDValue Lo, Hi;
46354 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46355 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46356 MatchSizeInBits = Match.getValueSizeInBits();
46357 }
46358
46359 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46360 MVT MaskSrcVT;
46361 if (64 == BitWidth || 32 == BitWidth)
46362 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46363 MatchSizeInBits / BitWidth);
46364 else
46365 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46366
46367 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46368 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46369 NumElts = MaskSrcVT.getVectorNumElements();
46370 }
46371 assert((NumElts <= 32 || NumElts == 64) &&
46372 "Not expecting more than 64 elements");
46373
46374 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46375 if (BinOp == ISD::XOR) {
46376 // parity -> (PARITY(MOVMSK X))
46377 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46378 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46379 }
46380
46381 SDValue CmpC;
46382 ISD::CondCode CondCode;
46383 if (BinOp == ISD::OR) {
46384 // any_of -> MOVMSK != 0
46385 CmpC = DAG.getConstant(0, DL, CmpVT);
46386 CondCode = ISD::CondCode::SETNE;
46387 } else {
46388 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46389 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46390 DL, CmpVT);
46391 CondCode = ISD::CondCode::SETEQ;
46392 }
46393
46394 // The setcc produces an i8 of 0/1, so extend that to the result width and
46395 // negate to get the final 0/-1 mask value.
46396 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46397 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46398 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46399 return DAG.getNegative(Zext, DL, ExtractVT);
46400}
46401
46402 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46403 const X86Subtarget &Subtarget) {
46404 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46405 return SDValue();
46406
46407 EVT ExtractVT = Extract->getValueType(0);
46408 // Verify the type we're extracting is i32, as the output element type of
46409 // vpdpbusd is i32.
46410 if (ExtractVT != MVT::i32)
46411 return SDValue();
46412
46413 EVT VT = Extract->getOperand(0).getValueType();
46414 if (!isPowerOf2_32(VT.getVectorNumElements()))
46415 return SDValue();
46416
46417 // Match shuffle + add pyramid.
46418 ISD::NodeType BinOp;
46419 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46420
46421 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46422 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
46423 // before adding into the accumulator.
46424 // TODO:
46425 // We also need to verify that the multiply has at least 2x the number of bits
46426 // of the input. We shouldn't match
46427 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46428 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46429 // Root = Root.getOperand(0);
46430
46431 // If there was a match, we want Root to be a mul.
46432 if (!Root || Root.getOpcode() != ISD::MUL)
46433 return SDValue();
46434
46435 // Check whether we have an extend and mul pattern
46436 SDValue LHS, RHS;
46437 if (!detectExtMul(DAG, Root, LHS, RHS))
46438 return SDValue();
46439
46440 // Create the dot product instruction.
46441 SDLoc DL(Extract);
46442 unsigned StageBias;
46443 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46444
46445 // If the original vector was wider than 4 elements, sum over the results
46446 // in the DP vector.
46447 unsigned Stages = Log2_32(VT.getVectorNumElements());
46448 EVT DpVT = DP.getValueType();
46449
46450 if (Stages > StageBias) {
46451 unsigned DpElems = DpVT.getVectorNumElements();
46452
46453 for (unsigned i = Stages - StageBias; i > 0; --i) {
46454 SmallVector<int, 16> Mask(DpElems, -1);
46455 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46456 Mask[j] = MaskEnd + j;
46457
46458 SDValue Shuffle =
46459 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46460 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46461 }
46462 }
46463
46464 // Return the lowest ExtractSizeInBits bits.
46465 EVT ResVT =
46466 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46467 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46468 DP = DAG.getBitcast(ResVT, DP);
46469 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46470 Extract->getOperand(1));
46471}
46472
46473 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46474 const X86Subtarget &Subtarget) {
46475 using namespace SDPatternMatch;
46476
46477 // PSADBW is only supported on SSE2 and up.
46478 if (!Subtarget.hasSSE2())
46479 return SDValue();
46480
46481 EVT ExtractVT = Extract->getValueType(0);
46482 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46483 ExtractVT != MVT::i64)
46484 return SDValue();
46485
46486 EVT VT = Extract->getOperand(0).getValueType();
46487 if (!isPowerOf2_32(VT.getVectorNumElements()))
46488 return SDValue();
46489
46490 // Match shuffle + add pyramid.
46491 ISD::NodeType BinOp;
46492 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46493 if (!Root)
46494 return SDValue();
46495
46496 // The operand is expected to be zero extended from i8.
46497 // In order to convert to i64 and above, additional any/zero/sign
46498 // extend is expected.
46499 // The zero extend from 32 bit has no mathematical effect on the result.
46500 // Also, the sign extend here is effectively a zero extend
46501 // (it extends the sign bit, which is zero).
46502 // So it is correct to skip the sign/zero extend instruction.
46503 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46504 Root.getOpcode() == ISD::ZERO_EXTEND ||
46505 Root.getOpcode() == ISD::ANY_EXTEND)
46506 Root = Root.getOperand(0);
46507
46508 // Check whether we have a vXi8 abdu pattern.
46509 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46510 SDValue Src0, Src1;
46511 if (!sd_match(
46512 Root,
46513 m_AnyOf(
46514 m_SpecificVectorElementVT(
46515 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46516 m_SpecificVectorElementVT(
46517 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46518 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46519 m_Abs(
46520 m_Sub(m_AllOf(m_Value(Src0),
46521 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46522 m_AllOf(m_Value(Src1),
46523 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46524 return SDValue();
46525
46526 // Create the SAD instruction.
46527 SDLoc DL(Extract);
46528 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46529
46530 // If the original vector was wider than 8 elements, sum over the results
46531 // in the SAD vector.
46532 unsigned Stages = Log2_32(VT.getVectorNumElements());
46533 EVT SadVT = SAD.getValueType();
46534 if (Stages > 3) {
46535 unsigned SadElems = SadVT.getVectorNumElements();
46536
46537 for(unsigned i = Stages - 3; i > 0; --i) {
46538 SmallVector<int, 16> Mask(SadElems, -1);
46539 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46540 Mask[j] = MaskEnd + j;
46541
46542 SDValue Shuffle =
46543 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46544 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46545 }
46546 }
46547
46548 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46549 // Return the lowest ExtractSizeInBits bits.
46550 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46551 SadVT.getSizeInBits() / ExtractSizeInBits);
46552 SAD = DAG.getBitcast(ResVT, SAD);
46553 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46554 Extract->getOperand(1));
46555}
46556
46557// If this extract is from a loaded vector value and will be used as an
46558// integer, that requires a potentially expensive XMM -> GPR transfer.
46559// Additionally, if we can convert to a scalar integer load, that will likely
46560// be folded into a subsequent integer op.
46561// Note: SrcVec might not have a VecVT type, but it must be the same size.
46562// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46563// to a single-use of the loaded vector. For the reasons above, we
46564// expect this to be profitable even if it creates an extra load.
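// Illustrative example (not part of the original source): extracting lane 2 of a
// loaded v4i32 for scalar use can be replaced by a plain i32 load at offset 8 from
// the same base pointer, avoiding an XMM -> GPR transfer.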
46565static SDValue
46566 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46567 const SDLoc &dl, SelectionDAG &DAG,
46568 TargetLowering::DAGCombinerInfo &DCI) {
46569 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46570 "Only EXTRACT_VECTOR_ELT supported so far");
46571
46572 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46573 EVT VT = N->getValueType(0);
46574
46575 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46576 return Use->getOpcode() == ISD::STORE ||
46577 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46578 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46579 });
46580
46581 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46582 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46583 VecVT.getVectorElementType() == VT &&
46584 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46585 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46586 SDValue NewPtr = TLI.getVectorElementPointer(
46587 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46588 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46589 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46590 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46591 SDValue Load =
46592 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46593 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46594 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46595 return Load;
46596 }
46597
46598 return SDValue();
46599}
46600
46601// Attempt to peek through a target shuffle and extract the scalar from the
46602// source.
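// Illustrative example (not part of the original source):
//   (i32 extract_vector_elt (v4i32 PSHUFD X, 0x4e), 0)
// maps output lane 0 back to input lane 2, so it can be folded to
//   (i32 extract_vector_elt X, 2).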
46603 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46604 TargetLowering::DAGCombinerInfo &DCI,
46605 const X86Subtarget &Subtarget) {
46606 if (DCI.isBeforeLegalizeOps())
46607 return SDValue();
46608
46609 SDLoc dl(N);
46610 SDValue Src = N->getOperand(0);
46611 SDValue Idx = N->getOperand(1);
46612
46613 EVT VT = N->getValueType(0);
46614 EVT SrcVT = Src.getValueType();
46615 EVT SrcSVT = SrcVT.getVectorElementType();
46616 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46617 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46618
46619 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46620 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46621 return SDValue();
46622
46623 const APInt &IdxC = N->getConstantOperandAPInt(1);
46624 if (IdxC.uge(NumSrcElts))
46625 return SDValue();
46626
46627 SDValue SrcBC = peekThroughBitcasts(Src);
46628
46629 // Handle extract(bitcast(broadcast(scalar_value))).
46630 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46631 SDValue SrcOp = SrcBC.getOperand(0);
46632 EVT SrcOpVT = SrcOp.getValueType();
46633 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46634 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46635 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46636 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46637 // TODO support non-zero offsets.
46638 if (Offset == 0) {
46639 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46640 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46641 return SrcOp;
46642 }
46643 }
46644 }
46645
46646 // If we're extracting a single element from a broadcast load and there are
46647 // no other users, just create a single load.
46648 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD &&
46649 SrcBC.hasOneUse()) {
46650 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46651 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46652 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46653 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46654 SDValue Load =
46655 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46656 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46657 MemIntr->getMemOperand()->getFlags());
46658 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46659 return Load;
46660 }
46661 }
46662
46663 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46664 // TODO: Move to DAGCombine?
46665 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46666 SrcBC.getValueType().isInteger() &&
46667 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46668 SrcBC.getScalarValueSizeInBits() ==
46669 SrcBC.getOperand(0).getValueSizeInBits()) {
46670 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46671 if (IdxC.ult(Scale)) {
46672 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46673 SDValue Scl = SrcBC.getOperand(0);
46674 EVT SclVT = Scl.getValueType();
46675 if (Offset) {
46676 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46677 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46678 }
46679 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46680 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46681 return Scl;
46682 }
46683 }
46684
46685 // Handle extract(truncate(x)) for 0'th index.
46686 // TODO: Treat this as a faux shuffle?
46687 // TODO: When can we use this for general indices?
46688 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46689 (SrcVT.getSizeInBits() % 128) == 0) {
46690 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46691 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46692 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46693 Idx);
46694 }
46695
46696 // We can only legally extract other elements from 128-bit vectors and in
46697 // certain circumstances, depending on SSE-level.
46698 // TODO: Investigate float/double extraction if it will be just stored.
46699 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46700 unsigned Idx) {
46701 EVT VecSVT = VecVT.getScalarType();
46702 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46703 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46704 VecSVT == MVT::i64)) {
46705 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46706 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46707 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46708 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46709 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46710 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46711 Idx &= (NumEltsPerLane - 1);
46712 }
46713 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46714 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46715 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46716 DAG.getBitcast(VecVT, Vec),
46717 DAG.getVectorIdxConstant(Idx, dl));
46718 }
46719 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46720 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46721 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46722 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46723 DAG.getTargetConstant(Idx, dl, MVT::i8));
46724 }
46725 return SDValue();
46726 };
46727
46728 // Resolve the target shuffle inputs and mask.
46729 SmallVector<int, 16> Mask;
46730 SmallVector<SDValue, 2> Ops;
46731 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46732 return SDValue();
46733
46734 // Shuffle inputs must be the same size as the result.
46735 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46736 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46737 }))
46738 return SDValue();
46739
46740 // Attempt to narrow/widen the shuffle mask to the correct size.
46741 if (Mask.size() != NumSrcElts) {
46742 if ((NumSrcElts % Mask.size()) == 0) {
46743 SmallVector<int, 16> ScaledMask;
46744 int Scale = NumSrcElts / Mask.size();
46745 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46746 Mask = std::move(ScaledMask);
46747 } else if ((Mask.size() % NumSrcElts) == 0) {
46748 // Simplify Mask based on demanded element.
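// e.g. a v2i64 extract seen through a v8i16 shuffle mask (Scale == 4): only
// the 4 mask entries covering the extracted element are kept, the rest become
// undef so the mask can (hopefully) be re-widened to 2 elements below.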
46749 int ExtractIdx = (int)IdxC.getZExtValue();
46750 int Scale = Mask.size() / NumSrcElts;
46751 int Lo = Scale * ExtractIdx;
46752 int Hi = Scale * (ExtractIdx + 1);
46753 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46754 if (i < Lo || Hi <= i)
46755 Mask[i] = SM_SentinelUndef;
46756
46757 SmallVector<int, 16> WidenedMask;
46758 while (Mask.size() > NumSrcElts &&
46759 canWidenShuffleElements(Mask, WidenedMask))
46760 Mask = std::move(WidenedMask);
46761 }
46762 }
46763
46764 // If narrowing/widening failed, see if we can extract+zero-extend.
46765 int ExtractIdx;
46766 EVT ExtractVT;
46767 if (Mask.size() == NumSrcElts) {
46768 ExtractIdx = Mask[IdxC.getZExtValue()];
46769 ExtractVT = SrcVT;
46770 } else {
46771 unsigned Scale = Mask.size() / NumSrcElts;
46772 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46773 return SDValue();
46774 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46775 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46776 return SDValue();
46777 ExtractIdx = Mask[ScaledIdx];
46778 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46779 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46780 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46781 "Failed to widen vector type");
46782 }
46783
46784 // If the shuffle source element is undef/zero then we can just accept it.
46785 if (ExtractIdx == SM_SentinelUndef)
46786 return DAG.getUNDEF(VT);
46787
46788 if (ExtractIdx == SM_SentinelZero)
46789 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46790 : DAG.getConstant(0, dl, VT);
46791
46792 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46793 ExtractIdx = ExtractIdx % Mask.size();
46794 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46795 return DAG.getZExtOrTrunc(V, dl, VT);
46796
46797 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46798 if (SDValue V = combineExtractFromVectorLoad(
46799 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46800 return V;
46801
46802 return SDValue();
46803}
46804
46805/// Extracting a scalar FP value from vector element 0 is free, so extract each
46806/// operand first, then perform the math as a scalar op.
46807 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46808 const X86Subtarget &Subtarget,
46809 TargetLowering::DAGCombinerInfo &DCI) {
46810 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46811 SDValue Vec = ExtElt->getOperand(0);
46812 SDValue Index = ExtElt->getOperand(1);
46813 EVT VT = ExtElt->getValueType(0);
46814 EVT VecVT = Vec.getValueType();
46815
46816 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46817 // non-zero element because the shuffle+scalar op will be cheaper?
46818 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46819 return SDValue();
46820
46821 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46822 // extract, the condition code), so deal with those as a special-case.
46823 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46824 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46825 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46826 return SDValue();
46827
46828 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46829 SDLoc DL(ExtElt);
46830 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46831 Vec.getOperand(0), Index);
46832 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46833 Vec.getOperand(1), Index);
46834 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46835 }
46836
46837 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46838 VT != MVT::f64)
46839 return SDValue();
46840
46841 // Vector FP selects don't fit the pattern of FP math ops (because the
46842 // condition has a different type and we have to change the opcode), so deal
46843 // with those here.
46844 // FIXME: This is restricted to pre type legalization. If we loosen this we
46845 // need to convert vector bool to a scalar bool.
46846 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46847 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46848 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46849 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46850 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46851 SDLoc DL(ExtElt);
46852 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46853 Vec.getOperand(0).getValueType().getScalarType(),
46854 Vec.getOperand(0), Index);
46855 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46856 Vec.getOperand(1), Index);
46857 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46858 Vec.getOperand(2), Index);
46859 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46860 }
46861
46862 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46863 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46864 // missed load folding and fma+fneg combining.
46865 switch (Vec.getOpcode()) {
46866 case ISD::FMA: // Begin 3 operands
46867 case ISD::FMAD:
46868 case ISD::FADD: // Begin 2 operands
46869 case ISD::FSUB:
46870 case ISD::FMUL:
46871 case ISD::FDIV:
46872 case ISD::FREM:
46873 case ISD::FCOPYSIGN:
46874 case ISD::FMINNUM:
46875 case ISD::FMAXNUM:
46876 case ISD::FMINNUM_IEEE:
46877 case ISD::FMAXNUM_IEEE:
46878 case ISD::FMAXIMUM:
46879 case ISD::FMINIMUM:
46880 case ISD::FMAXIMUMNUM:
46881 case ISD::FMINIMUMNUM:
46882 case X86ISD::FMAX:
46883 case X86ISD::FMIN:
46884 case ISD::FABS: // Begin 1 operand
46885 case ISD::FSQRT:
46886 case ISD::FRINT:
46887 case ISD::FCEIL:
46888 case ISD::FTRUNC:
46889 case ISD::FNEARBYINT:
46890 case ISD::FROUNDEVEN:
46891 case ISD::FROUND:
46892 case ISD::FFLOOR:
46893 case X86ISD::FRCP:
46894 case X86ISD::FRSQRT: {
46895 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46896 SDLoc DL(ExtElt);
46897 SmallVector<SDValue, 4> ExtOps;
46898 for (SDValue Op : Vec->ops())
46899 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46900 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46901 }
46902 default:
46903 return SDValue();
46904 }
46905 llvm_unreachable("All opcodes should return within switch");
46906}
46907
46908/// Try to convert a vector reduction sequence composed of binops and shuffles
46909/// into horizontal ops.
46910 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46911 const X86Subtarget &Subtarget) {
46912 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46913
46914 // We need at least SSE2 to do anything here.
46915 if (!Subtarget.hasSSE2())
46916 return SDValue();
46917
46918 ISD::NodeType Opc;
46919 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46920 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46921 if (!Rdx)
46922 return SDValue();
46923
46924 SDValue Index = ExtElt->getOperand(1);
46925 assert(isNullConstant(Index) &&
46926 "Reduction doesn't end in an extract from index 0");
46927
46928 EVT VT = ExtElt->getValueType(0);
46929 EVT VecVT = Rdx.getValueType();
46930 if (VecVT.getScalarType() != VT)
46931 return SDValue();
46932
46933 SDLoc DL(ExtElt);
46934 unsigned NumElts = VecVT.getVectorNumElements();
46935 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46936
46937 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46938 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46939 if (V.getValueType() == MVT::v4i8) {
46940 if (ZeroExtend && Subtarget.hasSSE41()) {
46941 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46942 DAG.getConstant(0, DL, MVT::v4i32),
46943 DAG.getBitcast(MVT::i32, V),
46944 DAG.getVectorIdxConstant(0, DL));
46945 return DAG.getBitcast(MVT::v16i8, V);
46946 }
46947 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46948 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46949 : DAG.getUNDEF(MVT::v4i8));
46950 }
46951 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46952 DAG.getUNDEF(MVT::v8i8));
46953 };
46954
46955 // vXi8 mul reduction - promote to vXi16 mul reduction.
46956 if (Opc == ISD::MUL) {
46957 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
46958 return SDValue();
46959 if (VecVT.getSizeInBits() >= 128) {
46960 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
46961 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46962 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46963 Lo = DAG.getBitcast(WideVT, Lo);
46964 Hi = DAG.getBitcast(WideVT, Hi);
46965 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
46966 while (Rdx.getValueSizeInBits() > 128) {
46967 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46968 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
46969 }
46970 } else {
46971 Rdx = WidenToV16I8(Rdx, false);
46972 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
46973 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
46974 }
46975 if (NumElts >= 8)
46976 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46977 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46978 {4, 5, 6, 7, -1, -1, -1, -1}));
46979 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46980 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46981 {2, 3, -1, -1, -1, -1, -1, -1}));
46982 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46983 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46984 {1, -1, -1, -1, -1, -1, -1, -1}));
46985 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46986 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46987 }
46988
46989 // vXi8 add reduction - sub-128-bit vector.
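// PSADBW against zero computes sum(|x[i] - 0|) per 64-bit lane, i.e. it
// horizontally adds the 8 bytes of each lane, so one instruction reduces the
// whole (zero-extended) v4i8/v8i8 input.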
46990 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
46991 Rdx = WidenToV16I8(Rdx, true);
46992 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46993 DAG.getConstant(0, DL, MVT::v16i8));
46994 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46995 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46996 }
46997
46998 // Must be a >=128-bit vector with pow2 elements.
46999 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47000 return SDValue();
47001
47002 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47003 if (VT == MVT::i8) {
47004 while (Rdx.getValueSizeInBits() > 128) {
47005 SDValue Lo, Hi;
47006 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47007 VecVT = Lo.getValueType();
47008 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47009 }
47010 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47011
47012 SDValue Hi = DAG.getVectorShuffle(
47013 MVT::v16i8, DL, Rdx, Rdx,
47014 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47015 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47016 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47017 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47018 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47019 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47020 }
47021
47022 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47023 // If the source vector values are 0-255, then we can use PSADBW to
47024 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47025 // TODO: See if it's worth avoiding vXi16/i32 truncations?
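// e.g. a v8i32 ISD::ADD reduction where every element is known to be <= 255
// can be truncated to v8i8 and summed with a single PSADBW (provided the
// truncation is cheap, e.g. the input is already a zero-extension or AVX512
// truncation is available).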
47026 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47027 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47028 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47029 Subtarget.hasAVX512())) {
47030 if (Rdx.getValueType() == MVT::v8i16) {
47031 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47032 DAG.getUNDEF(MVT::v8i16));
47033 } else {
47034 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47035 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47036 if (ByteVT.getSizeInBits() < 128)
47037 Rdx = WidenToV16I8(Rdx, true);
47038 }
47039
47040 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47041 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47042 ArrayRef<SDValue> Ops) {
47043 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47044 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47045 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47046 };
47047 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47048 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47049
47050 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47051 while (Rdx.getValueSizeInBits() > 128) {
47052 SDValue Lo, Hi;
47053 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47054 VecVT = Lo.getValueType();
47055 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47056 }
47057 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47058
47059 if (NumElts > 8) {
47060 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47061 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47062 }
47063
47064 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47065 Rdx = DAG.getBitcast(VecVT, Rdx);
47066 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47067 }
47068
47069 // Only use (F)HADD opcodes if they aren't microcoded or if it minimizes codesize.
47070 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47071 return SDValue();
47072
47073 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47074
47075 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47076 // across the whole vector, so we need an extract + hop preliminary stage.
47077 // This is the only step where the operands of the hop are not the same value.
47078 // TODO: We could extend this to handle 512-bit or even longer vectors.
47079 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47080 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47081 unsigned NumElts = VecVT.getVectorNumElements();
47082 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47083 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47084 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47085 VecVT = Rdx.getValueType();
47086 }
47087 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47088 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47089 return SDValue();
47090
47091 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47092 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47093 for (unsigned i = 0; i != ReductionSteps; ++i)
47094 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47095
47096 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47097}
47098
47099/// Detect vector gather/scatter index generation and convert it from being a
47100/// bunch of shuffles and extracts into a somewhat faster sequence.
47101/// For i686, the best sequence is apparently storing the value and loading
47102/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47103 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47104 TargetLowering::DAGCombinerInfo &DCI,
47105 const X86Subtarget &Subtarget) {
47106 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47107 return NewOp;
47108
47109 SDValue InputVector = N->getOperand(0);
47110 SDValue EltIdx = N->getOperand(1);
47111 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47112
47113 EVT SrcVT = InputVector.getValueType();
47114 EVT VT = N->getValueType(0);
47115 SDLoc dl(InputVector);
47116 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47117 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47118 unsigned NumEltBits = VT.getScalarSizeInBits();
47119 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47120
47121 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47122 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47123
47124 // Integer Constant Folding.
47125 if (CIdx && VT.isInteger()) {
47126 APInt UndefVecElts;
47127 SmallVector<APInt, 16> EltBits;
47128 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47129 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47130 EltBits, /*AllowWholeUndefs*/ true,
47131 /*AllowPartialUndefs*/ false)) {
47132 uint64_t Idx = CIdx->getZExtValue();
47133 if (UndefVecElts[Idx])
47134 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47135 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47136 }
47137
47138 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47139 // Improves lowering of bool masks in Rust, which splits them into a byte array.
47140 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47141 SDValue Src = peekThroughBitcasts(InputVector);
47142 if (Src.getValueType().getScalarType() == MVT::i1 &&
47143 TLI.isTypeLegal(Src.getValueType())) {
47144 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47145 SDValue Sub = DAG.getNode(
47146 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47147 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47148 return DAG.getBitcast(VT, Sub);
47149 }
47150 }
47151 }
47152
47153 if (IsPextr) {
47154 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47155 DCI))
47156 return SDValue(N, 0);
47157
47158 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47159 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47160 InputVector.getOpcode() == X86ISD::PINSRW) &&
47161 InputVector.getOperand(2) == EltIdx) {
47162 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47163 "Vector type mismatch");
47164 SDValue Scl = InputVector.getOperand(1);
47165 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47166 return DAG.getZExtOrTrunc(Scl, dl, VT);
47167 }
47168
47169 // TODO - Remove this once we can handle the implicit zero-extension of
47170 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47171 // combineBasicSADPattern.
47172 return SDValue();
47173 }
47174
47175 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
47176 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47177 InputVector.getOpcode() == ISD::BITCAST &&
47178 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47179 isNullConstant(EltIdx) && InputVector.hasOneUse())
47180 return DAG.getBitcast(VT, InputVector);
47181
47182 // Detect mmx to i32 conversion through a v2i32 elt extract.
47183 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47184 InputVector.getOpcode() == ISD::BITCAST &&
47185 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47186 isNullConstant(EltIdx) && InputVector.hasOneUse())
47187 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47188 InputVector.getOperand(0));
47189
47190 // Check whether this extract is the root of a sum of absolute differences
47191 // pattern. This has to be done here because we really want it to happen
47192 // pre-legalization.
47193 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47194 return SAD;
47195
47196 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47197 return VPDPBUSD;
47198
47199 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47200 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47201 return Cmp;
47202
47203 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47204 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47205 return MinMax;
47206
47207 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47208 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47209 return V;
47210
47211 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47212 return V;
47213
47214 if (CIdx)
47215 if (SDValue V = combineExtractFromVectorLoad(
47216 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47217 dl, DAG, DCI))
47218 return V;
47219
47220 // Attempt to extract an i1 element by using MOVMSK to extract the signbits
47221 // and then testing the relevant element.
47222 //
47223 // Note that we only combine extracts on the *same* result number, i.e.
47224 // t0 = merge_values a0, a1, a2, a3
47225 // i1 = extract_vector_elt t0, Constant:i64<2>
47226 // i1 = extract_vector_elt t0, Constant:i64<3>
47227 // but not
47228 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47229 // since the latter would need its own MOVMSK.
47230 if (SrcVT.getScalarType() == MVT::i1) {
47231 bool IsVar = !CIdx;
47232 SmallVector<SDNode *, 16> BoolExtracts;
47233 unsigned ResNo = InputVector.getResNo();
47234 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47235 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47236 Use->getOperand(0).getResNo() == ResNo &&
47237 Use->getValueType(0) == MVT::i1) {
47238 BoolExtracts.push_back(Use);
47239 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47240 return true;
47241 }
47242 return false;
47243 };
47244 // TODO: Can we drop the oneuse check for constant extracts?
47245 if (all_of(InputVector->users(), IsBoolExtract) &&
47246 (IsVar || BoolExtracts.size() > 1)) {
47247 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47248 if (SDValue BC =
47249 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47250 for (SDNode *Use : BoolExtracts) {
47251 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47252 // Mask = 1 << MaskIdx
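// e.g. extracting bit 5 of a v16i1 mask becomes ((movmsk X) & 0x0020) == 0x0020.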
47253 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47254 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47255 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47256 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47257 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47258 DCI.CombineTo(Use, Res);
47259 }
47260 return SDValue(N, 0);
47261 }
47262 }
47263 }
47264
47265 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47266 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47267 SDValue TruncSrc = InputVector.getOperand(0);
47268 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47269 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47270 SDValue NewExt =
47271 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47272 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47273 }
47274 }
47275
47276 return SDValue();
47277}
47278
47279// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47280// This is more or less the reverse of combineBitcastvxi1.
47281 static SDValue combineToExtendBoolVectorInReg(
47282 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47283 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47284 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47285 Opcode != ISD::ANY_EXTEND)
47286 return SDValue();
47287 if (!DCI.isBeforeLegalizeOps())
47288 return SDValue();
47289 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47290 return SDValue();
47291
47292 EVT SVT = VT.getScalarType();
47293 EVT InSVT = N0.getValueType().getScalarType();
47294 unsigned EltSizeInBits = SVT.getSizeInBits();
47295
47296 // Input type must be extending a bool vector (bit-casted from a scalar
47297 // integer) to legal integer types.
47298 if (!VT.isVector())
47299 return SDValue();
47300 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47301 return SDValue();
47302 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47303 return SDValue();
47304
47305 SDValue N00 = N0.getOperand(0);
47306 EVT SclVT = N00.getValueType();
47307 if (!SclVT.isScalarInteger())
47308 return SDValue();
47309
47310 SDValue Vec;
47311 SmallVector<int> ShuffleMask;
47312 unsigned NumElts = VT.getVectorNumElements();
47313 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47314
47315 // Broadcast the scalar integer to the vector elements.
47316 if (NumElts > EltSizeInBits) {
47317 // If the scalar integer is greater than the vector element size, then we
47318 // must split it down into sub-sections for broadcasting. For example:
47319 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47320 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47321 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47322 unsigned Scale = NumElts / EltSizeInBits;
47323 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47324 bool UseBroadcast = Subtarget.hasInt256() &&
47325 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47326 Vec = UseBroadcast
47327 ? DAG.getSplat(BroadcastVT, DL, N00)
47328 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47329 Vec = DAG.getBitcast(VT, Vec);
47330
47331 for (unsigned i = 0; i != Scale; ++i) {
47332 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47333 ShuffleMask.append(EltSizeInBits, i + Offset);
47334 }
47335 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47336 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47337 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47338 // If we have register broadcast instructions, use the scalar size as the
47339 // element type for the shuffle. Then cast to the wider element type. The
47340 // widened bits won't be used, and this might allow the use of a broadcast
47341 // load.
47342 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47343 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47344 (NumElts * EltSizeInBits) / NumElts);
47345 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47346 } else {
47347 // For a smaller scalar integer, we can simply any-extend it to the vector
47348 // element size (we don't care about the upper bits) and broadcast it to all
47349 // elements.
47350 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47351 }
47352
47353 // Now, mask the relevant bit in each element.
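// e.g. for i8 -> v8i16, element i is ANDed with (1 << i) and then compared
// against that same constant below, giving all-ones in lanes whose source bit
// is set.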
47354 SmallVector<SDValue, 32> Bits;
47355 for (unsigned i = 0; i != NumElts; ++i) {
47356 int BitIdx = (i % EltSizeInBits);
47357 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47358 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47359 }
47360 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47361 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47362
47363 // Compare against the bitmask and extend the result.
47364 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47365 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47366 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47367
47368 // For SEXT, this is now done, otherwise shift the result down for
47369 // zero-extension.
47370 if (Opcode == ISD::SIGN_EXTEND)
47371 return Vec;
47372 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47373 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47374}
47375
47376/// If both arms of a vector select are concatenated vectors, split the select,
47377/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47378/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47379/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47380 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47381 const X86Subtarget &Subtarget) {
47382 unsigned Opcode = N->getOpcode();
47383 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47384 return SDValue();
47385
47386 // TODO: Split 512-bit vectors too?
47387 EVT VT = N->getValueType(0);
47388 if (!VT.is256BitVector())
47389 return SDValue();
47390
47391 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47392 SDValue Cond = N->getOperand(0);
47393 SDValue TVal = N->getOperand(1);
47394 SDValue FVal = N->getOperand(2);
47395 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47396 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47397 return SDValue();
47398
47399 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47400 ArrayRef<SDValue> Ops) {
47401 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47402 };
47403 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47404 /*CheckBWI*/ false);
47405}
47406
47407 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47408 const SDLoc &DL) {
47409 SDValue Cond = N->getOperand(0);
47410 SDValue LHS = N->getOperand(1);
47411 SDValue RHS = N->getOperand(2);
47412
47413 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47414 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47415 if (!TrueC || !FalseC)
47416 return SDValue();
47417
47418 // Don't do this for crazy integer types.
47419 EVT VT = N->getValueType(0);
47420 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47421 return SDValue();
47422
47423 // We're going to use the condition bit in math or logic ops. We could allow
47424 // this with a wider condition value (post-legalization it becomes an i8),
47425 // but if nothing is creating selects that late, it doesn't matter.
47426 if (Cond.getValueType() != MVT::i1)
47427 return SDValue();
47428
47429 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47430 // 3, 5, or 9 with i32/i64, so those get transformed too.
47431 // TODO: For constants that overflow or do not differ by power-of-2 or small
47432 // multiplier, convert to 'and' + 'add'.
47433 const APInt &TrueVal = TrueC->getAPIntValue();
47434 const APInt &FalseVal = FalseC->getAPIntValue();
47435
47436 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47437 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47438 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47439 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47440 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47441 return SDValue();
47442 }
47443
47444 bool OV;
47445 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47446 if (OV)
47447 return SDValue();
47448
47449 APInt AbsDiff = Diff.abs();
47450 if (AbsDiff.isPowerOf2() ||
47451 ((VT == MVT::i32 || VT == MVT::i64) &&
47452 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47453
47454 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47455 // of the condition can usually be folded into a compare predicate, but even
47456 // without that, the sequence should be cheaper than a CMOV alternative.
47457 if (TrueVal.slt(FalseVal)) {
47458 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47459 std::swap(TrueC, FalseC);
47460 }
47461
47462 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
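// e.g. select Cond, i32 9, i32 4 --> add (mul (zext Cond), 5), 4, where the
// multiply by 5 is cheap via LEA.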
47463 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47464
47465 // Multiply condition by the difference if non-one.
47466 if (!AbsDiff.isOne())
47467 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47468
47469 // Add the base if non-zero.
47470 if (!FalseC->isZero())
47471 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47472
47473 return R;
47474 }
47475
47476 return SDValue();
47477}
47478
47479/// If this is a *dynamic* select (non-constant condition) and we can match
47480/// this node with one of the variable blend instructions, restructure the
47481/// condition so that blends can use the high (sign) bit of each element.
47482/// This function will also call SimplifyDemandedBits on already created
47483/// BLENDV to perform additional simplifications.
47484 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47485 const SDLoc &DL,
47486 TargetLowering::DAGCombinerInfo &DCI,
47487 const X86Subtarget &Subtarget) {
47488 SDValue Cond = N->getOperand(0);
47489 if ((N->getOpcode() != ISD::VSELECT &&
47490 N->getOpcode() != X86ISD::BLENDV) ||
47491 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47492 return SDValue();
47493
47494 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47495 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47496 EVT VT = N->getValueType(0);
47497
47498 // We can only handle the cases where VSELECT is directly legal on the
47499 // subtarget. We custom lower VSELECT nodes with constant conditions and
47500 // this makes it hard to see whether a dynamic VSELECT will correctly
47501 // lower, so we both check the operation's status and explicitly handle the
47502 // cases where a *dynamic* blend will fail even though a constant-condition
47503 // blend could be custom lowered.
47504 // FIXME: We should find a better way to handle this class of problems.
47505 // Potentially, we should combine constant-condition vselect nodes
47506 // pre-legalization into shuffles and not mark as many types as custom
47507 // lowered.
47508 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47509 return SDValue();
47510 // FIXME: We don't support i16-element blends currently. We could and
47511 // should support them by making *all* the bits in the condition be set
47512 // rather than just the high bit and using an i8-element blend.
47513 if (VT.getVectorElementType() == MVT::i16)
47514 return SDValue();
47515 // Dynamic blending was only available from SSE4.1 onward.
47516 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47517 return SDValue();
47519 // Byte blends are only available in AVX2.
47519 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47520 return SDValue();
47521 // There are no 512-bit blend instructions that use sign bits.
47522 if (VT.is512BitVector())
47523 return SDValue();
47524
47525 // Don't optimize before the condition has been transformed to a legal type
47526 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47527 if (BitWidth < 8 || BitWidth > 64)
47528 return SDValue();
47529
47530 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47531 for (SDUse &Use : Cond->uses())
47532 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47533 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47534 Use.getOperandNo() != 0)
47535 return false;
47536
47537 return true;
47538 };
47539
47540 APInt DemandedBits(APInt::getSignMask(BitWidth));
47541
47542 if (OnlyUsedAsSelectCond(Cond)) {
47543 KnownBits Known;
47544 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47545 !DCI.isBeforeLegalizeOps());
47546 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47547 return SDValue();
47548
47549 // If we changed the computation somewhere in the DAG, this change will
47550 // affect all users of Cond. Update all the nodes so that we do not use
47551 // the generic VSELECT anymore. Otherwise, we may perform wrong
47552 // optimizations as we messed with the actual expectation for the vector
47553 // boolean values.
47554 for (SDNode *U : Cond->users()) {
47555 if (U->getOpcode() == X86ISD::BLENDV)
47556 continue;
47557
47558 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47559 Cond, U->getOperand(1), U->getOperand(2));
47560 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47561 DCI.AddToWorklist(U);
47562 }
47563 DCI.CommitTargetLoweringOpt(TLO);
47564 return SDValue(N, 0);
47565 }
47566
47567 // Otherwise we can still at least try to simplify multiple use bits.
47568 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47569 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47570 N->getOperand(1), N->getOperand(2));
47571
47572 return SDValue();
47573}
47574
47575// Try to match:
47576// (or (and (M, (sub 0, X)), (pandn M, X)))
47577// which is a special case of:
47578// (select M, (sub 0, X), X)
47579// Per:
47580// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47581// We know that, if fNegate is 0 or 1:
47582// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47583//
47584// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47585// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47586// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47587// This lets us transform our vselect to:
47588// (add (xor X, M), (and M, 1))
47589// And further to:
47590// (sub (xor X, M), M)
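// Sanity check: with M == -1 (all ones), (X ^ -1) - (-1) == ~X + 1 == -X;
// with M == 0, (X ^ 0) - 0 == X.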
47591 static SDValue combineLogicBlendIntoConditionalNegate(
47592 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47593 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47594 using namespace SDPatternMatch;
47595 EVT MaskVT = Mask.getValueType();
47596 assert(MaskVT.isInteger() &&
47597 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47598 "Mask must be zero/all-bits");
47599
47600 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47601 !DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
47602 return SDValue();
47603
47604 SDValue V;
47605 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47606 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47607 return SDValue();
47608
47609 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47610 SDValue SubOp2 = Mask;
47611
47612 // If the negate was on the false side of the select, then
47613 // the operands of the SUB need to be swapped. PR 27251.
47614 // This is because the pattern being matched above is
47615 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
47616 // but if the pattern matched was
47617 // (vselect M, X, (sub 0, X)), that is really a negation of the pattern
47618 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47619 // pattern also needs to be a negation of the replacement pattern above.
47620 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47621 // sub accomplishes the negation of the replacement pattern.
47622 if (V == Y)
47623 std::swap(SubOp1, SubOp2);
47624
47625 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47626 return DAG.getBitcast(VT, Res);
47627}
47628
47629 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47630 const X86Subtarget &Subtarget) {
47631 using namespace SDPatternMatch;
47632 if (!Subtarget.hasAVX512())
47633 return SDValue();
47634
47635 ISD::CondCode CC;
47636 SDValue Cond, X, Y, LHS, RHS;
47637 if (!sd_match(N, m_VSelect(m_AllOf(m_Value(Cond),
47638 m_OneUse(m_SetCC(m_Value(X), m_Value(Y),
47639 m_CondCode(CC)))),
47640 m_Value(LHS), m_Value(RHS))))
47641 return SDValue();
47642
47643 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47644 !canCombineAsMaskOperation(RHS, Subtarget))
47645 return SDValue();
47646
47647 // Commute LHS and RHS to create opportunity to select mask instruction.
47648 // (vselect M, L, R) -> (vselect ~M, R, L)
47649 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47650 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47651 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47652}
47653
47654/// Do target-specific dag combines on SELECT and VSELECT nodes.
47655 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47656 TargetLowering::DAGCombinerInfo &DCI,
47657 const X86Subtarget &Subtarget) {
47658 SDLoc DL(N);
47659 SDValue Cond = N->getOperand(0);
47660 SDValue LHS = N->getOperand(1);
47661 SDValue RHS = N->getOperand(2);
47662
47663 // Try simplification again because we use this function to optimize
47664 // BLENDV nodes that are not handled by the generic combiner.
47665 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47666 return V;
47667
47668 // When AVX512 is available, the LHS operand of a select instruction can be
47669 // folded into a masked instruction, while the RHS operand can't. Commute
47670 // the LHS and RHS of the select instruction to create the opportunity for
47671 // folding.
47672 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47673 return V;
47674
47675 EVT VT = LHS.getValueType();
47676 EVT CondVT = Cond.getValueType();
47677 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47678 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47679
47680 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47681 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47682 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47683 if (CondVT.isVector() && CondVT.isInteger() &&
47684 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47685 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47686 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47687 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47688 DL, DAG, Subtarget))
47689 return V;
47690
47691 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47692 SmallVector<int, 64> CondMask;
47693 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47694 N->getOpcode() == X86ISD::BLENDV)) {
47695 // Convert vselects with constant condition into shuffles.
47696 if (DCI.isBeforeLegalizeOps())
47697 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47698
47699 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47700 // by forcing the unselected elements to zero.
47701 // TODO: Can we handle more shuffles with this?
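// A PSHUFB control byte with its top bit set (0x80) writes zero to that lane,
// so zeroing the unselected indices in each mask lets a plain OR act as the
// blend of the two shuffle results.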
47702 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47703 SmallVector<SDValue, 1> LHSOps, RHSOps;
47704 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47705 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47706 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47707 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47708 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47709 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47710 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47711 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47712 assert(ByteMask.size() == LHSMask.size() &&
47713 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47714 for (auto [I, M] : enumerate(ByteMask)) {
47715 // getConstVector sets negative shuffle mask values as undef, so
47716 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47717 if (M < (int)ByteMask.size()) {
47718 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47719 RHSMask[I] = 0x80;
47720 } else {
47721 LHSMask[I] = 0x80;
47722 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47723 }
47724 }
47725 MVT ByteVT = LHSShuf.getSimpleValueType();
47726 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47727 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47728 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47729 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47730 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47731 }
47732 }
47733
47734 // Attempt to combine as shuffle.
47735 SDValue Op(N, 0);
47736 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47737 return Res;
47738 }
47739 }
47740
47741 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47742 // instructions match the semantics of the common C idiom x<y?x:y but not
47743 // x<=y?x:y, because of how they handle negative zero (which can be
47744 // ignored in unsafe-math mode).
47745 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
47746 if ((Cond.getOpcode() == ISD::SETCC ||
47747 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47748 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47749 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47750 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47751 (Subtarget.hasSSE2() ||
47752 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47753 bool IsStrict = Cond->isStrictFPOpcode();
47754 ISD::CondCode CC =
47755 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47756 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47757 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47758
47759 unsigned Opcode = 0;
47760 // Check for x CC y ? x : y.
47761 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47762 switch (CC) {
47763 default: break;
47764 case ISD::SETULT:
47765 // Converting this to a min would handle NaNs incorrectly, and swapping
47766 // the operands would cause it to handle comparisons between positive
47767 // and negative zero incorrectly.
47768 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47769 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47770 !(DAG.isKnownNeverZeroFloat(LHS) ||
47771 DAG.isKnownNeverZeroFloat(RHS)))
47772 break;
47773 std::swap(LHS, RHS);
47774 }
47775 Opcode = X86ISD::FMIN;
47776 break;
47777 case ISD::SETOLE:
47778 // Converting this to a min would handle comparisons between positive
47779 // and negative zero incorrectly.
47780 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47781 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47782 break;
47783 Opcode = X86ISD::FMIN;
47784 break;
47785 case ISD::SETULE:
47786 // Converting this to a min would handle both negative zeros and NaNs
47787 // incorrectly, but we can swap the operands to fix both.
47788 std::swap(LHS, RHS);
47789 [[fallthrough]];
47790 case ISD::SETOLT:
47791 case ISD::SETLT:
47792 case ISD::SETLE:
47793 Opcode = X86ISD::FMIN;
47794 break;
47795
47796 case ISD::SETOGE:
47797 // Converting this to a max would handle comparisons between positive
47798 // and negative zero incorrectly.
47799 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47800 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47801 break;
47802 Opcode = X86ISD::FMAX;
47803 break;
47804 case ISD::SETUGT:
47805 // Converting this to a max would handle NaNs incorrectly, and swapping
47806 // the operands would cause it to handle comparisons between positive
47807 // and negative zero incorrectly.
47808 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47809 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47810 !(DAG.isKnownNeverZeroFloat(LHS) ||
47811 DAG.isKnownNeverZeroFloat(RHS)))
47812 break;
47813 std::swap(LHS, RHS);
47814 }
47815 Opcode = X86ISD::FMAX;
47816 break;
47817 case ISD::SETUGE:
47818 // Converting this to a max would handle both negative zeros and NaNs
47819 // incorrectly, but we can swap the operands to fix both.
47820 std::swap(LHS, RHS);
47821 [[fallthrough]];
47822 case ISD::SETOGT:
47823 case ISD::SETGT:
47824 case ISD::SETGE:
47825 Opcode = X86ISD::FMAX;
47826 break;
47827 }
47828 // Check for x CC y ? y : x -- a min/max with reversed arms.
47829 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47830 switch (CC) {
47831 default: break;
47832 case ISD::SETOGE:
47833 // Converting this to a min would handle comparisons between positive
47834 // and negative zero incorrectly, and swapping the operands would
47835 // cause it to handle NaNs incorrectly.
47836 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47837 !(DAG.isKnownNeverZeroFloat(LHS) ||
47838 DAG.isKnownNeverZeroFloat(RHS))) {
47839 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47840 break;
47841 std::swap(LHS, RHS);
47842 }
47843 Opcode = X86ISD::FMIN;
47844 break;
47845 case ISD::SETUGT:
47846 // Converting this to a min would handle NaNs incorrectly.
47847 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47848 break;
47849 Opcode = X86ISD::FMIN;
47850 break;
47851 case ISD::SETUGE:
47852 // Converting this to a min would handle both negative zeros and NaNs
47853 // incorrectly, but we can swap the operands to fix both.
47854 std::swap(LHS, RHS);
47855 [[fallthrough]];
47856 case ISD::SETOGT:
47857 case ISD::SETGT:
47858 case ISD::SETGE:
47859 Opcode = X86ISD::FMIN;
47860 break;
47861
47862 case ISD::SETULT:
47863 // Converting this to a max would handle NaNs incorrectly.
47864 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47865 break;
47866 Opcode = X86ISD::FMAX;
47867 break;
47868 case ISD::SETOLE:
47869 // Converting this to a max would handle comparisons between positive
47870 // and negative zero incorrectly, and swapping the operands would
47871 // cause it to handle NaNs incorrectly.
47872 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47873 !DAG.isKnownNeverZeroFloat(LHS) &&
47874 !DAG.isKnownNeverZeroFloat(RHS)) {
47875 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47876 break;
47877 std::swap(LHS, RHS);
47878 }
47879 Opcode = X86ISD::FMAX;
47880 break;
47881 case ISD::SETULE:
47882 // Converting this to a max would handle both negative zeros and NaNs
47883 // incorrectly, but we can swap the operands to fix both.
47884 std::swap(LHS, RHS);
47885 [[fallthrough]];
47886 case ISD::SETOLT:
47887 case ISD::SETLT:
47888 case ISD::SETLE:
47889 Opcode = X86ISD::FMAX;
47890 break;
47891 }
47892 }
47893
47894 if (Opcode) {
47895 if (IsStrict) {
47896 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47897 : X86ISD::STRICT_FMAX,
47898 DL, {N->getValueType(0), MVT::Other},
47899 {Cond.getOperand(0), LHS, RHS});
47900 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47901 return Ret;
47902 }
47903 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47904 }
47905 }
47906
47907 // Some mask scalar intrinsics rely on checking if only one bit is set
47908 // and implement it in C code like this:
47909 // A[0] = (U & 1) ? A[0] : W[0];
47910 // This creates some redundant instructions that break pattern matching.
47911 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
47912 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47913 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47914 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47915 SDValue AndNode = Cond.getOperand(0);
47916 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47917 isNullConstant(Cond.getOperand(1)) &&
47918 isOneConstant(AndNode.getOperand(1))) {
47919 // LHS and RHS swapped due to
47920 // setcc outputting 1 when AND resulted in 0 and vice versa.
47921 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47922 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47923 }
47924 }
47925
47926 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47927 // lowering on KNL. In this case we convert it to
47928 // v16i8 (select v16i8, v16i8, v16i8) and use the AVX blend instruction.
47929 // The same applies to all i8 and i16 vectors without BWI.
47930 // Make sure we extend these even before type legalization gets a chance to
47931 // split wide vectors.
47932 // Since SKX these selects have a proper lowering.
47933 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47934 CondVT.getVectorElementType() == MVT::i1 &&
47935 (VT.getVectorElementType() == MVT::i8 ||
47936 VT.getVectorElementType() == MVT::i16)) {
47937 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47938 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47939 }
47940
47941 // AVX512 - Extend select to merge with target shuffle.
47942 // select(mask, extract_subvector(shuffle(x)), y) -->
47943 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47944 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47945 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47946 CondVT.getVectorElementType() == MVT::i1) {
47947 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47948 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47949 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47950 isNullConstant(Op.getOperand(1)) &&
47951 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47952 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47953 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47954 ISD::isBuildVectorAllZeros(Alt.getNode()));
47955 };
47956
47957 bool SelectableLHS = SelectableOp(LHS, RHS);
47958 bool SelectableRHS = SelectableOp(RHS, LHS);
47959 if (SelectableLHS || SelectableRHS) {
47960 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
47961 : RHS.getOperand(0).getValueType();
47962 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
47963 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
47964 VT.getSizeInBits());
47965 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
47966 VT.getSizeInBits());
47967 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
47968 DAG.getUNDEF(SrcCondVT), Cond,
47969 DAG.getVectorIdxConstant(0, DL));
47970 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
47971 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
47972 }
47973 }
47974
47975 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
47976 return V;
47977
47978 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47979 Cond.hasOneUse()) {
47980 EVT CondVT = Cond.getValueType();
47981 SDValue Cond0 = Cond.getOperand(0);
47982 SDValue Cond1 = Cond.getOperand(1);
47983 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47984
47985 // Canonicalize min/max:
47986 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47987 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47988 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
47989 // the need for an extra compare against zero. e.g.
47990 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
47991 // subl %esi, %edi
47992 // testl %edi, %edi
47993 // movl $0, %eax
47994 // cmovgl %edi, %eax
47995 // =>
47996 // xorl %eax, %eax
47997 // subl %esi, %edi
47998 // cmovsl %eax, %edi
47999 //
48000 // We can also canonicalize
48001 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48002 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48003 // This allows the use of a test instruction for the compare.
48004 if (LHS == Cond0 && RHS == Cond1) {
48005 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48006 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48007 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48008 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48009 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48010 }
48011 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48012 ISD::CondCode NewCC = ISD::SETUGE;
48013 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48014 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48015 }
48016 }
48017
48018 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48019 // fold eq + gt/lt nested selects into ge/le selects
48020 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48021 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48022 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48023 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48024 // .. etc ..
48025 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48026 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48027 SDValue InnerSetCC = RHS.getOperand(0);
48028 ISD::CondCode InnerCC =
48029 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48030 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48031 Cond0 == InnerSetCC.getOperand(0) &&
48032 Cond1 == InnerSetCC.getOperand(1)) {
48033 ISD::CondCode NewCC;
48034 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48035 // clang-format off
48036 case ISD::SETGT: NewCC = ISD::SETGE; break;
48037 case ISD::SETLT: NewCC = ISD::SETLE; break;
48038 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48039 case ISD::SETULT: NewCC = ISD::SETULE; break;
48040 default: NewCC = ISD::SETCC_INVALID; break;
48041 // clang-format on
48042 }
48043 if (NewCC != ISD::SETCC_INVALID) {
48044 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48045 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48046 }
48047 }
48048 }
48049 }
48050
48051 // Check if the first operand is all zeros and Cond type is vXi1.
48052 // If this is an AVX512 target we can improve the use of zero masking by
48053 // swapping the operands and inverting the condition.
48054 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48055 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48056 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48057 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48058 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48059 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48060 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48061 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48062 }
48063
48064 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48065 // get split by legalization.
48066 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48067 CondVT.getVectorElementType() == MVT::i1 &&
48068 TLI.isTypeLegal(VT.getScalarType())) {
48069 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48070 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48071 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48072 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48073 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48074 }
48075 }
48076
48077 // Exploit AVX2's VSHLV/VSRLV instructions for unsigned vector shifts whose
48078 // out-of-bounds behaviour is already defined by the surrounding select.
48079
48080 // Unlike the generic shift nodes (SHL/SRL), whose result is undefined once the
48081 // shift amount reaches the element bitwidth, AVX2's VSHLV/VSRLV are well
48082 // defined there: any element whose unsigned shift amount exceeds bitwidth-1 is
48083 // simply set to zero, for both left and right shifts. That matches the zero
48084 // operand of the selects matched below.
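// e.g. on AVX2, (vselect (setult %amt, splat(32)), (shl %x, %amt), zero) for
// v8i32 folds to a single VPSLLVD, which already yields zero whenever the
// per-element amount is >= 32.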
48085 if (N->getOpcode() == ISD::VSELECT) {
48086 using namespace llvm::SDPatternMatch;
48087 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48088 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48089 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48090 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48091 ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
48092 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48093 m_SpecificInt(VT.getScalarSizeInBits()),
48094 m_SpecificCondCode(ISD::SETULT)))) {
48095 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48096 : X86ISD::VSHLV,
48097 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48098 }
48099 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48100 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48101 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48102 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48104 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48106 m_SpecificCondCode(ISD::SETUGE)))) {
48107 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48108 : X86ISD::VSHLV,
48109 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48110 }
48111 }
48112
48113 // Early exit check
48114 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48115 return SDValue();
48116
48117 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48118 return V;
48119
48120 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48121 return V;
48122
48123 // select(~Cond, X, Y) -> select(Cond, Y, X)
48124 if (CondVT.getScalarType() != MVT::i1) {
48125 if (SDValue CondNot = IsNOT(Cond, DAG))
48126 return DAG.getNode(N->getOpcode(), DL, VT,
48127 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48128
48129 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48130 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48131 Cond.getOperand(0).getOpcode() == ISD::AND &&
48132 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48133 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48134 Cond.getScalarValueSizeInBits(),
48135 /*AllowUndefs=*/true) &&
48136 Cond.hasOneUse()) {
48137 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48138 Cond.getOperand(0).getOperand(1));
48139 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48140 }
48141
48142 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48143 // signbit.
48144 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48145 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48146 Cond.hasOneUse()) {
48147 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48148 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48149 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48150 }
48151 }
48152
48153 // Try to optimize vXi1 selects if both operands are either all constants or
48154 // bitcasts from scalar integer type. In that case we can convert the operands
48155 // to integer and use an integer select which will be converted to a CMOV.
48156 // We need to take a little bit of care to avoid creating an i64 type after
48157 // type legalization.
48158 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48159 VT.getVectorElementType() == MVT::i1 &&
48160 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48161 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48162 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48163 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48164 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48165
48166 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48167 LHS.getOperand(0).getValueType() == IntVT)) &&
48168 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48169 RHS.getOperand(0).getValueType() == IntVT))) {
48170 if (LHSIsConst)
48171 LHS = combinevXi1ConstantToInteger(LHS, DAG);
48172 else
48173 LHS = LHS.getOperand(0);
48174
48175 if (RHSIsConst)
48176 RHS = combinevXi1ConstantToInteger(RHS, DAG);
48177 else
48178 RHS = RHS.getOperand(0);
48179
48180 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48181 return DAG.getBitcast(VT, Select);
48182 }
48183 }
48184 }
48185
48186 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48187 // single bits, then invert the predicate and swap the select operands.
48188 // This can lower using a vector shift bit-hack rather than mask and compare.
48189 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48190 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48191 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48192 Cond.getOperand(0).getOpcode() == ISD::AND &&
48193 isNullOrNullSplat(Cond.getOperand(1)) &&
48194 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48195 Cond.getOperand(0).getValueType() == VT) {
48196 // The 'and' mask must be composed of power-of-2 constants.
48197 SDValue And = Cond.getOperand(0);
48198 auto *C = isConstOrConstSplat(And.getOperand(1));
48199 if (C && C->getAPIntValue().isPowerOf2()) {
48200 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48201 SDValue NotCond =
48202 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48203 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48204 }
48205
48206 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48207 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48208 // 16-bit lacks a proper blendv.
48209 unsigned EltBitWidth = VT.getScalarSizeInBits();
48210 bool CanShiftBlend =
48211 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48212 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48213 (Subtarget.hasXOP()));
48214 if (CanShiftBlend &&
48215 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48216 return C->getAPIntValue().isPowerOf2();
48217 })) {
48218 // Create a left-shift constant to get the mask bits over to the sign-bit.
48219 SDValue Mask = And.getOperand(1);
48220 SmallVector<int, 32> ShlVals;
48221 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48222 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48223 ShlVals.push_back(EltBitWidth - 1 -
48224 MaskVal->getAPIntValue().exactLogBase2());
48225 }
48226 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48227 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48228 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48229 SDValue NewCond =
48230 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48231 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48232 }
48233 }
48234
48235 return SDValue();
48236}
48237
48238/// Combine:
48239/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48240/// to:
48241/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48242/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48243/// Note that this is only legal for some op/cc combinations.
48244 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48245 SelectionDAG &DAG,
48246 const X86Subtarget &Subtarget) {
48247 // This combine only operates on CMP-like nodes.
48248 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48249 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48250 return SDValue();
48251
48252 // Can't replace the cmp if it has more uses than the one we're looking at.
48253 // FIXME: We would like to be able to handle this, but would need to make sure
48254 // all uses were updated.
48255 if (!Cmp.hasOneUse())
48256 return SDValue();
48257
48258 // This only applies to variations of the common case:
48259 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48260 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48261 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48262 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48263 // Using the proper condcodes (see below), overflow is checked for.
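// Worked example (illustrative, not from the original source): for
//   if (atomic_fetch_add(&x, 1) < 0) ...
// the old value is negative (COND_S) exactly when the new value is <= 0
// (COND_LE), so the branch can reuse the EFLAGS of "lock add $1, (x)".
// The S -> LE condcode pair also covers the overflow case (old == INT_MAX),
// since LE tests ZF | (SF != OF).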
48264
48265 // FIXME: We can generalize both constraints:
48266 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48267 // - LHS != 1
48268 // if the result is compared.
48269
48270 SDValue CmpLHS = Cmp.getOperand(0);
48271 SDValue CmpRHS = Cmp.getOperand(1);
48272 EVT CmpVT = CmpLHS.getValueType();
48273
48274 if (!CmpLHS.hasOneUse())
48275 return SDValue();
48276
48277 unsigned Opc = CmpLHS.getOpcode();
48278 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48279 return SDValue();
48280
48281 SDValue OpRHS = CmpLHS.getOperand(2);
48282 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48283 if (!OpRHSC)
48284 return SDValue();
48285
48286 APInt Addend = OpRHSC->getAPIntValue();
48287 if (Opc == ISD::ATOMIC_LOAD_SUB)
48288 Addend = -Addend;
48289
48290 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48291 if (!CmpRHSC)
48292 return SDValue();
48293
48294 APInt Comparison = CmpRHSC->getAPIntValue();
48295 APInt NegAddend = -Addend;
48296
48297 // See if we can adjust the CC to make the comparison match the negated
48298 // addend.
48299 if (Comparison != NegAddend) {
48300 APInt IncComparison = Comparison + 1;
48301 if (IncComparison == NegAddend) {
48302 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48303 Comparison = IncComparison;
48304 CC = X86::COND_AE;
48305 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48306 Comparison = IncComparison;
48307 CC = X86::COND_L;
48308 }
48309 }
48310 APInt DecComparison = Comparison - 1;
48311 if (DecComparison == NegAddend) {
48312 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48313 Comparison = DecComparison;
48314 CC = X86::COND_A;
48315 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48316 Comparison = DecComparison;
48317 CC = X86::COND_LE;
48318 }
48319 }
48320 }
48321
48322 // If the addend is the negation of the comparison value, then we can do
48323 // a full comparison by emitting the atomic arithmetic as a locked sub.
48324 if (Comparison == NegAddend) {
48325 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48326 // atomic sub.
48327 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48328 auto AtomicSub = DAG.getAtomic(
48329 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48330 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48331 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48332 AN->getMemOperand());
48333 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48334 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48335 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48336 return LockOp;
48337 }
48338
48339 // We can handle comparisons with zero in a number of cases by manipulating
48340 // the CC used.
48341 if (!Comparison.isZero())
48342 return SDValue();
48343
48344 if (CC == X86::COND_S && Addend == 1)
48345 CC = X86::COND_LE;
48346 else if (CC == X86::COND_NS && Addend == 1)
48347 CC = X86::COND_G;
48348 else if (CC == X86::COND_G && Addend == -1)
48349 CC = X86::COND_GE;
48350 else if (CC == X86::COND_LE && Addend == -1)
48351 CC = X86::COND_L;
48352 else
48353 return SDValue();
48354
48355 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48356 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48357 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48358 return LockOp;
48359}
48360
48361// Check whether we're just testing the signbit, and whether we can simplify
48362// this by tracking where the signbit came from.
48363 static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48364 SelectionDAG &DAG) {
48365 if (CC != X86::COND_S && CC != X86::COND_NS)
48366 return SDValue();
48367
48368 if (!Cmp.hasOneUse())
48369 return SDValue();
48370
48371 SDValue Src;
48372 if (Cmp.getOpcode() == X86ISD::CMP) {
48373 // CMP(X,0) -> signbit test
48374 if (!isNullConstant(Cmp.getOperand(1)))
48375 return SDValue();
48376 Src = Cmp.getOperand(0);
48377 // Peek through a SRA node as we just need the signbit.
48378 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48379 // TODO: Use SimplifyDemandedBits instead of just SRA?
48380 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48381 return SDValue();
48382 Src = Src.getOperand(0);
48383 } else if (Cmp.getOpcode() == X86ISD::OR) {
48384 // OR(X,Y) -> see if only one operand contributes to the signbit.
48385 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48386 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48387 Src = Cmp.getOperand(1);
48388 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48389 Src = Cmp.getOperand(0);
48390 else
48391 return SDValue();
48392 } else {
48393 return SDValue();
48394 }
48395
48396 // Replace with a TEST on the MSB.
48397 SDLoc DL(Cmp);
48398 MVT SrcVT = Src.getSimpleValueType();
48399 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48400
48401 // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
48402 // peek through and adjust the TEST bit.
48403 if (Src.getOpcode() == ISD::SHL) {
48404 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48405 Src = Src.getOperand(0);
48406 BitMask.lshrInPlace(*ShiftAmt);
48407 }
48408 }
48409
48410 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48411 DAG.getConstant(BitMask, DL, SrcVT));
48412 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48413 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48414 DAG.getConstant(0, DL, SrcVT));
48415}
48416
48417// Check whether a boolean test is testing a boolean value generated by
48418// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48419// code.
48420//
48421// Simplify the following patterns:
48422// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48423// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48424// to (Op EFLAGS Cond)
48425//
48426// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48427// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48428// to (Op EFLAGS !Cond)
48429//
48430// where Op could be BRCOND or CMOV.
48431//
48432 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48433 // This combine only operates on CMP-like nodes.
48434 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48435 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48436 return SDValue();
48437
48438 // Quit if not used as a boolean value.
48439 if (CC != X86::COND_E && CC != X86::COND_NE)
48440 return SDValue();
48441
48442 // Check CMP operands. One of them should be 0 or 1 and the other should be
48443 // an SetCC or extended from it.
48444 SDValue Op1 = Cmp.getOperand(0);
48445 SDValue Op2 = Cmp.getOperand(1);
48446
48447 SDValue SetCC;
48448 const ConstantSDNode* C = nullptr;
48449 bool needOppositeCond = (CC == X86::COND_E);
48450 bool checkAgainstTrue = false; // Is it a comparison against 1?
48451
48452 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48453 SetCC = Op2;
48454 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48455 SetCC = Op1;
48456 else // Quit if neither operand is a constant.
48457 return SDValue();
48458
48459 if (C->getZExtValue() == 1) {
48460 needOppositeCond = !needOppositeCond;
48461 checkAgainstTrue = true;
48462 } else if (C->getZExtValue() != 0)
48463 // Quit if the constant is neither 0 nor 1.
48464 return SDValue();
48465
48466 bool truncatedToBoolWithAnd = false;
48467 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48468 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48469 SetCC.getOpcode() == ISD::TRUNCATE ||
48470 SetCC.getOpcode() == ISD::AND) {
48471 if (SetCC.getOpcode() == ISD::AND) {
48472 int OpIdx = -1;
48473 if (isOneConstant(SetCC.getOperand(0)))
48474 OpIdx = 1;
48475 if (isOneConstant(SetCC.getOperand(1)))
48476 OpIdx = 0;
48477 if (OpIdx < 0)
48478 break;
48479 SetCC = SetCC.getOperand(OpIdx);
48480 truncatedToBoolWithAnd = true;
48481 } else
48482 SetCC = SetCC.getOperand(0);
48483 }
48484
48485 switch (SetCC.getOpcode()) {
48486 case X86ISD::SETCC_CARRY:
48487 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48488 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48489 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48490 // truncated to i1 using 'and'.
48491 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48492 break;
48494 "Invalid use of SETCC_CARRY!");
48495 [[fallthrough]];
48496 case X86ISD::SETCC:
48497 // Set the condition code or opposite one if necessary.
48498 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48499 if (needOppositeCond)
48500 CC = X86::GetOppositeBranchCondition(CC);
48501 return SetCC.getOperand(1);
48502 case X86ISD::CMOV: {
48503 // Check whether false/true value has canonical one, i.e. 0 or 1.
48504 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48505 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48506 // Quit if true value is not a constant.
48507 if (!TVal)
48508 return SDValue();
48509 // Quit if false value is not a constant.
48510 if (!FVal) {
48511 SDValue Op = SetCC.getOperand(0);
48512 // Skip 'zext' or 'trunc' node.
48513 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48514 Op.getOpcode() == ISD::TRUNCATE)
48515 Op = Op.getOperand(0);
48516 // A special case for rdrand/rdseed, where 0 is set if false cond is
48517 // found.
48518 if ((Op.getOpcode() != X86ISD::RDRAND &&
48519 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48520 return SDValue();
48521 }
48522 // Quit if false value is not the constant 0 or 1.
48523 bool FValIsFalse = true;
48524 if (FVal && FVal->getZExtValue() != 0) {
48525 if (FVal->getZExtValue() != 1)
48526 return SDValue();
48527 // If FVal is 1, opposite cond is needed.
48528 needOppositeCond = !needOppositeCond;
48529 FValIsFalse = false;
48530 }
48531 // Quit if TVal is not the constant opposite of FVal.
48532 if (FValIsFalse && TVal->getZExtValue() != 1)
48533 return SDValue();
48534 if (!FValIsFalse && TVal->getZExtValue() != 0)
48535 return SDValue();
48536 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48537 if (needOppositeCond)
48538 CC = X86::GetOppositeBranchCondition(CC);
48539 return SetCC.getOperand(3);
48540 }
48541 }
48542
48543 return SDValue();
48544}
48545
48546/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48547/// Match:
48548/// (X86or (X86setcc) (X86setcc))
48549/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48550 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48551 X86::CondCode &CC1, SDValue &Flags,
48552 bool &isAnd) {
48553 if (Cond->getOpcode() == X86ISD::CMP) {
48554 if (!isNullConstant(Cond->getOperand(1)))
48555 return false;
48556
48557 Cond = Cond->getOperand(0);
48558 }
48559
48560 isAnd = false;
48561
48562 SDValue SetCC0, SetCC1;
48563 switch (Cond->getOpcode()) {
48564 default: return false;
48565 case ISD::AND:
48566 case X86ISD::AND:
48567 isAnd = true;
48568 [[fallthrough]];
48569 case ISD::OR:
48570 case X86ISD::OR:
48571 SetCC0 = Cond->getOperand(0);
48572 SetCC1 = Cond->getOperand(1);
48573 break;
48574 };
48575
48576 // Make sure we have SETCC nodes, using the same flags value.
48577 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48578 SetCC1.getOpcode() != X86ISD::SETCC ||
48579 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48580 return false;
48581
48582 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48583 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48584 Flags = SetCC0->getOperand(1);
48585 return true;
48586}
48587
48588// When legalizing carry, we create carries via add X, -1
48589// If that comes from an actual carry, via setcc, we use the
48590// carry directly.
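// Worked example (illustrative): if X is an i8 holding a 0/1 setcc result,
// "add X, -1" sets CF exactly when X == 1 (1 + 0xFF carries out, 0 + 0xFF does
// not), so a COND_B user can consume the original carry directly.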
48591 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48592 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48593 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48594 bool FoundAndLSB = false;
48595 SDValue Carry = EFLAGS.getOperand(0);
48596 while (Carry.getOpcode() == ISD::TRUNCATE ||
48597 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48598 (Carry.getOpcode() == ISD::AND &&
48599 isOneConstant(Carry.getOperand(1)))) {
48600 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48601 Carry = Carry.getOperand(0);
48602 }
48603 if (Carry.getOpcode() == X86ISD::SETCC ||
48604 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48605 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48606 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48607 SDValue CarryOp1 = Carry.getOperand(1);
48608 if (CarryCC == X86::COND_B)
48609 return CarryOp1;
48610 if (CarryCC == X86::COND_A) {
48611 // Try to convert COND_A into COND_B in an attempt to facilitate
48612 // materializing "setb reg".
48613 //
48614 // Do not flip "e > c", where "c" is a constant, because the CMP
48615 // instruction cannot take an immediate as its first operand.
48616 //
48617 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48618 CarryOp1.getNode()->hasOneUse() &&
48619 CarryOp1.getValueType().isInteger() &&
48620 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48621 SDValue SubCommute =
48622 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48623 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48624 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48625 }
48626 }
48627 // If this is a check of the z flag of an add with 1, switch to the
48628 // C flag.
48629 if (CarryCC == X86::COND_E &&
48630 CarryOp1.getOpcode() == X86ISD::ADD &&
48631 isOneConstant(CarryOp1.getOperand(1)))
48632 return CarryOp1;
48633 } else if (FoundAndLSB) {
48634 SDLoc DL(Carry);
48635 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48636 if (Carry.getOpcode() == ISD::SRL) {
48637 BitNo = Carry.getOperand(1);
48638 Carry = Carry.getOperand(0);
48639 }
48640 return getBT(Carry, BitNo, DL, DAG);
48641 }
48642 }
48643 }
48644
48645 return SDValue();
48646}
48647
48648/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48649/// to avoid the inversion.
48650 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48651 SelectionDAG &DAG,
48652 const X86Subtarget &Subtarget) {
48653 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48654 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48655 EFLAGS.getOpcode() != X86ISD::TESTP)
48656 return SDValue();
48657
48658 // PTEST/TESTP sets EFLAGS as:
48659 // TESTZ: ZF = (Op0 & Op1) == 0
48660 // TESTC: CF = (~Op0 & Op1) == 0
48661 // TESTNZC: ZF == 0 && CF == 0
48662 MVT VT = EFLAGS.getSimpleValueType();
48663 SDValue Op0 = EFLAGS.getOperand(0);
48664 SDValue Op1 = EFLAGS.getOperand(1);
48665 MVT OpVT = Op0.getSimpleValueType();
48666 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48667
48668 // TEST*(~X,Y) == TEST*(X,Y)
48669 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48670 X86::CondCode InvCC;
48671 switch (CC) {
48672 case X86::COND_B:
48673 // testc -> testz.
48674 InvCC = X86::COND_E;
48675 break;
48676 case X86::COND_AE:
48677 // !testc -> !testz.
48678 InvCC = X86::COND_NE;
48679 break;
48680 case X86::COND_E:
48681 // testz -> testc.
48682 InvCC = X86::COND_B;
48683 break;
48684 case X86::COND_NE:
48685 // !testz -> !testc.
48686 InvCC = X86::COND_AE;
48687 break;
48688 case X86::COND_A:
48689 case X86::COND_BE:
48690 // testnzc -> testnzc (no change).
48691 InvCC = CC;
48692 break;
48693 default:
48694 InvCC = X86::COND_INVALID;
48695 break;
48696 }
48697
48698 if (InvCC != X86::COND_INVALID) {
48699 CC = InvCC;
48700 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48701 DAG.getBitcast(OpVT, NotOp0), Op1);
48702 }
48703 }
48704
48705 if (CC == X86::COND_B || CC == X86::COND_AE) {
48706 // TESTC(X,~X) == TESTC(X,-1)
48707 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48708 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48709 SDLoc DL(EFLAGS);
48710 return DAG.getNode(
48711 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48712 DAG.getBitcast(OpVT,
48713 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48714 }
48715 }
48716 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48717 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48718 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48719 SDValue BC0 = peekThroughBitcasts(Op0);
48720 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48721 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48722 SDLoc DL(EFLAGS);
48723 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48724 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48725 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48726 }
48727 }
48728 }
48729
48730 if (CC == X86::COND_E || CC == X86::COND_NE) {
48731 // TESTZ(X,~Y) == TESTC(Y,X)
48732 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48733 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48734 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48735 DAG.getBitcast(OpVT, NotOp1), Op0);
48736 }
48737
48738 if (Op0 == Op1) {
48739 SDValue BC = peekThroughBitcasts(Op0);
48740 EVT BCVT = BC.getValueType();
48741
48742 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48743 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48744 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48745 DAG.getBitcast(OpVT, BC.getOperand(0)),
48746 DAG.getBitcast(OpVT, BC.getOperand(1)));
48747 }
48748
48749 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48750 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48751 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48752 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48753 DAG.getBitcast(OpVT, BC.getOperand(0)),
48754 DAG.getBitcast(OpVT, BC.getOperand(1)));
48755 }
48756
48757 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48758 // to more efficiently extract the sign bits and compare that.
48759 // TODO: Handle TESTC with comparison inversion.
48760 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48761 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
48762 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48763 unsigned EltBits = BCVT.getScalarSizeInBits();
48764 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48765 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48766 APInt SignMask = APInt::getSignMask(EltBits);
48767 if (SDValue Res =
48768 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48769 // For vXi16 cases we need to use pmovmskb and extract every other
48770 // sign bit.
48771 SDLoc DL(EFLAGS);
48772 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48773 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48774 MVT FloatVT =
48775 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48776 Res = DAG.getBitcast(FloatVT, Res);
48777 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48778 } else if (EltBits == 16) {
48779 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48780 Res = DAG.getBitcast(MovmskVT, Res);
48781 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48782 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48783 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48784 } else {
48785 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48786 }
48787 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48788 DAG.getConstant(0, DL, MVT::i32));
48789 }
48790 }
48791 }
48792 }
48793
48794 // TESTZ(-1,X) == TESTZ(X,X)
48795 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48796 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48797
48798 // TESTZ(X,-1) == TESTZ(X,X)
48799 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48800 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48801
48802 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48803 // TODO: Add COND_NE handling?
48804 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48805 SDValue Src0 = peekThroughBitcasts(Op0);
48806 SDValue Src1 = peekThroughBitcasts(Op1);
48807 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48808 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48809 peekThroughBitcasts(Src0.getOperand(1)), true);
48810 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48811 peekThroughBitcasts(Src1.getOperand(1)), true);
48812 if (Src0 && Src1) {
48813 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48814 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48815 DAG.getBitcast(OpVT2, Src0),
48816 DAG.getBitcast(OpVT2, Src1));
48817 }
48818 }
48819 }
48820 }
48821
48822 return SDValue();
48823}
48824
48825// Attempt to simplify the MOVMSK input based on the comparison type.
48826 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48827 SelectionDAG &DAG,
48828 const X86Subtarget &Subtarget) {
48829 // Handle eq/ne against zero (any_of).
48830 // Handle eq/ne against -1 (all_of).
48831 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48832 return SDValue();
48833 if (EFLAGS.getValueType() != MVT::i32)
48834 return SDValue();
48835 unsigned CmpOpcode = EFLAGS.getOpcode();
48836 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48837 return SDValue();
48838 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48839 if (!CmpConstant)
48840 return SDValue();
48841 const APInt &CmpVal = CmpConstant->getAPIntValue();
48842
48843 SDValue CmpOp = EFLAGS.getOperand(0);
48844 unsigned CmpBits = CmpOp.getValueSizeInBits();
48845 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48846
48847 // Peek through any truncate.
48848 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48849 CmpOp = CmpOp.getOperand(0);
48850
48851 // Bail if we don't find a MOVMSK.
48852 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48853 return SDValue();
48854
48855 SDValue Vec = CmpOp.getOperand(0);
48856 MVT VecVT = Vec.getSimpleValueType();
48857 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48858 "Unexpected MOVMSK operand");
48859 unsigned NumElts = VecVT.getVectorNumElements();
48860 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48861
48862 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48863 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48864 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48865 if (!IsAnyOf && !IsAllOf)
48866 return SDValue();
48867
48868 // TODO: Check more cases where this combine applies.
48869 // We use the use count of the CMP operand to decide whether to combine.
48870 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" patterns
48871 // have test coverage and are limited by this one-use constraint.
48872 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48873
48874 // See if we can peek through to a vector with a wider element type, if the
48875 // signbits extend down to all the sub-elements as well.
48876 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48877 // potential SimplifyDemandedBits/Elts cases.
48878 // If we looked through a truncate that discards bits, we can't do this
48879 // transform.
48880 // FIXME: We could do this transform for truncates that discarded bits by
48881 // inserting an AND mask between the new MOVMSK and the CMP.
48882 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48883 SDValue BC = peekThroughBitcasts(Vec);
48884 MVT BCVT = BC.getSimpleValueType();
48885 unsigned BCNumElts = BCVT.getVectorNumElements();
48886 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48887 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48888 BCNumEltBits > NumEltBits &&
48889 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48890 SDLoc DL(EFLAGS);
48891 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48892 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48893 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48894 DAG.getConstant(CmpMask, DL, MVT::i32));
48895 }
48896 }
48897
48898 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48899 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48900 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48901 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
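// Rationale sketch (illustrative): MOVMSK only collects sign bits, so for a
// 256-bit CONCAT the "any sign bit set" test is equivalent to OR'ing the two
// 128-bit halves first, and the "all sign bits set" test to AND'ing them,
// letting the comparison run on a single narrower MOVMSK.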
48902 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48903 SmallVector<SDValue> Ops;
48904 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48905 Ops.size() == 2) {
48906 SDLoc DL(EFLAGS);
48907 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48908 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48909 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48910 DAG.getBitcast(SubVT, Ops[0]),
48911 DAG.getBitcast(SubVT, Ops[1]));
48912 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48913 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48914 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48915 DAG.getConstant(CmpMask, DL, MVT::i32));
48916 }
48917 }
48918
48919 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48920 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48921 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48922 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48923 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48924 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48925 SDValue BC = peekThroughBitcasts(Vec);
48926 // Ensure MOVMSK was testing every signbit of BC.
48927 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48928 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48929 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48930 BC.getOperand(0), BC.getOperand(1));
48931 V = DAG.getBitcast(TestVT, V);
48932 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48933 }
48934 // Check for 256-bit split vector cases.
48935 if (BC.getOpcode() == ISD::AND &&
48936 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48937 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48938 SDValue LHS = BC.getOperand(0);
48939 SDValue RHS = BC.getOperand(1);
48940 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48941 LHS.getOperand(0), LHS.getOperand(1));
48942 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48943 RHS.getOperand(0), RHS.getOperand(1));
48944 LHS = DAG.getBitcast(TestVT, LHS);
48945 RHS = DAG.getBitcast(TestVT, RHS);
48946 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48947 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48948 }
48949 }
48950 }
48951
48952 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
48953 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
48954 // sign bits prior to the comparison with zero unless we know that
48955 // the vXi16 splats the sign bit down to the lower i8 half.
48956 // TODO: Handle all_of patterns.
48957 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
48958 SDValue VecOp0 = Vec.getOperand(0);
48959 SDValue VecOp1 = Vec.getOperand(1);
48960 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
48961 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
48962 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48963 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
48964 SDLoc DL(EFLAGS);
48965 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
48966 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48967 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
48968 if (!SignExt0) {
48969 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
48970 DAG.getConstant(0xAAAA, DL, MVT::i16));
48971 }
48972 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48973 DAG.getConstant(0, DL, MVT::i16));
48974 }
48975 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
48976 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48977 if (CmpBits >= 16 && Subtarget.hasInt256() &&
48978 (IsAnyOf || (SignExt0 && SignExt1))) {
48979 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
48980 SDLoc DL(EFLAGS);
48981 SDValue Result = peekThroughBitcasts(Src);
48982 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
48983 Result.getValueType().getVectorNumElements() <= NumElts) {
48984 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
48985 Result.getOperand(0), Result.getOperand(1));
48986 V = DAG.getBitcast(MVT::v4i64, V);
48987 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48988 }
48989 Result = DAG.getBitcast(MVT::v32i8, Result);
48990 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48991 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
48992 if (!SignExt0 || !SignExt1) {
48993 assert(IsAnyOf &&
48994 "Only perform v16i16 signmasks for any_of patterns");
48995 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
48996 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48997 }
48998 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48999 DAG.getConstant(CmpMask, DL, MVT::i32));
49000 }
49001 }
49002 }
49003
49004 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49005 // Since we peek through a bitcast, we need to be careful if the base vector
49006 // type has smaller elements than the MOVMSK type. In that case, even if
49007 // all the elements are demanded by the shuffle mask, only the "high"
49008 // elements which have highbits that align with highbits in the MOVMSK vec
49009 // elements are actually demanded. A simplification of spurious operations
49010 // on the "low" elements takes place during other simplifications.
49011 //
49012 // For example:
49013 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
49014 // demanded, the result can change because the shuffle swaps elements around.
49015 //
49016 // To address this, we check that we can scale the shuffle mask to MOVMSK
49017 // element width (this will ensure the "high" elements match). It's slightly
49018 // overly conservative, but fine for an edge case fold.
49019 SmallVector<int, 32> ShuffleMask;
49020 SmallVector<SDValue, 2> ShuffleInputs;
49021 if (NumElts <= CmpBits &&
49022 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49023 ShuffleMask, DAG) &&
49024 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49025 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49026 canScaleShuffleElements(ShuffleMask, NumElts)) {
49027 SDLoc DL(EFLAGS);
49028 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49029 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49030 Result =
49031 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49032 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49033 }
49034
49035 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49036 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49037 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49038 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49039 // iff every element is referenced.
49040 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49041 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49042 (NumEltBits == 32 || NumEltBits == 64)) {
49043 SDLoc DL(EFLAGS);
49044 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49045 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49046 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49047 SDValue LHS = Vec;
49048 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49049 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49050 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49051 DAG.getBitcast(FloatVT, LHS),
49052 DAG.getBitcast(FloatVT, RHS));
49053 }
49054
49055 return SDValue();
49056}
49057
49058/// Optimize an EFLAGS definition used according to the condition code \p CC
49059/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49060/// uses of chain values.
49061 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49062 SelectionDAG &DAG,
49063 const X86Subtarget &Subtarget) {
49064 if (CC == X86::COND_B)
49065 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49066 return Flags;
49067
49068 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49069 return R;
49070
49071 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49072 return R;
49073
49074 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49075 return R;
49076
49077 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49078 return R;
49079
49080 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49081}
49082
49083/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49084 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49085 TargetLowering::DAGCombinerInfo &DCI,
49086 const X86Subtarget &Subtarget) {
49087 SDLoc DL(N);
49088 EVT VT = N->getValueType(0);
49089 SDValue FalseOp = N->getOperand(0);
49090 SDValue TrueOp = N->getOperand(1);
49091 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49092 SDValue Cond = N->getOperand(3);
49093
49094 // cmov X, X, ?, ? --> X
49095 if (TrueOp == FalseOp)
49096 return TrueOp;
49097
49098 // Try to simplify the EFLAGS and condition code operands.
49099 // We can't always do this as FCMOV only supports a subset of X86 cond.
49100 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49101 if (!(FalseOp.getValueType() == MVT::f80 ||
49102 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49103 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49104 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49105 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49106 Flags};
49107 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49108 }
49109 }
49110
49111 // If this is a select between two integer constants, try to do some
49112 // optimizations. Note that the operands are ordered the opposite of SELECT
49113 // operands.
49114 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49115 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49116 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49117 // larger than FalseC (the false value).
49118 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49119 CC = X86::GetOppositeBranchCondition(CC);
49120 std::swap(TrueC, FalseC);
49121 std::swap(TrueOp, FalseOp);
49122 }
49123
49124 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49125 // This is efficient for any integer data type (including i8/i16) and
49126 // shift amount.
49127 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49128 Cond = getSETCC(CC, Cond, DL, DAG);
49129
49130 // Zero extend the condition if needed.
49131 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49132
49133 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49134 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49135 DAG.getConstant(ShAmt, DL, MVT::i8));
49136 return Cond;
49137 }
49138
49139 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
49140 // for any integer data type, including i8/i16.
49141 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49142 Cond = getSETCC(CC, Cond, DL, DAG);
49143
49144 // Zero extend the condition if needed.
49145 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49146 FalseC->getValueType(0), Cond);
49147 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49148 SDValue(FalseC, 0));
49149 return Cond;
49150 }
49151
49152 // Optimize cases that will turn into an LEA instruction. This requires
49153 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49154 if (VT == MVT::i32 || VT == MVT::i64) {
49155 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49156 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49157 "Implicit constant truncation");
49158
49159 bool isFastMultiplier = false;
49160 if (Diff.ult(10)) {
49161 switch (Diff.getZExtValue()) {
49162 default: break;
49163 case 1: // result = add base, cond
49164 case 2: // result = lea base( , cond*2)
49165 case 3: // result = lea base(cond, cond*2)
49166 case 4: // result = lea base( , cond*4)
49167 case 5: // result = lea base(cond, cond*4)
49168 case 8: // result = lea base( , cond*8)
49169 case 9: // result = lea base(cond, cond*8)
49170 isFastMultiplier = true;
49171 break;
49172 }
49173 }
49174
49175 if (isFastMultiplier) {
49176 Cond = getSETCC(CC, Cond, DL, DAG);
49177 // Zero extend the condition if needed.
49178 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49179 Cond);
49180 // Scale the condition by the difference.
49181 if (Diff != 1)
49182 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49183 DAG.getConstant(Diff, DL, Cond.getValueType()));
49184
49185 // Add the base if non-zero.
49186 if (FalseC->getAPIntValue() != 0)
49187 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49188 SDValue(FalseC, 0));
49189 return Cond;
49190 }
49191 }
49192 }
49193 }
49194
49195 // Handle these cases:
49196 // (select (x != c), e, c) -> (select (x != c), e, x),
49197 // (select (x == c), c, e) -> (select (x == c), x, e)
49198 // where the c is an integer constant, and the "select" is the combination
49199 // of CMOV and CMP.
49200 //
49201 // The rationale for this change is that the conditional-move from a constant
49202 // needs two instructions, however, conditional-move from a register needs
49203 // only one instruction.
49204 //
49205 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49206 // some instruction-combining opportunities. This opt needs to be
49207 // postponed as late as possible.
49208 //
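// Example (illustrative): for "y = (x == 7) ? 7 : e", the constant 7 would
// first have to be materialized in a register because CMOV has no immediate
// form; since the compare guarantees x == 7 on the taken path, the CMOV can
// read x instead and save that extra instruction.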
49209 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49210 // the DCI.xxxx conditions are provided to postpone the optimization as
49211 // late as possible.
49212
49213 ConstantSDNode *CmpAgainst = nullptr;
49214 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49215 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49216 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49217
49218 if (CC == X86::COND_NE &&
49219 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49220 CC = X86::COND_E;
49221 std::swap(TrueOp, FalseOp);
49222 }
49223
49224 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49225 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49226 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49227 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49228 }
49229 }
49230 }
49231
49232 // Transform:
49233 //
49234 // (cmov 1 T (uge T 2))
49235 //
49236 // to:
49237 //
49238 // (adc T 0 (sub T 1))
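// Sanity check of the transform (illustrative), treating T as unsigned:
//   T == 0: sub T,1 borrows (CF=1), adc T,0 gives 0+0+1 = 1
//   T == 1: no borrow (CF=0), adc gives 1+0+0 = 1
//   T >= 2: no borrow (CF=0), adc gives T
// which matches (cmov 1 T (uge T 2)).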
49239 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49240 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49241 SDValue Cond0 = Cond.getOperand(0);
49242 if (Cond0.getOpcode() == ISD::TRUNCATE)
49243 Cond0 = Cond0.getOperand(0);
49244 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49245 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49246 EVT CondVT = Cond->getValueType(0);
49247 // Subtract 1 and generate a carry.
49248 SDValue NewSub =
49249 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49250 DAG.getConstant(1, DL, CondVT));
49251 SDValue EFLAGS(NewSub.getNode(), 1);
49252 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49253 DAG.getConstant(0, DL, VT), EFLAGS);
49254 }
49255 }
49256
49257 // Fold and/or of setcc's to double CMOV:
49258 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49259 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49260 //
49261 // This combine lets us generate:
49262 // cmovcc1 (jcc1 if we don't have CMOV)
49263 // cmovcc2 (same)
49264 // instead of:
49265 // setcc1
49266 // setcc2
49267 // and/or
49268 // cmovne (jne if we don't have CMOV)
49269 // When we can't use the CMOV instruction, it might increase branch
49270 // mispredicts.
49271 // When we can use CMOV, or when there is no mispredict, this improves
49272 // throughput and reduces register pressure.
49273 //
49274 if (CC == X86::COND_NE) {
49275 SDValue Flags;
49276 X86::CondCode CC0, CC1;
49277 bool isAndSetCC;
49278 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49279 if (isAndSetCC) {
49280 std::swap(FalseOp, TrueOp);
49281 CC0 = X86::GetOppositeBranchCondition(CC0);
49282 CC1 = X86::GetOppositeBranchCondition(CC1);
49283 }
49284
49285 SDValue LOps[] = {FalseOp, TrueOp,
49286 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49287 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49288 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49289 Flags};
49290 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49291 return CMOV;
49292 }
49293 }
49294
49295 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49296 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49297 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49298 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49299 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49300 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
49301 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49302 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49303 SDValue Add = TrueOp;
49304 SDValue Const = FalseOp;
49305 // Canonicalize the condition code for easier matching and output.
49306 if (CC == X86::COND_E)
49307 std::swap(Add, Const);
49308
49309 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49310 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49311 Add.getResNo() == 0 && Add.hasOneUse() &&
49312 Add.getOperand(1) == Cond.getOperand(0)) {
49313 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49314 Add.getOperand(1));
49315 }
49316
49317 // We might have replaced the constant in the cmov with the LHS of the
49318 // compare. If so change it to the RHS of the compare.
49319 if (Const == Cond.getOperand(0))
49320 Const = Cond.getOperand(1);
49321
49322 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49323 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49324 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49325 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49326 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49327 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49328 // This should constant fold.
49329 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49330 SDValue CMov =
49331 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49332 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49333 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49334 }
49335 }
49336
49337 return SDValue();
49338}
49339
49340/// Different mul shrinking modes.
49341 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49342
49343 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49344 EVT VT = N->getOperand(0).getValueType();
49345 if (VT.getScalarSizeInBits() != 32)
49346 return false;
49347
49348 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49349 unsigned SignBits[2] = {1, 1};
49350 bool IsPositive[2] = {false, false};
49351 for (unsigned i = 0; i < 2; i++) {
49352 SDValue Opd = N->getOperand(i);
49353
49354 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49355 IsPositive[i] = DAG.SignBitIsZero(Opd);
49356 }
49357
49358 bool AllPositive = IsPositive[0] && IsPositive[1];
49359 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49360 // When ranges are from -128 ~ 127, use MULS8 mode.
49361 if (MinSignBits >= 25)
49362 Mode = ShrinkMode::MULS8;
49363 // When ranges are from 0 ~ 255, use MULU8 mode.
49364 else if (AllPositive && MinSignBits >= 24)
49365 Mode = ShrinkMode::MULU8;
49366 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49367 else if (MinSignBits >= 17)
49368 Mode = ShrinkMode::MULS16;
49369 // When ranges are from 0 ~ 65535, use MULU16 mode.
49370 else if (AllPositive && MinSignBits >= 16)
49371 Mode = ShrinkMode::MULU16;
49372 else
49373 return false;
49374 return true;
49375}
49376
49377/// When the operands of vector mul are extended from smaller size values,
49378 /// like i8 and i16, the type of mul may be shrunk to generate more
49379/// efficient code. Two typical patterns are handled:
49380/// Pattern1:
49381/// %2 = sext/zext <N x i8> %1 to <N x i32>
49382/// %4 = sext/zext <N x i8> %3 to <N x i32>
49383 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49384/// %5 = mul <N x i32> %2, %4
49385///
49386/// Pattern2:
49387/// %2 = zext/sext <N x i16> %1 to <N x i32>
49388/// %4 = zext/sext <N x i16> %3 to <N x i32>
49389/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49390/// %5 = mul <N x i32> %2, %4
49391///
49392/// There are four mul shrinking modes:
49393/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49394 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49395/// generate pmullw+sext32 for it (MULS8 mode).
49396/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49397/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49398/// generate pmullw+zext32 for it (MULU8 mode).
49399/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49400/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49401/// generate pmullw+pmulhw for it (MULS16 mode).
49402/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49403/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49404/// generate pmullw+pmulhuw for it (MULU16 mode).
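/// For instance (illustrative), in MULU16 mode a <8 x i32> multiply whose
/// operands are both zero-extended from <8 x i16> is lowered as one pmullw
/// (low 16 bits of each product) plus one pmulhuw (high 16 bits), and the two
/// halves are re-interleaved with punpcklwd/punpckhwd to rebuild the i32
/// results, as the code below does.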
49405 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49406 const X86Subtarget &Subtarget) {
49407 // Check for legality
49408 // pmullw/pmulhw are not available before SSE2.
49409 if (!Subtarget.hasSSE2())
49410 return SDValue();
49411
49412 // Check for profitability
49413 // pmulld is supported since SSE41. It is better to use pmulld
49414 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49415 // the expansion.
49416 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49417 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49418 return SDValue();
49419
49420 ShrinkMode Mode;
49421 if (!canReduceVMulWidth(N, DAG, Mode))
49422 return SDValue();
49423
49424 SDValue N0 = N->getOperand(0);
49425 SDValue N1 = N->getOperand(1);
49426 EVT VT = N->getOperand(0).getValueType();
49427 unsigned NumElts = VT.getVectorNumElements();
49428 if ((NumElts % 2) != 0)
49429 return SDValue();
49430
49431 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49432
49433 // Shrink the operands of mul.
49434 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49435 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49436
49437 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49438 // lower part is needed.
49439 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49440 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49441 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49442 : ISD::SIGN_EXTEND,
49443 DL, VT, MulLo);
49444
49445 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49446 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49447 // the higher part is also needed.
49448 SDValue MulHi =
49449 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49450 ReducedVT, NewN0, NewN1);
49451
49452 // Repack the lower part and higher part result of mul into a wider
49453 // result.
49454 // Generate shuffle functioning as punpcklwd.
49455 SmallVector<int, 16> ShuffleMask(NumElts);
49456 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49457 ShuffleMask[2 * i] = i;
49458 ShuffleMask[2 * i + 1] = i + NumElts;
49459 }
49460 SDValue ResLo =
49461 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49462 ResLo = DAG.getBitcast(ResVT, ResLo);
49463 // Generate shuffle functioning as punpckhwd.
49464 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49465 ShuffleMask[2 * i] = i + NumElts / 2;
49466 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49467 }
49468 SDValue ResHi =
49469 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49470 ResHi = DAG.getBitcast(ResVT, ResHi);
49471 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49472}
49473
49474 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49475 EVT VT, const SDLoc &DL) {
49476
49477 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49478 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49479 DAG.getConstant(Mult, DL, VT));
49480 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49481 DAG.getConstant(Shift, DL, MVT::i8));
49482 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49483 N->getOperand(0));
49484 return Result;
49485 };
49486
49487 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49488 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49489 DAG.getConstant(Mul1, DL, VT));
49490 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49491 DAG.getConstant(Mul2, DL, VT));
49492 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49493 N->getOperand(0));
49494 return Result;
49495 };
49496
49497 switch (MulAmt) {
49498 default:
49499 break;
49500 case 11:
49501 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49502 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49503 case 21:
49504 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49505 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49506 case 41:
49507 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49508 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49509 case 22:
49510 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49511 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49512 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49513 case 19:
49514 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49515 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49516 case 37:
49517 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49518 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49519 case 73:
49520 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49521 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49522 case 13:
49523 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49524 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49525 case 23:
49526 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49527 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49528 case 26:
49529 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49530 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49531 case 28:
49532 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49533 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49534 case 29:
49535 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49536 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49537 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49538 }
49539
49540 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
49541 // by a single LEA.
49542 // First check if this is a sum of two powers of 2 because that's easy. Then
49543 // count the trailing zeros to find the smaller power of 2.
49544 // TODO: We can do this even without LEA at a cost of two shifts and an add.
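// Worked example (illustrative): MulAmt = 20 = 16 + 4. countr_zero(20) = 2 and
// 20 & 19 = 16, so the combine emits (x << 4) + (x << 2); the second shift can
// be folded into the LEA scale, e.g. "leal (%t,%x,4)" with t = x << 4.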
49545 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49546 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49547 if (ScaleShift >= 1 && ScaleShift < 4) {
49548 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49549 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49550 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49551 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49552 DAG.getConstant(ScaleShift, DL, MVT::i8));
49553 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49554 }
49555 }
49556
49557 return SDValue();
49558}
49559
49560// If the upper 17 bits of either element are zero and the upper bits of the
49561// other element are all zero/sign bits, then we can use PMADDWD, which is
49562// always at least as quick as PMULLD, except on KNL.
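// PMADDWD multiplies pairs of signed i16 elements and sums adjacent products
// into i32, so when both inputs behave as sign-extended i16 values and at
// least one has its upper 17 bits clear, the high-half product term is zero
// and the low 16x16 signed multiply already equals the full i32 result.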
49563static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49564 SelectionDAG &DAG,
49565 const X86Subtarget &Subtarget) {
49566 if (!Subtarget.hasSSE2())
49567 return SDValue();
49568
49569 if (Subtarget.isPMADDWDSlow())
49570 return SDValue();
49571
49572 EVT VT = N->getValueType(0);
49573
49574 // Only support vXi32 vectors.
49575 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49576 return SDValue();
49577
49578 // Make sure the type is legal or can split/widen to a legal type.
49579 // With AVX512 but without BWI, we would need to split v32i16.
49580 unsigned NumElts = VT.getVectorNumElements();
49581 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49582 return SDValue();
49583
49584 // With AVX512 but without BWI, we would need to split v32i16.
49585 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49586 return SDValue();
49587
49588 SDValue N0 = N->getOperand(0);
49589 SDValue N1 = N->getOperand(1);
49590
49591 // If we are zero/sign extending two steps without SSE4.1, it's better to
49592 // reduce the vmul width instead.
49593 if (!Subtarget.hasSSE41() &&
49594 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49595 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49596 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49597 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49598 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49599 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49600 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49601 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49602 return SDValue();
49603
49604 // If we are sign extending a wide vector without SSE4.1, it's better to
49605 // reduce the vmul width instead.
49606 if (!Subtarget.hasSSE41() &&
49607 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49608 N0.getOperand(0).getValueSizeInBits() > 128) &&
49609 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49610 N1.getOperand(0).getValueSizeInBits() > 128))
49611 return SDValue();
49612
49613 // Sign bits must extend down to the lowest i16.
49614 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49615 DAG.ComputeMaxSignificantBits(N0) > 16)
49616 return SDValue();
49617
49618 // At least one of the elements must be zero in the upper 17 bits, or can be
49619 // safely made zero without altering the final result.
49620 auto GetZeroableOp = [&](SDValue Op) {
49621 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49622 if (DAG.MaskedValueIsZero(Op, Mask17))
49623 return Op;
49624 // Mask off upper 16-bits of sign-extended constants.
49625 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49626 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49627 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49628 SDValue Src = Op.getOperand(0);
49629 // Convert sext(vXi16) to zext(vXi16).
49630 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49631 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49632 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49633 // which will expand the extension.
49634 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49635 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49636 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49637 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49638 }
49639 }
49640 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
49641 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49642 N->isOnlyUserOf(Op.getNode())) {
49643 SDValue Src = Op.getOperand(0);
49644 if (Src.getScalarValueSizeInBits() == 16)
49645 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49646 }
49647 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49648 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49649 N->isOnlyUserOf(Op.getNode())) {
49650 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49651 Op.getOperand(1));
49652 }
49653 return SDValue();
49654 };
49655 SDValue ZeroN0 = GetZeroableOp(N0);
49656 SDValue ZeroN1 = GetZeroableOp(N1);
49657 if (!ZeroN0 && !ZeroN1)
49658 return SDValue();
49659 N0 = ZeroN0 ? ZeroN0 : N0;
49660 N1 = ZeroN1 ? ZeroN1 : N1;
49661
49662 // Use SplitOpsAndApply to handle AVX splitting.
49663 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49664 ArrayRef<SDValue> Ops) {
49665 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49666 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49667 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49668 DAG.getBitcast(OpVT, Ops[0]),
49669 DAG.getBitcast(OpVT, Ops[1]));
49670 };
49671 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49672}
49673
49674static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49675 const X86Subtarget &Subtarget) {
49676 if (!Subtarget.hasSSE2())
49677 return SDValue();
49678
49679 EVT VT = N->getValueType(0);
49680
49681 // Only support vXi64 vectors.
49682 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49683 VT.getVectorNumElements() < 2 ||
49684 !isPowerOf2_32(VT.getVectorNumElements()))
49685 return SDValue();
49686
49687 SDValue N0 = N->getOperand(0);
49688 SDValue N1 = N->getOperand(1);
49689
49690 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49691 // 32 bits. We can lower with this if the sign bits stretch that far.
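 // For example, if both operands are i64 elements sign-extended from i32, the
 // exact 64-bit product of their low 32-bit halves (what PMULDQ computes)
 // equals the full i64 multiply.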
49692 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49693 DAG.ComputeNumSignBits(N1) > 32) {
49694 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49695 ArrayRef<SDValue> Ops) {
49696 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49697 };
49698 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49699 /*CheckBWI*/ false);
49700 }
49701
49702 // If the upper bits are zero we can use a single pmuludq.
49703 APInt Mask = APInt::getHighBitsSet(64, 32);
49704 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49705 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49706 ArrayRef<SDValue> Ops) {
49707 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49708 };
49709 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49710 /*CheckBWI*/ false);
49711 }
49712
49713 return SDValue();
49714}
49715
49716static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49717 TargetLowering::DAGCombinerInfo &DCI,
49718 const X86Subtarget &Subtarget) {
49719 EVT VT = N->getValueType(0);
49720 SDLoc DL(N);
49721
49722 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49723 return V;
49724
49725 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49726 return V;
49727
49728 if (DCI.isBeforeLegalize() && VT.isVector())
49729 return reduceVMULWidth(N, DL, DAG, Subtarget);
49730
49731 if (VT != MVT::i64 && VT != MVT::i32 &&
49732 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49733 return SDValue();
49734
49735 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49736 if (!Known1.isConstant())
49737 return SDValue();
49738
49739 const APInt &C = Known1.getConstant();
49740 if (C.isZero())
49741 return DAG.getConstant(0, DL, VT);
49742
49743 if (C.isAllOnes())
49744 return DAG.getNegative(N->getOperand(0), DL, VT);
49745
49746 if (isPowerOf2_64(C.getZExtValue()))
49747 return SDValue();
49748
49749 // Optimize a single multiply with constant into two operations in order to
49750 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49751 if (!MulConstantOptimization)
49752 return SDValue();
49753
49754 // An imul is usually smaller than the alternative sequence.
49755 if (DAG.getMachineFunction().getFunction().hasMinSize())
49756 return SDValue();
49757
49758 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49759 return SDValue();
49760
49761 int64_t SignMulAmt = C.getSExtValue();
49762 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49763 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49764
49765 SDValue NewMul = SDValue();
49766 if (VT == MVT::i64 || VT == MVT::i32) {
49767 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49768 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49769 DAG.getConstant(AbsMulAmt, DL, VT));
49770 if (SignMulAmt < 0)
49771 NewMul = DAG.getNegative(NewMul, DL, VT);
49772
49773 return NewMul;
49774 }
49775
49776 uint64_t MulAmt1 = 0;
49777 uint64_t MulAmt2 = 0;
49778 if ((AbsMulAmt % 9) == 0) {
49779 MulAmt1 = 9;
49780 MulAmt2 = AbsMulAmt / 9;
49781 } else if ((AbsMulAmt % 5) == 0) {
49782 MulAmt1 = 5;
49783 MulAmt2 = AbsMulAmt / 5;
49784 } else if ((AbsMulAmt % 3) == 0) {
49785 MulAmt1 = 3;
49786 MulAmt2 = AbsMulAmt / 3;
49787 }
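 // For example, AbsMulAmt == 45 factors as 9 * 5 (two LEA-style multiplies),
 // and AbsMulAmt == 40 factors as 5 * 8 (an LEA-style multiply plus a shift
 // by 3).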
49788
49789 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49790 if (MulAmt2 &&
49791 (isPowerOf2_64(MulAmt2) ||
49792 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49793
49794 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49795 N->user_begin()->getOpcode() == ISD::ADD))
49796 // If the second multiplier is pow2, issue it first. We want the multiply
49797 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49798 // use is an add. Only do this for positive multiply amounts since the
49799 // negate would prevent it from being used as an address mode anyway.
49800 std::swap(MulAmt1, MulAmt2);
49801
49802 if (isPowerOf2_64(MulAmt1))
49803 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49804 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49805 else
49806 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49807 DAG.getConstant(MulAmt1, DL, VT));
49808
49809 if (isPowerOf2_64(MulAmt2))
49810 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49811 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49812 else
49813 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49814 DAG.getConstant(MulAmt2, DL, VT));
49815
49816 // Negate the result.
49817 if (SignMulAmt < 0)
49818 NewMul = DAG.getNegative(NewMul, DL, VT);
49819 } else if (!Subtarget.slowLEA())
49820 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49821 }
49822 if (!NewMul) {
49823 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49824 if (isPowerOf2_64(AbsMulAmt - 1)) {
49825 // (mul x, 2^N + 1) => (add (shl x, N), x)
49826 NewMul = DAG.getNode(
49827 ISD::ADD, DL, VT, N->getOperand(0),
49828 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49829 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49830 if (SignMulAmt < 0)
49831 NewMul = DAG.getNegative(NewMul, DL, VT);
49832 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49833 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49834 NewMul =
49835 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49836 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49837 // To negate, reverse the operands of the subtract.
49838 if (SignMulAmt < 0)
49839 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49840 else
49841 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49842 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49843 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49844 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49845 NewMul =
49846 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49847 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49848 NewMul = DAG.getNode(
49849 ISD::ADD, DL, VT, NewMul,
49850 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49851 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49852 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49853 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49854 NewMul =
49855 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49856 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49857 NewMul = DAG.getNode(
49858 ISD::SUB, DL, VT, NewMul,
49859 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49860 } else if (SignMulAmt >= 0 && VT.isVector() &&
49861 Subtarget.fastImmVectorShift()) {
49862 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49863 uint64_t ShiftAmt1;
49864 std::optional<unsigned> Opc;
49865 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49866 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49867 Opc = ISD::ADD;
49868 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49869 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49870 Opc = ISD::SUB;
49871 }
49872
49873 if (Opc) {
49874 SDValue Shift1 =
49875 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49876 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49877 SDValue Shift2 =
49878 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49879 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49880 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49881 }
49882 }
49883 }
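 // Worked examples for the fallback above: mul x, 17 -> (x << 4) + x,
 // mul x, 127 -> (x << 7) - x, and (for vector types on subtargets with fast
 // immediate vector shifts) mul x, 34 -> (x << 5) + (x + x).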
49884
49885 return NewMul;
49886}
49887
49888// Try to form a MULHU or MULHS node by looking for
49889// (srl (mul ext, ext), 16)
49890// TODO: This is X86 specific because we want to be able to handle wide types
49891// before type legalization. But we can only do it if the vector will be
49892// legalized via widening/splitting. Type legalization can't handle promotion
49893// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49894// combiner.
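// For example, (srl (mul (zext v8i16 X), (zext v8i16 Y)), 16) becomes
// (zext (mulhu X, Y)), which can then be selected as PMULHUW.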
49895static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49896 const SDLoc &DL,
49897 const X86Subtarget &Subtarget) {
49898 using namespace SDPatternMatch;
49899 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49900 "SRL or SRA node is required here!");
49901
49902 if (!Subtarget.hasSSE2())
49903 return SDValue();
49904
49905 // Input type should be at least vXi32.
49906 EVT VT = N->getValueType(0);
49907 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49908 return SDValue();
49909
49910 // The operation must be a multiply shifted right by 16.
49911 SDValue LHS, RHS;
49912 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49913 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49914 return SDValue();
49915
49916 unsigned ExtOpc = LHS.getOpcode();
49917 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49918 RHS.getOpcode() != ExtOpc)
49919 return SDValue();
49920
49921 // Peek through the extends.
49922 LHS = LHS.getOperand(0);
49923 RHS = RHS.getOperand(0);
49924
49925 // Ensure the input types match.
49926 EVT MulVT = LHS.getValueType();
49927 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49928 return SDValue();
49929
49930 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49931 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49932
49933 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49934 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49935}
49936
49937static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
49938 const X86Subtarget &Subtarget) {
49939 using namespace llvm::SDPatternMatch;
49940 SDValue N0 = N->getOperand(0);
49941 SDValue N1 = N->getOperand(1);
49942 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49943 EVT VT = N0.getValueType();
49944 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49945 SDLoc DL(N);
49946
49947 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49948 // with out-of-bounds clamping.
49949 if (N0.getOpcode() == ISD::VSELECT &&
49950 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
49951 SDValue Cond = N0.getOperand(0);
49952 SDValue N00 = N0.getOperand(1);
49953 SDValue N01 = N0.getOperand(2);
49954 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
49955 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
49956 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49957 m_SpecificCondCode(ISD::SETULT)))) {
49958 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
49959 }
49960 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
49961 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
49962 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49963 m_SpecificCondCode(ISD::SETUGE)))) {
49964 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
49965 }
49966 }
49967
49968 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
49969 // since the result of setcc_c is all zero's or all ones.
49970 if (VT.isInteger() && !VT.isVector() &&
49971 N1C && N0.getOpcode() == ISD::AND &&
49972 N0.getOperand(1).getOpcode() == ISD::Constant) {
49973 SDValue N00 = N0.getOperand(0);
49974 APInt Mask = N0.getConstantOperandAPInt(1);
49975 Mask <<= N1C->getAPIntValue();
49976 bool MaskOK = false;
49977 // We can handle cases concerning bit-widening nodes containing setcc_c if
49978 // we carefully interrogate the mask to make sure the transform is
49979 // semantics preserving.
49980 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
49981 // of the underlying setcc_c operation if the setcc_c was zero extended.
49982 // Consider the following example:
49983 // zext(setcc_c) -> i32 0x0000FFFF
49984 // c1 -> i32 0x0000FFFF
49985 // c2 -> i32 0x00000001
49986 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
49987 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
49988 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
49989 MaskOK = true;
49990 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
49991 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49992 MaskOK = true;
49993 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
49994 N00.getOpcode() == ISD::ANY_EXTEND) &&
49995 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49996 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
49997 }
49998 if (MaskOK && Mask != 0)
49999 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50000 }
50001
50002 return SDValue();
50003}
50004
50005static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50006 const X86Subtarget &Subtarget) {
50007 using namespace llvm::SDPatternMatch;
50008 SDValue N0 = N->getOperand(0);
50009 SDValue N1 = N->getOperand(1);
50010 EVT VT = N0.getValueType();
50011 unsigned Size = VT.getSizeInBits();
50012 SDLoc DL(N);
50013
50014 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50015 return V;
50016
50017 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50018 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50019 SDValue ShrAmtVal;
50020 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50021 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50022 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50023 }
50024
50025 // fold (SRA (SHL X, ShlConst), SraConst)
50026 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50027 // or (sext_in_reg X)
50028 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50029 // depending on relation between SraConst and ShlConst.
50030 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50031 // us to do the sext_in_reg from the corresponding bit.
50032
50033 // sexts in X86 are MOVs. The MOVs have the same code size
50034 // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
50035 // However, the MOVs have two advantages over a SHIFT:
50036 // 1. MOVs can write to a register that differs from the source.
50037 // 2. MOVs accept memory operands.
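 // For example, with i32: (sra (shl X, 24), 27) has Size - ShlConst == 8, so
 // it becomes (sra (sext_in_reg X, i8), 3), i.e. a sign-extending MOV followed
 // by a shorter arithmetic shift.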
50038
50039 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50040 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50041 N0.getOperand(1).getOpcode() != ISD::Constant)
50042 return SDValue();
50043
50044 SDValue N00 = N0.getOperand(0);
50045 SDValue N01 = N0.getOperand(1);
50046 APInt ShlConst = N01->getAsAPIntVal();
50047 APInt SraConst = N1->getAsAPIntVal();
50048 EVT CVT = N1.getValueType();
50049
50050 if (CVT != N01.getValueType())
50051 return SDValue();
50052 if (SraConst.isNegative())
50053 return SDValue();
50054
50055 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50056 unsigned ShiftSize = SVT.getSizeInBits();
50057 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50058 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50059 continue;
50060 SDValue NN =
50061 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50062 if (SraConst.eq(ShlConst))
50063 return NN;
50064 if (SraConst.ult(ShlConst))
50065 return DAG.getNode(ISD::SHL, DL, VT, NN,
50066 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50067 return DAG.getNode(ISD::SRA, DL, VT, NN,
50068 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50069 }
50070 return SDValue();
50071}
50072
50073static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50074 TargetLowering::DAGCombinerInfo &DCI,
50075 const X86Subtarget &Subtarget) {
50076 using namespace llvm::SDPatternMatch;
50077 SDValue N0 = N->getOperand(0);
50078 SDValue N1 = N->getOperand(1);
50079 EVT VT = N0.getValueType();
50080 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50081 SDLoc DL(N);
50082
50083 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50084 return V;
50085
50086 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50087 // with out-of-bounds clamping.
50088 if (N0.getOpcode() == ISD::VSELECT &&
50089 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50090 SDValue Cond = N0.getOperand(0);
50091 SDValue N00 = N0.getOperand(1);
50092 SDValue N01 = N0.getOperand(2);
50093 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50094 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50095 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50096 m_SpecificCondCode(ISD::SETULT)))) {
50097 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50098 }
50099 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50100 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50101 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50102 m_SpecificCondCode(ISD::SETUGE)))) {
50103 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50104 }
50105 }
50106
50107 // Only do this on the last DAG combine as it can interfere with other
50108 // combines.
50109 if (!DCI.isAfterLegalizeDAG())
50110 return SDValue();
50111
50112 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50113 // TODO: This is a generic DAG combine that became an x86-only combine to
50114 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50115 // and-not ('andn').
50116 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50117 return SDValue();
50118
50119 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50120 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50121 if (!ShiftC || !AndC)
50122 return SDValue();
50123
50124 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50125 // transform should reduce code size. It may also enable secondary transforms
50126 // from improved known-bits analysis or instruction selection.
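 // For example, (srl (and X, 0x7F000000), 24) becomes (and (srl X, 24), 0x7F),
 // where the new mask fits in a sign-extended 8-bit immediate.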
50127 APInt MaskVal = AndC->getAPIntValue();
50128
50129 // If this can be matched by a zero extend, don't optimize.
50130 if (MaskVal.isMask()) {
50131 unsigned TO = MaskVal.countr_one();
50132 if (TO >= 8 && isPowerOf2_32(TO))
50133 return SDValue();
50134 }
50135
50136 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50137 unsigned OldMaskSize = MaskVal.getSignificantBits();
50138 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50139 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50140 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50141 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50142 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50143 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50144 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50145 }
50146 return SDValue();
50147}
50148
50149static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50150 const X86Subtarget &Subtarget) {
50151 unsigned Opcode = N->getOpcode();
50152 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50153
50154 SDLoc DL(N);
50155 EVT VT = N->getValueType(0);
50156 SDValue N0 = N->getOperand(0);
50157 SDValue N1 = N->getOperand(1);
50158 EVT SrcVT = N0.getValueType();
50159
50160 SDValue BC0 =
50161 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50162 SDValue BC1 =
50163 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50164
50165 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50166 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50167 // truncation trees that help us avoid lane crossing shuffles.
50168 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50169 // TODO: We don't handle vXf64 shuffles yet.
50170 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50171 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50172 SmallVector<SDValue> ShuffleOps;
50173 SmallVector<int> ShuffleMask, ScaledMask;
50174 SDValue Vec = peekThroughBitcasts(BCSrc);
50175 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50176 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
50177 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50178 // shuffle to a v4X64 width - we can probably relax this in the future.
50179 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50180 ShuffleOps[0].getValueType().is256BitVector() &&
50181 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50182 SDValue Lo, Hi;
50183 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50184 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50185 Lo = DAG.getBitcast(SrcVT, Lo);
50186 Hi = DAG.getBitcast(SrcVT, Hi);
50187 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50188 Res = DAG.getBitcast(ShufVT, Res);
50189 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50190 return DAG.getBitcast(VT, Res);
50191 }
50192 }
50193 }
50194 }
50195
50196 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50197 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50198 // If either/both ops are a shuffle that can scale to v2x64,
50199 // then see if we can perform this as a v4x32 post shuffle.
50200 SmallVector<SDValue> Ops0, Ops1;
50201 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50202 bool IsShuf0 =
50203 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50204 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50205 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50206 bool IsShuf1 =
50207 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50208 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50209 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50210 if (IsShuf0 || IsShuf1) {
50211 if (!IsShuf0) {
50212 Ops0.assign({BC0});
50213 ScaledMask0.assign({0, 1});
50214 }
50215 if (!IsShuf1) {
50216 Ops1.assign({BC1});
50217 ScaledMask1.assign({0, 1});
50218 }
50219
50220 SDValue LHS, RHS;
50221 int PostShuffle[4] = {-1, -1, -1, -1};
50222 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50223 if (M < 0)
50224 return true;
50225 Idx = M % 2;
50226 SDValue Src = Ops[M / 2];
50227 if (!LHS || LHS == Src) {
50228 LHS = Src;
50229 return true;
50230 }
50231 if (!RHS || RHS == Src) {
50232 Idx += 2;
50233 RHS = Src;
50234 return true;
50235 }
50236 return false;
50237 };
50238 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50239 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50240 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50241 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50242 LHS = DAG.getBitcast(SrcVT, LHS);
50243 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50244 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50245 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50246 Res = DAG.getBitcast(ShufVT, Res);
50247 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50248 return DAG.getBitcast(VT, Res);
50249 }
50250 }
50251 }
50252
50253 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50254 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50255 SmallVector<int> Mask0, Mask1;
50256 SmallVector<SDValue> Ops0, Ops1;
50257 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50258 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50259 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50260 !Ops0.empty() && !Ops1.empty() &&
50261 all_of(Ops0,
50262 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50263 all_of(Ops1,
50264 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50265 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50266 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50267 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50268 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50269 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50270 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50271 if ((Op00 == Op11) && (Op01 == Op10)) {
50272 std::swap(Op10, Op11);
50273 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50274 }
50275 if ((Op00 == Op10) && (Op01 == Op11)) {
50276 const int Map[4] = {0, 2, 1, 3};
50277 SmallVector<int, 4> ShuffleMask(
50278 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50279 Map[ScaledMask1[1]]});
50280 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50281 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50282 DAG.getBitcast(SrcVT, Op01));
50283 Res = DAG.getBitcast(ShufVT, Res);
50284 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50285 return DAG.getBitcast(VT, Res);
50286 }
50287 }
50288 }
50289
50290 return SDValue();
50291}
50292
50293static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50294 TargetLowering::DAGCombinerInfo &DCI,
50295 const X86Subtarget &Subtarget) {
50296 unsigned Opcode = N->getOpcode();
50297 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50298 "Unexpected pack opcode");
50299
50300 EVT VT = N->getValueType(0);
50301 SDValue N0 = N->getOperand(0);
50302 SDValue N1 = N->getOperand(1);
50303 unsigned NumDstElts = VT.getVectorNumElements();
50304 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50305 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50306 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50307 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50308 "Unexpected PACKSS/PACKUS input type");
50309
50310 bool IsSigned = (X86ISD::PACKSS == Opcode);
50311
50312 // Constant Folding.
50313 APInt UndefElts0, UndefElts1;
50314 SmallVector<APInt, 32> EltBits0, EltBits1;
50315 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50316 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50317 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50318 /*AllowWholeUndefs*/ true,
50319 /*AllowPartialUndefs*/ true) &&
50320 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50321 /*AllowWholeUndefs*/ true,
50322 /*AllowPartialUndefs*/ true)) {
50323 unsigned NumLanes = VT.getSizeInBits() / 128;
50324 unsigned NumSrcElts = NumDstElts / 2;
50325 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50326 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50327
50328 APInt Undefs(NumDstElts, 0);
50329 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50330 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50331 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50332 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50333 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50334 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50335
50336 if (UndefElts[SrcIdx]) {
50337 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50338 continue;
50339 }
50340
50341 APInt &Val = EltBits[SrcIdx];
50342 if (IsSigned) {
50343 // PACKSS: Truncate signed value with signed saturation.
50344 // Source values less than dst minint are saturated to minint.
50345 // Source values greater than dst maxint are saturated to maxint.
50346 Val = Val.truncSSat(DstBitsPerElt);
50347 } else {
50348 // PACKUS: Truncate signed value with unsigned saturation.
50349 // Source values less than zero are saturated to zero.
50350 // Source values greater than dst maxuint are saturated to maxuint.
50351 // NOTE: This is different from APInt::truncUSat.
50352 if (Val.isIntN(DstBitsPerElt))
50353 Val = Val.trunc(DstBitsPerElt);
50354 else if (Val.isNegative())
50355 Val = APInt::getZero(DstBitsPerElt);
50356 else
50357 Val = APInt::getAllOnes(DstBitsPerElt);
50358 }
50359 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50360 }
50361 }
50362
50363 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50364 }
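 // For example, when packing i16 -> i8 the folding above saturates 0x0123 to
 // 0x7F (PACKSS) or 0xFF (PACKUS), and 0x8000 to 0x80 (PACKSS) or 0x00
 // (PACKUS).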
50365
50366 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50367 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50368 return V;
50369
50370 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50371 // Currently limit this to allsignbits cases only.
50372 if (IsSigned &&
50373 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50374 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50375 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50376 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50377 if (Not0 && Not1) {
50378 SDLoc DL(N);
50379 MVT SrcVT = N0.getSimpleValueType();
50380 SDValue Pack =
50381 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50382 DAG.getBitcast(SrcVT, Not1));
50383 return DAG.getNOT(DL, Pack, VT);
50384 }
50385 }
50386
50387 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50388 // truncate to create a larger truncate.
50389 if (Subtarget.hasAVX512() &&
50390 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50391 N0.getOperand(0).getValueType() == MVT::v8i32) {
50392 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50393 (!IsSigned &&
50394 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50395 if (Subtarget.hasVLX())
50396 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50397
50398 // Widen input to v16i32 so we can truncate that.
50399 SDLoc dl(N);
50400 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50401 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50402 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50403 }
50404 }
50405
50406 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50407 if (VT.is128BitVector()) {
50408 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50409 SDValue Src0, Src1;
50410 if (N0.getOpcode() == ExtOpc &&
50411 N0.getOperand(0).getValueType().is64BitVector() &&
50412 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50413 Src0 = N0.getOperand(0);
50414 }
50415 if (N1.getOpcode() == ExtOpc &&
50416 N1.getOperand(0).getValueType().is64BitVector() &&
50417 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50418 Src1 = N1.getOperand(0);
50419 }
50420 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50421 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50422 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50423 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50424 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50425 }
50426
50427 // Try again with pack(*_extend_vector_inreg, undef).
50428 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50429 : ISD::ZERO_EXTEND_VECTOR_INREG;
50430 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50431 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50432 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50433 DAG);
50434 }
50435
50436 // Attempt to combine as shuffle.
50437 SDValue Op(N, 0);
50438 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50439 return Res;
50440
50441 return SDValue();
50442}
50443
50444static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50445 TargetLowering::DAGCombinerInfo &DCI,
50446 const X86Subtarget &Subtarget) {
50447 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50448 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50449 "Unexpected horizontal add/sub opcode");
50450
50451 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50452 MVT VT = N->getSimpleValueType(0);
50453 SDValue LHS = N->getOperand(0);
50454 SDValue RHS = N->getOperand(1);
50455
50456 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50457 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50458 LHS.getOpcode() == RHS.getOpcode() &&
50459 LHS.getValueType() == RHS.getValueType() &&
50460 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50461 SDValue LHS0 = LHS.getOperand(0);
50462 SDValue LHS1 = LHS.getOperand(1);
50463 SDValue RHS0 = RHS.getOperand(0);
50464 SDValue RHS1 = RHS.getOperand(1);
50465 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50466 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50467 SDLoc DL(N);
50468 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50469 LHS0.isUndef() ? LHS1 : LHS0,
50470 RHS0.isUndef() ? RHS1 : RHS0);
50471 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50472 Res = DAG.getBitcast(ShufVT, Res);
50473 SDValue NewLHS =
50474 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50475 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50476 SDValue NewRHS =
50477 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50478 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50479 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50480 DAG.getBitcast(VT, NewRHS));
50481 }
50482 }
50483 }
50484
50485 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50486 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50487 return V;
50488
50489 return SDValue();
50490}
50491
50492static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50493 TargetLowering::DAGCombinerInfo &DCI,
50494 const X86Subtarget &Subtarget) {
50495 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50496 X86ISD::VSRL == N->getOpcode()) &&
50497 "Unexpected shift opcode");
50498 EVT VT = N->getValueType(0);
50499 SDValue N0 = N->getOperand(0);
50500 SDValue N1 = N->getOperand(1);
50501
50502 // Shift zero -> zero.
50503 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50504 return DAG.getConstant(0, SDLoc(N), VT);
50505
50506 // Detect constant shift amounts.
50507 APInt UndefElts;
50508 SmallVector<APInt, 32> EltBits;
50509 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50510 /*AllowWholeUndefs*/ true,
50511 /*AllowPartialUndefs*/ false)) {
50512 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50513 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50514 EltBits[0].getZExtValue(), DAG);
50515 }
50516
50517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50518 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50519 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50520 return SDValue(N, 0);
50521
50522 return SDValue();
50523}
50524
50525static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50526 TargetLowering::DAGCombinerInfo &DCI,
50527 const X86Subtarget &Subtarget) {
50528 unsigned Opcode = N->getOpcode();
50529 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50530 X86ISD::VSRLI == Opcode) &&
50531 "Unexpected shift opcode");
50532 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50533 EVT VT = N->getValueType(0);
50534 SDValue N0 = N->getOperand(0);
50535 SDValue N1 = N->getOperand(1);
50536 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50537 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50538 "Unexpected value type");
50539 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50540
50541 // (shift undef, X) -> 0
50542 if (N0.isUndef())
50543 return DAG.getConstant(0, SDLoc(N), VT);
50544
50545 // Out of range logical bit shifts are guaranteed to be zero.
50546 // Out of range arithmetic bit shifts splat the sign bit.
50547 unsigned ShiftVal = N->getConstantOperandVal(1);
50548 if (ShiftVal >= NumBitsPerElt) {
50549 if (LogicalShift)
50550 return DAG.getConstant(0, SDLoc(N), VT);
50551 ShiftVal = NumBitsPerElt - 1;
50552 }
50553
50554 // (shift X, 0) -> X
50555 if (!ShiftVal)
50556 return N0;
50557
50558 // (shift 0, C) -> 0
50559 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50560 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50561 // result are all zeros, not undef.
50562 return DAG.getConstant(0, SDLoc(N), VT);
50563
50564 // (VSRAI -1, C) -> -1
50565 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50566 // N0 is all ones or undef. We guarantee that the bits shifted into the
50567 // result are all ones, not undef.
50568 return DAG.getAllOnesConstant(SDLoc(N), VT);
50569
50570 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50571 unsigned NewShiftVal = Amt0 + Amt1;
50572 if (NewShiftVal >= NumBitsPerElt) {
50573 // Out of range logical bit shifts are guaranteed to be zero.
50574 // Out of range arithmetic bit shifts splat the sign bit.
50575 if (LogicalShift)
50576 return DAG.getConstant(0, SDLoc(N), VT);
50577 NewShiftVal = NumBitsPerElt - 1;
50578 }
50579 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50580 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50581 };
50582
50583 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50584 if (Opcode == N0.getOpcode())
50585 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50586
50587 // (shl (add X, X), C) -> (shl X, (C + 1))
50588 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50589 N0.getOperand(0) == N0.getOperand(1))
50590 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50591
50592 // We can decode 'whole byte' logical bit shifts as shuffles.
50593 if (LogicalShift && (ShiftVal % 8) == 0) {
50594 SDValue Op(N, 0);
50595 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50596 return Res;
50597 }
50598
50599 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50600 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50601 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50602 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50603 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50604 N0.getOpcode() == X86ISD::PSHUFD &&
50605 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50606 N0->hasOneUse()) {
50607 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50608 if (BC.getOpcode() == X86ISD::VSHLI &&
50609 BC.getScalarValueSizeInBits() == 64 &&
50610 BC.getConstantOperandVal(1) == 63) {
50611 SDLoc DL(N);
50612 SDValue Src = BC.getOperand(0);
50613 Src = DAG.getBitcast(VT, Src);
50614 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50615 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50616 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50617 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50618 return Src;
50619 }
50620 }
50621
50622 auto TryConstantFold = [&](SDValue V) {
50623 APInt UndefElts;
50624 SmallVector<APInt, 32> EltBits;
50625 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50626 /*AllowWholeUndefs*/ true,
50627 /*AllowPartialUndefs*/ true))
50628 return SDValue();
50629 assert(EltBits.size() == VT.getVectorNumElements() &&
50630 "Unexpected shift value type");
50631 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50632 // created an undef input due to no input bits being demanded, but the user
50633 // still expects 0 in other bits.
50634 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50635 APInt &Elt = EltBits[i];
50636 if (UndefElts[i])
50637 Elt = 0;
50638 else if (X86ISD::VSHLI == Opcode)
50639 Elt <<= ShiftVal;
50640 else if (X86ISD::VSRAI == Opcode)
50641 Elt.ashrInPlace(ShiftVal);
50642 else
50643 Elt.lshrInPlace(ShiftVal);
50644 }
50645 // Reset undef elements since they were zeroed above.
50646 UndefElts = 0;
50647 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50648 };
50649
50650 // Constant Folding.
50651 if (N->isOnlyUserOf(N0.getNode())) {
50652 if (SDValue C = TryConstantFold(N0))
50653 return C;
50654
50655 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50656 // Don't break NOT patterns.
50657 SDValue BC = peekThroughOneUseBitcasts(N0);
50658 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50659 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50660 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50661 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50662 SDLoc DL(N);
50663 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50664 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50665 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50666 }
50667 }
50668 }
50669
50670 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50671 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50672 DCI))
50673 return SDValue(N, 0);
50674
50675 return SDValue();
50676}
50677
50678static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50679 TargetLowering::DAGCombinerInfo &DCI,
50680 const X86Subtarget &Subtarget) {
50681 EVT VT = N->getValueType(0);
50682 unsigned Opcode = N->getOpcode();
50683 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50684 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50685 Opcode == ISD::INSERT_VECTOR_ELT) &&
50686 "Unexpected vector insertion");
50687
50688 SDValue Vec = N->getOperand(0);
50689 SDValue Scl = N->getOperand(1);
50690 SDValue Idx = N->getOperand(2);
50691
50692 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50693 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50694 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50695
50696 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50697 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50698 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50699 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50700 APInt::getAllOnes(NumBitsPerElt), DCI))
50701 return SDValue(N, 0);
50702 }
50703
50704 // Attempt to combine insertion patterns to a shuffle.
50705 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50706 SDValue Op(N, 0);
50707 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50708 return Res;
50709 }
50710
50711 return SDValue();
50712}
50713
50714/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50715/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50716/// OR -> CMPNEQSS.
50717static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50718 TargetLowering::DAGCombinerInfo &DCI,
50719 const X86Subtarget &Subtarget) {
50720 unsigned opcode;
50721
50722 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50723 // we're requiring SSE2 for both.
50724 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50725 SDValue N0 = N->getOperand(0);
50726 SDValue N1 = N->getOperand(1);
50727 SDValue CMP0 = N0.getOperand(1);
50728 SDValue CMP1 = N1.getOperand(1);
50729 SDLoc DL(N);
50730
50731 // The SETCCs should both refer to the same CMP.
50732 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50733 return SDValue();
50734
50735 SDValue CMP00 = CMP0->getOperand(0);
50736 SDValue CMP01 = CMP0->getOperand(1);
50737 EVT VT = CMP00.getValueType();
50738
50739 if (VT == MVT::f32 || VT == MVT::f64 ||
50740 (VT == MVT::f16 && Subtarget.hasFP16())) {
50741 bool ExpectingFlags = false;
50742 // Check for any users that want flags:
50743 for (const SDNode *U : N->users()) {
50744 if (ExpectingFlags)
50745 break;
50746
50747 switch (U->getOpcode()) {
50748 default:
50749 case ISD::BR_CC:
50750 case ISD::BRCOND:
50751 case ISD::SELECT:
50752 ExpectingFlags = true;
50753 break;
50754 case ISD::CopyToReg:
50755 case ISD::SIGN_EXTEND:
50756 case ISD::ZERO_EXTEND:
50757 case ISD::ANY_EXTEND:
50758 break;
50759 }
50760 }
50761
50762 if (!ExpectingFlags) {
50763 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50764 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50765
50766 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50767 X86::CondCode tmp = cc0;
50768 cc0 = cc1;
50769 cc1 = tmp;
50770 }
50771
50772 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50773 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50774 // FIXME: need symbolic constants for these magic numbers.
50775 // See X86ATTInstPrinter.cpp:printSSECC().
50776 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
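 // In the SSE/AVX compare-predicate encoding, immediate 0 selects EQ and
 // immediate 4 selects NEQ, matching the COND_E / COND_NE cases above.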
50777 if (Subtarget.hasAVX512()) {
50778 SDValue FSetCC =
50779 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50780 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50781 // Need to fill with zeros to ensure the bitcast will produce zeroes
50782 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50783 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50784 DAG.getConstant(0, DL, MVT::v16i1),
50785 FSetCC, DAG.getVectorIdxConstant(0, DL));
50786 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50787 N->getSimpleValueType(0));
50788 }
50789 SDValue OnesOrZeroesF =
50790 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50791 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50792
50793 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50794 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50795
50796 if (is64BitFP && !Subtarget.is64Bit()) {
50797 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50798 // 64-bit integer, since that's not a legal type. Since
50799 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50800 // bits, but can do this little dance to extract the lowest 32 bits
50801 // and work with those going forward.
50802 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50803 MVT::v2f64, OnesOrZeroesF);
50804 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50805 OnesOrZeroesF =
50806 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50807 DAG.getVectorIdxConstant(0, DL));
50808 IntVT = MVT::i32;
50809 }
50810
50811 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50812 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50813 DAG.getConstant(1, DL, IntVT));
50814 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50815 ANDed);
50816 return OneBitOfTruth;
50817 }
50818 }
50819 }
50820 }
50821 return SDValue();
50822}
50823
50824/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50825static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50826 SelectionDAG &DAG) {
50827 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50828
50829 MVT VT = N->getSimpleValueType(0);
50830 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50831 return SDValue();
50832
50833 SDValue X, Y;
50834 SDValue N0 = N->getOperand(0);
50835 SDValue N1 = N->getOperand(1);
50836
50837 if (SDValue Not = IsNOT(N0, DAG)) {
50838 X = Not;
50839 Y = N1;
50840 } else if (SDValue Not = IsNOT(N1, DAG)) {
50841 X = Not;
50842 Y = N0;
50843 } else
50844 return SDValue();
50845
50846 X = DAG.getBitcast(VT, X);
50847 Y = DAG.getBitcast(VT, Y);
50848 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50849}
50850
50851/// Try to fold:
50852/// and (vector_shuffle<Z,...,Z>
50853/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50854/// ->
50855/// andnp (vector_shuffle<Z,...,Z>
50856/// (insert_vector_elt undef, X, Z), undef), Y
50857static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50858 const X86Subtarget &Subtarget) {
50859 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50860
50861 EVT VT = N->getValueType(0);
50862 // Do not split 256- and 512-bit vectors with SSE2 as they overwrite the
50863 // original value and require extra moves.
50864 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50865 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50866 return SDValue();
50867
50868 auto GetNot = [&DAG](SDValue V) {
50869 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50870 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50871 // end-users are ISD::AND including cases
50872 // (and(extract_vector_element(SVN), Y)).
50873 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50874 !SVN->getOperand(1).isUndef()) {
50875 return SDValue();
50876 }
50877 SDValue IVEN = SVN->getOperand(0);
50878 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50879 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50880 return SDValue();
50881 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50882 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50883 return SDValue();
50884 SDValue Src = IVEN.getOperand(1);
50885 if (SDValue Not = IsNOT(Src, DAG)) {
50886 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50887 SDValue NotIVEN =
50888 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50889 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50890 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50891 SVN->getOperand(1), SVN->getMask());
50892 }
50893 return SDValue();
50894 };
50895
50896 SDValue X, Y;
50897 SDValue N0 = N->getOperand(0);
50898 SDValue N1 = N->getOperand(1);
50899 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50900
50901 if (SDValue Not = GetNot(N0)) {
50902 X = Not;
50903 Y = N1;
50904 } else if (SDValue Not = GetNot(N1)) {
50905 X = Not;
50906 Y = N0;
50907 } else
50908 return SDValue();
50909
50910 X = DAG.getBitcast(VT, X);
50911 Y = DAG.getBitcast(VT, Y);
50912 SDLoc DL(N);
50913
50914 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50915 // AVX2.
50916 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50917 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50918 SDValue LoX, HiX;
50919 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50920 SDValue LoY, HiY;
50921 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50922 EVT SplitVT = LoX.getValueType();
50923 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50924 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50925 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50926 }
50927
50928 if (TLI.isTypeLegal(VT))
50929 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50930
50931 return SDValue();
50932}
50933
50934// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50935// logical operations, like in the example below.
50936// or (and (truncate x, truncate y)),
50937// (xor (truncate z, build_vector (constants)))
50938// Given a target type \p VT, we generate
50939// or (and x, y), (xor z, zext(build_vector (constants)))
50940// given x, y and z are of type \p VT. We can do so if each operand is either a
50941// truncate from VT, a vector of constants (for the second operand), or can
50942// itself be recursively promoted.
50943static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
50944 SelectionDAG &DAG, unsigned Depth) {
50945 // Limit recursion to avoid excessive compile times.
50946 if (Depth >= SelectionDAG::MaxRecursionDepth)
50947 return SDValue();
50948
50949 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
50950 return SDValue();
50951
50952 SDValue N0 = N.getOperand(0);
50953 SDValue N1 = N.getOperand(1);
50954
50955 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50956 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
50957 return SDValue();
50958
50959 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
50960 N0 = NN0;
50961 else {
50962 // The left side has to be a trunc.
50963 if (N0.getOpcode() != ISD::TRUNCATE)
50964 return SDValue();
50965
50966 // The type of the truncated inputs.
50967 if (N0.getOperand(0).getValueType() != VT)
50968 return SDValue();
50969
50970 N0 = N0.getOperand(0);
50971 }
50972
50973 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
50974 N1 = NN1;
50975 else {
50976 // The right side has to be a 'trunc' or a (foldable) constant.
50977 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
50978 N1.getOperand(0).getValueType() == VT;
50979 if (RHSTrunc)
50980 N1 = N1.getOperand(0);
50981 else if (SDValue Cst =
50982 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
50983 N1 = Cst;
50984 else
50985 return SDValue();
50986 }
50987
50988 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
50989}
50990
50991// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
50992// register. In most cases we actually compare or select YMM-sized registers
50993// and mixing the two types creates horrible code. This method optimizes
50994// some of the transition sequences.
50995// Even with AVX-512 this is still useful for removing casts around logical
50996// operations on vXi1 mask types.
50997static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
50998 SelectionDAG &DAG,
50999 const X86Subtarget &Subtarget) {
51000 EVT VT = N.getValueType();
51001 assert(VT.isVector() && "Expected vector type");
51002 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51003 N.getOpcode() == ISD::ZERO_EXTEND ||
51004 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51005
51006 SDValue Narrow = N.getOperand(0);
51007 EVT NarrowVT = Narrow.getValueType();
51008
51009 // Generate the wide operation.
51010 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
51011 if (!Op)
51012 return SDValue();
51013 switch (N.getOpcode()) {
51014 default: llvm_unreachable("Unexpected opcode");
51015 case ISD::ANY_EXTEND:
51016 return Op;
51017 case ISD::ZERO_EXTEND:
51018 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51019 case ISD::SIGN_EXTEND:
51020 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51021 Op, DAG.getValueType(NarrowVT));
51022 }
51023}
51024
51025static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51026 unsigned FPOpcode;
51027 switch (Opcode) {
51028 // clang-format off
51029 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51030 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51031 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51032 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51033 // clang-format on
51034 }
51035 return FPOpcode;
51036}
51037
51038/// If both input operands of a logic op are being cast from floating-point
51039/// types or FP compares, try to convert this into a floating-point logic node
51040/// to avoid unnecessary moves from SSE to integer registers.
51041static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51042 SDValue N0, SDValue N1,
51043 SelectionDAG &DAG,
51045 const X86Subtarget &Subtarget) {
51046 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51047 "Unexpected bit opcode");
51048
51049 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51050 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51051 return SDValue();
51052
51053 SDValue N00 = N0.getOperand(0);
51054 SDValue N10 = N1.getOperand(0);
51055 EVT N00Type = N00.getValueType();
51056 EVT N10Type = N10.getValueType();
51057
51058 // Ensure that both types are the same and are legal scalar fp types.
51059 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51060 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51061 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51062 return SDValue();
51063
51064 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51065 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51066 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51067 return DAG.getBitcast(VT, FPLogic);
51068 }
51069
51070 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51071 !N1.hasOneUse())
51072 return SDValue();
51073
51074 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51075 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51076
51077 // The vector ISA for FP predicates is incomplete before AVX, so converting
51078 // COMIS* to CMPS* may not be a win before AVX.
51079 if (!Subtarget.hasAVX() &&
51080 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51081 return SDValue();
51082
51083 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51084 // and vector logic:
51085 // logic (setcc N00, N01), (setcc N10, N11) -->
51086 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51087 unsigned NumElts = 128 / N00Type.getSizeInBits();
51088 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51089 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51090 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51091 SDValue N01 = N0.getOperand(1);
51092 SDValue N11 = N1.getOperand(1);
51093 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51094 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51095 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51096 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51097 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51098 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51099 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51100 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51101}
51102
51103// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51104// to reduce XMM->GPR traffic.
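// For example (illustrative): or (movmsk A), (movmsk B) becomes
// movmsk (or A, B), replacing two XMM->GPR transfers plus a scalar OR with a
// single vector OR and one MOVMSK.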
51105static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51106 SDValue N1, SelectionDAG &DAG) {
51107 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51108 "Unexpected bit opcode");
51109
51110 // Both operands must be single use MOVMSK.
51111 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51112 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51113 return SDValue();
51114
51115 SDValue Vec0 = N0.getOperand(0);
51116 SDValue Vec1 = N1.getOperand(0);
51117 EVT VecVT0 = Vec0.getValueType();
51118 EVT VecVT1 = Vec1.getValueType();
51119
51120 // Both MOVMSK operands must be from vectors of the same size and same element
51121 // size, but it's OK for an fp/int diff.
51122 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51123 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51124 return SDValue();
51125
51126 unsigned VecOpc =
51127 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51128 SDValue Result =
51129 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51130 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51131}
51132
51133// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51134// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51135// handles in InstCombine.
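// For example (illustrative): and (vsrli X, 4), (vsrli Y, 4) becomes
// vsrli (and X, Y), 4, when both shifts use the same opcode and immediate.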
51136static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51137 SDValue N0, SDValue N1,
51138 SelectionDAG &DAG) {
51139 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51140 "Unexpected bit opcode");
51141
51142 // Both operands must be single use.
51143 if (!N0.hasOneUse() || !N1.hasOneUse())
51144 return SDValue();
51145
51146 // Search for matching shifts.
51147 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51148 SDValue BC1 = peekThroughOneUseBitcasts(N1);
51149
51150 unsigned BCOpc = BC0.getOpcode();
51151 EVT BCVT = BC0.getValueType();
51152 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51153 return SDValue();
51154
51155 switch (BCOpc) {
51156 case X86ISD::VSHLI:
51157 case X86ISD::VSRLI:
51158 case X86ISD::VSRAI: {
51159 if (BC0.getOperand(1) != BC1.getOperand(1))
51160 return SDValue();
51161 SDValue BitOp =
51162 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51163 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51164 return DAG.getBitcast(VT, Shift);
51165 }
51166 }
51167
51168 return SDValue();
51169}
51170
51171// Attempt to fold:
51172// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51173 // TODO: Add PACKUS handling.
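// For example (illustrative): xor (packss A, B), (packss C, D) becomes
// packss (xor A, C), (xor B, D), provided every input is known to be
// all-signbits so the saturation in PACKSS cannot change the result.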
51174static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51175 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51176 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51177 "Unexpected bit opcode");
51178
51179 // Both operands must be single use.
51180 if (!N0.hasOneUse() || !N1.hasOneUse())
51181 return SDValue();
51182
51183 // Search for matching packs.
51184 N0 = peekThroughOneUseBitcasts(N0);
51185 N1 = peekThroughOneUseBitcasts(N1);
51186
51187 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51188 return SDValue();
51189
51190 MVT DstVT = N0.getSimpleValueType();
51191 if (DstVT != N1.getSimpleValueType())
51192 return SDValue();
51193
51194 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51195 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51196
51197 // Limit to allsignbits packing.
51198 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51199 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51200 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51201 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51202 return SDValue();
51203
51204 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51205 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51206 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51207}
51208
51209 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
51210 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51211 /// with a shift-right to eliminate loading the vector constant mask value.
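/// For example (illustrative, v4i32): and (sext (setcc ...)), <1,1,1,1>
/// becomes vsrli (sext (setcc ...)), 31, since the input is known to be
/// all-ones or all-zero per element.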
51212 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51213 SelectionDAG &DAG,
51214 const X86Subtarget &Subtarget) {
51215 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51216 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51217 EVT VT = Op0.getValueType();
51218 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51219 return SDValue();
51220
51221 // Try to convert an "is positive" signbit masking operation into arithmetic
51222 // shift and "andn". This saves a materialization of a -1 vector constant.
51223 // The "is negative" variant should be handled more generally because it only
51224 // requires "and" rather than "andn":
51225 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51226 //
51227 // This is limited to the original type to avoid producing even more bitcasts.
51228 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51229 // will be profitable.
51230 if (N->getValueType(0) == VT &&
51231 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51232 SDValue X, Y;
51233 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51234 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51235 X = Op1.getOperand(0);
51236 Y = Op0;
51237 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51238 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51239 X = Op0.getOperand(0);
51240 Y = Op1;
51241 }
51242 if (X && Y) {
51243 SDValue Sra =
51244 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51245 VT.getScalarSizeInBits() - 1, DAG);
51246 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51247 }
51248 }
51249
51250 APInt SplatVal;
51251 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51252 return SDValue();
51253
51254 // Don't prevent creation of ANDN.
51255 if (isBitwiseNot(Op0))
51256 return SDValue();
51257
51258 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51259 return SDValue();
51260
51261 unsigned EltBitWidth = VT.getScalarSizeInBits();
51262 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51263 return SDValue();
51264
51265 unsigned ShiftVal = SplatVal.countr_one();
51266 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51267 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51268 return DAG.getBitcast(N->getValueType(0), Shift);
51269}
51270
51271// Get the index node from the lowered DAG of a GEP IR instruction with one
51272// indexing dimension.
51273 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51274 if (Ld->isIndexed())
51275 return SDValue();
51276
51277 SDValue Base = Ld->getBasePtr();
51278 if (Base.getOpcode() != ISD::ADD)
51279 return SDValue();
51280
51281 SDValue ShiftedIndex = Base.getOperand(0);
51282 if (ShiftedIndex.getOpcode() != ISD::SHL)
51283 return SDValue();
51284
51285 return ShiftedIndex.getOperand(0);
51286}
51287
51288static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51289 return Subtarget.hasBMI2() &&
51290 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51291}
51292
51293/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51294/// This undoes the inverse fold performed in InstCombine
51295 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51296 SelectionDAG &DAG) {
51297 using namespace llvm::SDPatternMatch;
51298 MVT VT = N->getSimpleValueType(0);
51299 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51300 return SDValue();
51301
51302 SDValue X, Y, Z;
51303 if (sd_match(N, m_And(m_Value(X),
51304 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51305 // Don't fold if Y or Z are constants to prevent infinite loops.
51306 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51307 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51308 return DAG.getNode(
51309 ISD::AND, DL, VT, X,
51310 DAG.getNOT(
51311 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51312 }
51313
51314 return SDValue();
51315}
51316
51317 // This function recognizes cases where the X86 bzhi instruction can replace an
51318 // 'and-load' sequence.
51319 // When loading an integer value from an array of constants which is defined
51320 // as follows:
51321 //
51322 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51323 //
51324 // and then applying a bitwise and between the loaded value and another input,
51325 // the sequence is equivalent to performing bzhi (zero high bits) on that
51326 // input, with the same index as the load.
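// For example (illustrative): if array[i] == (1 << i) - 1 for every i, then
// (x & array[i]) keeps only the low i bits of x, which is exactly
// bzhi(x, i), so the constant-pool load disappears.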
51327 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51328 const X86Subtarget &Subtarget) {
51329 MVT VT = Node->getSimpleValueType(0);
51330 SDLoc dl(Node);
51331
51332 // Check if subtarget has BZHI instruction for the node's type
51333 if (!hasBZHI(Subtarget, VT))
51334 return SDValue();
51335
51336 // Try matching the pattern for both operands.
51337 for (unsigned i = 0; i < 2; i++) {
51338 // continue if the operand is not a load instruction
51339 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51340 if (!Ld)
51341 continue;
51342 const Value *MemOp = Ld->getMemOperand()->getValue();
51343 if (!MemOp)
51344 continue;
51345 // Get the Node which indexes into the array.
51346 SDValue Index = getIndexFromUnindexedLoad(Ld);
51347 if (!Index)
51348 continue;
51349
51350 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51351 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51352 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51353 Constant *Init = GV->getInitializer();
51354 Type *Ty = Init->getType();
51355 if (!isa<ConstantDataArray>(Init) ||
51356 !Ty->getArrayElementType()->isIntegerTy() ||
51357 Ty->getArrayElementType()->getScalarSizeInBits() !=
51358 VT.getSizeInBits() ||
51359 Ty->getArrayNumElements() >
51360 Ty->getArrayElementType()->getScalarSizeInBits())
51361 continue;
51362
51363 // Check if the array's constant elements are suitable to our case.
51364 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51365 bool ConstantsMatch = true;
51366 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51367 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51368 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51369 ConstantsMatch = false;
51370 break;
51371 }
51372 }
51373 if (!ConstantsMatch)
51374 continue;
51375
51376 // Do the transformation (For 32-bit type):
51377 // -> (and (load arr[idx]), inp)
51378 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51379 // that will be replaced with one bzhi instruction.
51380 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51381 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51382
51383 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51384 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51385 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51386
51387 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51388 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51389 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51390 }
51391 }
51392 }
51393 }
51394 return SDValue();
51395}
51396
51397 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51398 // where C is a mask containing the same number of bits as the setcc and
51399 // where the setcc will freely zero the upper bits of the k-register. We can
51400 // replace the undef in the concat with 0s and remove the AND. This mainly
51401 // helps with v2i1/v4i1 setcc being cast to scalar.
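// For example (illustrative): and (i8 bitcast (concat v4i1 setcc, undef)), 0xF
// can become i8 bitcast (concat v4i1 setcc, zero), and the masking AND is no
// longer needed.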
51402 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51403 const X86Subtarget &Subtarget) {
51404 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51405
51406 EVT VT = N->getValueType(0);
51407
51408 // Make sure this is an AND with constant. We will check the value of the
51409 // constant later.
51410 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51411 if (!C1)
51412 return SDValue();
51413
51414 // This is implied by the ConstantSDNode.
51415 assert(!VT.isVector() && "Expected scalar VT!");
51416
51417 SDValue Src = N->getOperand(0);
51418 if (!Src.hasOneUse())
51419 return SDValue();
51420
51421 // (Optionally) peek through any_extend().
51422 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51423 if (!Src.getOperand(0).hasOneUse())
51424 return SDValue();
51425 Src = Src.getOperand(0);
51426 }
51427
51428 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51429 return SDValue();
51430
51431 Src = Src.getOperand(0);
51432 EVT SrcVT = Src.getValueType();
51433
51434 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51435 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51436 !TLI.isTypeLegal(SrcVT))
51437 return SDValue();
51438
51439 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51440 return SDValue();
51441
51442 // We only care about the first subvector of the concat; we expect the
51443 // other subvectors to be ignored due to the AND if we make the change.
51444 SDValue SubVec = Src.getOperand(0);
51445 EVT SubVecVT = SubVec.getValueType();
51446
51447 // The RHS of the AND should be a mask with as many bits as SubVec.
51448 if (!TLI.isTypeLegal(SubVecVT) ||
51449 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51450 return SDValue();
51451
51452 // The first subvector should be a setcc with a legal result type or an
51453 // AND containing at least one setcc with a legal result type.
51454 auto IsLegalSetCC = [&](SDValue V) {
51455 if (V.getOpcode() != ISD::SETCC)
51456 return false;
51457 EVT SetccVT = V.getOperand(0).getValueType();
51458 if (!TLI.isTypeLegal(SetccVT) ||
51459 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51460 return false;
51461 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51462 return false;
51463 return true;
51464 };
51465 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51466 (IsLegalSetCC(SubVec.getOperand(0)) ||
51467 IsLegalSetCC(SubVec.getOperand(1))))))
51468 return SDValue();
51469
51470 // We passed all the checks. Rebuild the concat_vectors with zeroes
51471 // and cast it back to VT.
51472 SDLoc dl(N);
51473 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51474 DAG.getConstant(0, dl, SubVecVT));
51475 Ops[0] = SubVec;
51476 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51477 Ops);
51478 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51479 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51480}
51481
51482 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51483 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51484 // We don't want to go crazy with the recursion here. This isn't a super
51485 // important optimization.
51486 static constexpr unsigned kMaxDepth = 2;
51487
51488 // Only do this re-ordering if op has one use.
51489 if (!Op.hasOneUse())
51490 return SDValue();
51491
51492 SDLoc DL(Op);
51493 // If we hit another associative op, recurse further.
51494 if (Op.getOpcode() == Opc) {
51495 // Done recursing.
51496 if (Depth++ >= kMaxDepth)
51497 return SDValue();
51498
51499 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51500 if (SDValue R =
51501 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51502 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51503 Op.getOperand(1 - OpIdx));
51504
51505 } else if (Op.getOpcode() == ISD::SUB) {
51506 if (Opc == ISD::AND) {
51507 // BLSI: (and x, (sub 0, x))
51508 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51509 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51510 }
51511 // Opc must be ISD::AND or ISD::XOR
51512 // BLSR: (and x, (sub x, 1))
51513 // BLSMSK: (xor x, (sub x, 1))
51514 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51515 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51516
51517 } else if (Op.getOpcode() == ISD::ADD) {
51518 // Opc must be ISD::AND or ISD::XOR
51519 // BLSR: (and x, (add x, -1))
51520 // BLSMSK: (xor x, (add x, -1))
51521 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51522 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51523 }
51524 return SDValue();
51525}
51526
51527 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51528 const X86Subtarget &Subtarget) {
51529 EVT VT = N->getValueType(0);
51530 // Make sure this node is a candidate for BMI instructions.
51531 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51532 (VT != MVT::i32 && VT != MVT::i64))
51533 return SDValue();
51534
51535 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51536
51537 // Try and match LHS and RHS.
51538 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51539 if (SDValue OpMatch =
51540 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51541 N->getOperand(1 - OpIdx), 0))
51542 return OpMatch;
51543 return SDValue();
51544}
51545
51546/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
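/// Worked example (illustrative, 4-bit): X = 0b0100
///   XOR(X, NEG(X)) = 0b0100 ^ 0b1100 = 0b1000
///   NOT(BLSMSK(X)) = ~(0b0100 ^ 0b0011) = ~0b0111 = 0b1000
/// so ANDing Y with either value gives the same result, and the right-hand
/// form maps onto BLSMSK + ANDN.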
51547 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51548 SelectionDAG &DAG,
51549 const X86Subtarget &Subtarget) {
51550 using namespace llvm::SDPatternMatch;
51551
51552 EVT VT = And->getValueType(0);
51553 // Make sure this node is a candidate for BMI instructions.
51554 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51555 return SDValue();
51556
51557 SDValue X;
51558 SDValue Y;
51559 if (!sd_match(And, m_And(m_OneUse(m_Xor(m_Value(X),
51560 m_Neg(m_Deferred(X)))),
51561 m_Value(Y))))
51562 return SDValue();
51563
51564 SDValue BLSMSK =
51565 DAG.getNode(ISD::XOR, DL, VT, X,
51566 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51567 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51568 return AndN;
51569}
51570
51571 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51572 SelectionDAG &DAG,
51573 TargetLowering::DAGCombinerInfo &DCI,
51574 const X86Subtarget &ST) {
51575 // cmp(setcc(cc, X), 0)
51576 // brcond ne
51577 // ->
51578 // X
51579 // brcond cc
51580
51581 // sub(setcc(cc, X), 1)
51582 // brcond ne
51583 // ->
51584 // X
51585 // brcond ~cc
51586 //
51587 // if only flag has users
51588
51589 SDValue SetCC = N->getOperand(0);
51590
51591 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51592 return SDValue();
51593
51594 // Check the only user of flag is `brcond ne`.
51595 SDNode *BrCond = *Flag->user_begin();
51596 if (BrCond->getOpcode() != X86ISD::BRCOND)
51597 return SDValue();
51598 unsigned CondNo = 2;
51599 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51600 X86::COND_NE)
51601 return SDValue();
51602
51603 SDValue X = SetCC.getOperand(1);
51604 // sub has two results while X only has one. DAG combine assumes the value
51605 // type matches.
51606 if (N->getOpcode() == X86ISD::SUB)
51607 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51608
51609 SDValue CCN = SetCC.getOperand(0);
51610 X86::CondCode CC =
51611 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51612 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51613 // Update CC for the consumer of the flag.
51614 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51615 // checking if the second condition evaluates to true. When comparing the
51616 // result with 1, we are checking if the second condition evaluates to false.
51617 SmallVector<SDValue> Ops(BrCond->op_values());
51618 if (isNullConstant(N->getOperand(1)))
51619 Ops[CondNo] = CCN;
51620 else if (isOneConstant(N->getOperand(1)))
51621 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51622 else
51623 llvm_unreachable("expect constant 0 or 1");
51624
51625 SDValue NewBrCond =
51626 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51627 // Avoid self-assign error b/c CC1 can be `e/ne`.
51628 if (BrCond != NewBrCond.getNode())
51629 DCI.CombineTo(BrCond, NewBrCond);
51630 return X;
51631}
51632
51633 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51634 TargetLowering::DAGCombinerInfo &DCI,
51635 const X86Subtarget &ST) {
51636 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51637 // ->
51638 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51639
51640 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51641 // ->
51642 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51643 //
51644 // where cflags is determined by cc1.
51645
51646 if (!ST.hasCCMP())
51647 return SDValue();
51648
51649 SDValue SetCC0 = N->getOperand(0);
51650 SDValue SetCC1 = N->getOperand(1);
51651 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51652 SetCC1.getOpcode() != X86ISD::SETCC)
51653 return SDValue();
51654
51655 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51656 SDValue Op = V.getOperand(1);
51657 unsigned Opc = Op.getOpcode();
51658 if (Opc == X86ISD::SUB)
51659 return X86ISD::CCMP;
51660 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51661 return X86ISD::CTEST;
51662 return 0U;
51663 };
51664
51665 unsigned NewOpc = 0;
51666
51667 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51668 // appear on the right.
51669 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51670 std::swap(SetCC0, SetCC1);
51671 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51672 return SDValue();
51673 }
51674
51675 X86::CondCode CC0 =
51676 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51677 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51678 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51679 return SDValue();
51680
51681 bool IsOR = N->getOpcode() == ISD::OR;
51682
51683 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51684 // evaluates to true. So we need to invert CC0 to form SrcCC when the logic
51685 // operator is OR. Similarly for CC1.
51686 SDValue SrcCC =
51687 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51688 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51689 : SetCC0.getOperand(0);
51690 SDValue CC1N = SetCC1.getOperand(0);
51691 X86::CondCode CC1 =
51692 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51693 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51694 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51695 SDLoc DL(N);
51696 SDValue CFlags = DAG.getTargetConstant(
51697 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51698 SDValue Sub = SetCC1.getOperand(1);
51699
51700 // Replace any uses of the old flag produced by SUB/CMP with the new one
51701 // produced by CCMP/CTEST.
51702 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51703 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51704 {Sub.getOperand(0), Sub.getOperand(1),
51705 CFlags, SrcCC, SetCC0.getOperand(1)})
51706 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51707 {Sub.getOperand(0), Sub.getOperand(0),
51708 CFlags, SrcCC, SetCC0.getOperand(1)});
51709
51710 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51711}
51712
51713 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51714 TargetLowering::DAGCombinerInfo &DCI,
51715 const X86Subtarget &Subtarget) {
51716 using namespace SDPatternMatch;
51717
51718 SDValue N0 = N->getOperand(0);
51719 SDValue N1 = N->getOperand(1);
51720 EVT VT = N->getValueType(0);
51721 SDLoc dl(N);
51722 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51723
51724 // If this is SSE1 only convert to FAND to avoid scalarization.
51725 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51726 return DAG.getBitcast(MVT::v4i32,
51727 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51728 DAG.getBitcast(MVT::v4f32, N0),
51729 DAG.getBitcast(MVT::v4f32, N1)));
51730 }
51731
51732 // Use a 32-bit and+zext if upper bits known zero.
51733 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51734 APInt HiMask = APInt::getHighBitsSet(64, 32);
51735 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51736 DAG.MaskedValueIsZero(N0, HiMask)) {
51737 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51738 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51739 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51740 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51741 }
51742 }
51743
51744 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51745 // TODO: Support multiple SrcOps.
51746 if (VT == MVT::i1) {
51747 SmallVector<SDValue, 2> SrcOps;
51748 SmallVector<APInt, 2> SrcPartials;
51749 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51750 SrcOps.size() == 1) {
51751 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51752 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51753 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51754 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51755 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51756 if (Mask) {
51757 assert(SrcPartials[0].getBitWidth() == NumElts &&
51758 "Unexpected partial reduction mask");
51759 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51760 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51761 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51762 }
51763 }
51764 }
51765
51766 // InstCombine converts:
51767 // `(-x << C0) & C1`
51768 // to
51769 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51770 // This saves an IR instruction, but on x86 the neg/shift version is
51771 // preferable, so undo the transform.
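// Worked example (illustrative, i8): C0 = 2, C1 = 0xF0.
// InstCombine rewrites (-x << 2) & 0xF0 as (x * 252) & 0xF0, since
// 252 == Pow2_Ceil(0xF0) - (1 << 2) == 256 - 4. Here we recover the
// neg + shl + and form, which lowers better on x86.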
51772
51773 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51774 // TODO: We don't actually need a splat for this, we just need the checks to
51775 // hold for each element.
51776 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51777 /*AllowTruncation*/ false);
51778 ConstantSDNode *N01C =
51779 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51780 /*AllowTruncation*/ false);
51781 if (N1C && N01C) {
51782 const APInt &MulC = N01C->getAPIntValue();
51783 const APInt &AndC = N1C->getAPIntValue();
51784 APInt MulCLowBit = MulC & (-MulC);
51785 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51786 (MulCLowBit + MulC).isPowerOf2()) {
51787 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51788 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51789 assert(MulCLowBitLog != -1 &&
51790 "Isolated lowbit is somehow not a power of 2!");
51791 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51792 DAG.getConstant(MulCLowBitLog, dl, VT));
51793 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51794 }
51795 }
51796 }
51797
51798 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51799 return SetCC;
51800
51801 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51802 return V;
51803
51804 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51805 return R;
51806
51807 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51808 return R;
51809
51810 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51811 return R;
51812
51813 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51814 DAG, DCI, Subtarget))
51815 return FPLogic;
51816
51817 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51818 return R;
51819
51820 if (DCI.isBeforeLegalizeOps())
51821 return SDValue();
51822
51823 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51824 return R;
51825
51826 if (SDValue R = combineAndNotIntoANDNP(N, dl ,DAG))
51827 return R;
51828
51829 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51830 return ShiftRight;
51831
51832 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51833 return R;
51834
51835 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51836 return R;
51837
51838 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51839 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51840 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51841 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51842 unsigned Opc0 = N0.getOpcode();
51843 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51844 getTargetConstantFromNode(N0.getOperand(1)) &&
51845 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51846 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51847 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51848 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51849 }
51850 }
51851
51852 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
51853 // to make use of predicated selects.
51854 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51855 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51856 SDValue X, Y;
51857 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51858 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51859 sd_match(N, m_And(m_Value(X),
51860 m_OneUse(m_SExt(m_AllOf(
51861 m_Value(Y), m_SpecificVT(CondVT),
51862 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51863 return DAG.getSelect(dl, VT, Y, X,
51864 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51865 }
51866 }
51867
51868 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
51869 // This avoids a slow variable shift (moving the shift amount into ECX etc.).
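// For example (illustrative): (x >> y) & 1 with a non-constant y becomes
// bt x, y; setb, instead of forcing y into CL for a variable shift.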
51870 if (isOneConstant(N1) && N0->hasOneUse()) {
51871 SDValue Src = N0;
51872 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51873 Src.getOpcode() == ISD::TRUNCATE) &&
51874 Src.getOperand(0)->hasOneUse())
51875 Src = Src.getOperand(0);
51876 bool ContainsNOT = false;
51877 X86::CondCode X86CC = X86::COND_B;
51878 // Peek through AND(NOT(SRL(X,Y)),1).
51879 if (isBitwiseNot(Src)) {
51880 Src = Src.getOperand(0);
51881 X86CC = X86::COND_AE;
51882 ContainsNOT = true;
51883 }
51884 if (Src.getOpcode() == ISD::SRL &&
51885 !isa<ConstantSDNode>(Src.getOperand(1))) {
51886 SDValue BitNo = Src.getOperand(1);
51887 Src = Src.getOperand(0);
51888 // Peek through AND(SRL(NOT(X),Y),1).
51889 if (isBitwiseNot(Src)) {
51890 Src = Src.getOperand(0);
51891 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51892 ContainsNOT = true;
51893 }
51894 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51895 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51896 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51897 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51898 }
51899 }
51900
51901 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51902 // Attempt to recursively combine a bitmask AND with shuffles.
51903 SDValue Op(N, 0);
51904 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51905 return Res;
51906
51907 // If either operand is a constant mask, then only the elements that aren't
51908 // zero are actually demanded by the other operand.
51909 auto GetDemandedMasks = [&](SDValue Op) {
51910 APInt UndefElts;
51911 SmallVector<APInt> EltBits;
51912 int NumElts = VT.getVectorNumElements();
51913 int EltSizeInBits = VT.getScalarSizeInBits();
51914 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51915 APInt DemandedElts = APInt::getAllOnes(NumElts);
51916 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51917 EltBits)) {
51918 DemandedBits.clearAllBits();
51919 DemandedElts.clearAllBits();
51920 for (int I = 0; I != NumElts; ++I) {
51921 if (UndefElts[I]) {
51922 // We can't assume an undef src element gives an undef dst - the
51923 // other src might be zero.
51924 DemandedBits.setAllBits();
51925 DemandedElts.setBit(I);
51926 } else if (!EltBits[I].isZero()) {
51927 DemandedBits |= EltBits[I];
51928 DemandedElts.setBit(I);
51929 }
51930 }
51931 }
51932 return std::make_pair(DemandedBits, DemandedElts);
51933 };
51934 APInt Bits0, Elts0;
51935 APInt Bits1, Elts1;
51936 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51937 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51938
51939 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51940 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51941 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51942 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
51943 if (N->getOpcode() != ISD::DELETED_NODE)
51944 DCI.AddToWorklist(N);
51945 return SDValue(N, 0);
51946 }
51947
51948 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
51949 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
51950 if (NewN0 || NewN1)
51951 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
51952 NewN1 ? NewN1 : N1);
51953 }
51954
51955 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
51956 if ((VT.getScalarSizeInBits() % 8) == 0 &&
51957 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
51958 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
51959 SDValue BitMask = N1;
51960 SDValue SrcVec = N0.getOperand(0);
51961 EVT SrcVecVT = SrcVec.getValueType();
51962
51963 // Check that the constant bitmask masks whole bytes.
51964 APInt UndefElts;
51965 SmallVector<APInt, 64> EltBits;
51966 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
51967 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
51968 llvm::all_of(EltBits, [](const APInt &M) {
51969 return M.isZero() || M.isAllOnes();
51970 })) {
51971 unsigned NumElts = SrcVecVT.getVectorNumElements();
51972 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
51973 unsigned Idx = N0.getConstantOperandVal(1);
51974
51975 // Create a root shuffle mask from the byte mask and the extracted index.
51976 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
51977 for (unsigned i = 0; i != Scale; ++i) {
51978 if (UndefElts[i])
51979 continue;
51980 int VecIdx = Scale * Idx + i;
51981 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
51982 }
51983
51984 if (SDValue Shuffle = combineX86ShufflesRecursively(
51985 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
51986 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
51987 /*AllowVariableCrossLaneMask=*/true,
51988 /*AllowVariablePerLaneMask=*/true,
51989 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
51990 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
51991 N0.getOperand(1));
51992 }
51993 }
51994
51995 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
51996 return R;
51997
51998 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
51999 return R;
52000
52001 return SDValue();
52002}
52003
52004// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52005 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52006 SelectionDAG &DAG,
52007 const X86Subtarget &Subtarget) {
52008 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52009
52010 MVT VT = N->getSimpleValueType(0);
52011 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52012 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52013 return SDValue();
52014
52015 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52016 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52017 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52018 return SDValue();
52019
52020 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52021 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52022 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52023 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52024 return SDValue();
52025
52026 // Attempt to extract constant byte masks.
52027 APInt UndefElts0, UndefElts1;
52028 SmallVector<APInt, 32> EltBits0, EltBits1;
52029 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52030 /*AllowWholeUndefs*/ false,
52031 /*AllowPartialUndefs*/ false))
52032 return SDValue();
52033 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52034 /*AllowWholeUndefs*/ false,
52035 /*AllowPartialUndefs*/ false))
52036 return SDValue();
52037
52038 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52039 // TODO - add UNDEF elts support.
52040 if (UndefElts0[i] || UndefElts1[i])
52041 return SDValue();
52042 if (EltBits0[i] != ~EltBits1[i])
52043 return SDValue();
52044 }
52045
52046 if (useVPTERNLOG(Subtarget, VT)) {
52047 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52048 // VPTERNLOG is only available as vXi32/64-bit types.
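// The immediate is the truth table of A ? B : C indexed by (A<<2)|(B<<1)|C:
// minterms {1,3,6,7} are set, giving 0b11001010 == 0xCA.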
52049 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52050 MVT OpVT =
52051 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52052 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52053 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52054 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52055 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52056 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52057 DAG, Subtarget);
52058 return DAG.getBitcast(VT, Res);
52059 }
52060
52061 SDValue X = N->getOperand(0);
52062 SDValue Y =
52063 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52064 DAG.getBitcast(VT, N1.getOperand(0)));
52065 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52066}
52067
52068// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52069// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52070// Waiting for ANDNP combine allows other combines to happen that prevent
52071// matching.
52072static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52073 using namespace SDPatternMatch;
52074 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52075 m_And(m_Deferred(Mask), m_Value(Y))));
52076}
52077
52078// Try to fold:
52079// (or (and (m, y), (pandn m, x)))
52080// into:
52081// (vselect m, x, y)
52082// As a special case, try to fold:
52083// (or (and (m, (sub 0, x)), (pandn m, x)))
52084// into:
52085// (sub (xor X, M), M)
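// The special case relies on M being all-ones or all-zero per element
// (illustrative): if M == -1 then (X ^ -1) - (-1) == ~X + 1 == -X, and if
// M == 0 the expression is just X, matching select(M, -X, X).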
52086 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52087 SelectionDAG &DAG,
52088 const X86Subtarget &Subtarget) {
52089 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52090
52091 EVT VT = N->getValueType(0);
52092 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52093 (VT.is256BitVector() && Subtarget.hasInt256())))
52094 return SDValue();
52095
52096 SDValue X, Y, Mask;
52097 if (!matchLogicBlend(N, X, Y, Mask))
52098 return SDValue();
52099
52100 // Validate that X, Y, and Mask are bitcasts, and see through them.
52101 Mask = peekThroughBitcasts(Mask);
52102 X = peekThroughBitcasts(X);
52103 Y = peekThroughBitcasts(Y);
52104
52105 EVT MaskVT = Mask.getValueType();
52106 unsigned EltBits = MaskVT.getScalarSizeInBits();
52107
52108 // TODO: Attempt to handle floating point cases as well?
52109 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52110 return SDValue();
52111
52112 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52113 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52114 DAG, Subtarget))
52115 return Res;
52116
52117 // PBLENDVB is only available on SSE 4.1.
52118 if (!Subtarget.hasSSE41())
52119 return SDValue();
52120
52121 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52122 if (Subtarget.hasVLX())
52123 return SDValue();
52124
52125 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52126
52127 X = DAG.getBitcast(BlendVT, X);
52128 Y = DAG.getBitcast(BlendVT, Y);
52129 Mask = DAG.getBitcast(BlendVT, Mask);
52130 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52131 return DAG.getBitcast(VT, Mask);
52132}
52133
52134// Helper function for combineOrCmpEqZeroToCtlzSrl
52135// Transforms:
52136// seteq(cmp x, 0)
52137// into:
52138// srl(ctlz x), log2(bitsize(x))
52139// Input pattern is checked by caller.
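// For example (illustrative, i32): x == 0 iff ctlz(x) == 32, and 32 is the
// only possible ctlz result with bit 5 set, so (ctlz(x) >> 5) yields the
// same 0/1 value as the seteq.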
52140 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52141 SDValue Cmp = Op.getOperand(1);
52142 EVT VT = Cmp.getOperand(0).getValueType();
52143 unsigned Log2b = Log2_32(VT.getSizeInBits());
52144 SDLoc dl(Op);
52145 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52146 // The result of the shift is true or false, and on X86, the 32-bit
52147 // encoding of shr and lzcnt is more desirable.
52148 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52149 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52150 DAG.getConstant(Log2b, dl, MVT::i8));
52151 return Scc;
52152}
52153
52154// Try to transform:
52155// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52156// into:
52157 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
52158// Will also attempt to match more generic cases, eg:
52159// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52160// Only applies if the target supports the FastLZCNT feature.
52161 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52162 TargetLowering::DAGCombinerInfo &DCI,
52163 const X86Subtarget &Subtarget) {
52164 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52165 return SDValue();
52166
52167 auto isORCandidate = [](SDValue N) {
52168 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52169 };
52170
52171 // Check that the zero extend is extending to 32-bit or more. The code generated by
52172 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52173 // instructions to clear the upper bits.
52174 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52175 !isORCandidate(N->getOperand(0)))
52176 return SDValue();
52177
52178 // Check the node matches: setcc(eq, cmp 0)
52179 auto isSetCCCandidate = [](SDValue N) {
52180 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52181 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52182 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52183 isNullConstant(N->getOperand(1).getOperand(1)) &&
52184 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52185 };
52186
52187 SDNode *OR = N->getOperand(0).getNode();
52188 SDValue LHS = OR->getOperand(0);
52189 SDValue RHS = OR->getOperand(1);
52190
52191 // Save nodes matching or(or, setcc(eq, cmp 0)).
52192 SmallVector<SDNode *, 2> ORNodes;
52193 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52194 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52195 ORNodes.push_back(OR);
52196 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52197 LHS = OR->getOperand(0);
52198 RHS = OR->getOperand(1);
52199 }
52200
52201 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52202 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52203 !isORCandidate(SDValue(OR, 0)))
52204 return SDValue();
52205
52206 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
52207 // to
52208 // or(srl(ctlz),srl(ctlz)).
52209 // The dag combiner can then fold it into:
52210 // srl(or(ctlz, ctlz)).
52211 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52212 SDValue Ret, NewRHS;
52213 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52214 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52215
52216 if (!Ret)
52217 return SDValue();
52218
52219 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52220 while (!ORNodes.empty()) {
52221 OR = ORNodes.pop_back_val();
52222 LHS = OR->getOperand(0);
52223 RHS = OR->getOperand(1);
52224 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52225 if (RHS->getOpcode() == ISD::OR)
52226 std::swap(LHS, RHS);
52227 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52228 if (!NewRHS)
52229 return SDValue();
52230 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52231 }
52232
52233 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52234}
52235
52236/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52237/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52238/// with CMP+{ADC, SBB}.
52239/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
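/// For example (illustrative): x + (a < b) for unsigned a, b can become
/// cmp a, b; adc x, 0 -- the carry flag already holds the compare result, so
/// no setcc/zext materialization is needed.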
52240static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52241 SDValue X, SDValue Y,
52242 SelectionDAG &DAG,
52243 bool ZeroSecondOpOnly = false) {
52244 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52245 return SDValue();
52246
52247 // Look through a one-use zext.
52248 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52249 Y = Y.getOperand(0);
52250
52251 X86::CondCode CC;
52252 SDValue EFLAGS;
52253 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52254 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52255 EFLAGS = Y.getOperand(1);
52256 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52257 Y.hasOneUse()) {
52258 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52259 }
52260
52261 if (!EFLAGS)
52262 return SDValue();
52263
52264 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52265 // the general case below.
52266 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52267 if (ConstantX && !ZeroSecondOpOnly) {
52268 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52269 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52270 // This is a complicated way to get -1 or 0 from the carry flag:
52271 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52272 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52273 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52274 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52275 EFLAGS);
52276 }
52277
52278 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52279 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52280 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52281 EFLAGS.getValueType().isInteger() &&
52282 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52283 // Swap the operands of a SUB, and we have the same pattern as above.
52284 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52285 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52286 SDValue NewSub = DAG.getNode(
52287 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52288 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52289 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52290 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52291 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52292 NewEFLAGS);
52293 }
52294 }
52295 }
52296
52297 if (CC == X86::COND_B) {
52298 // X + SETB Z --> adc X, 0
52299 // X - SETB Z --> sbb X, 0
52300 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52301 DAG.getVTList(VT, MVT::i32), X,
52302 DAG.getConstant(0, DL, VT), EFLAGS);
52303 }
52304
52305 if (ZeroSecondOpOnly)
52306 return SDValue();
52307
52308 if (CC == X86::COND_A) {
52309 // Try to convert COND_A into COND_B in an attempt to facilitate
52310 // materializing "setb reg".
52311 //
52312 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
52313 // cannot take an immediate as its first operand.
52314 //
52315 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52316 EFLAGS.getValueType().isInteger() &&
52317 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52318 SDValue NewSub =
52319 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52320 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52321 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52322 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52323 DAG.getVTList(VT, MVT::i32), X,
52324 DAG.getConstant(0, DL, VT), NewEFLAGS);
52325 }
52326 }
52327
52328 if (CC == X86::COND_AE) {
52329 // X + SETAE --> sbb X, -1
52330 // X - SETAE --> adc X, -1
52331 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52332 DAG.getVTList(VT, MVT::i32), X,
52333 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52334 }
52335
52336 if (CC == X86::COND_BE) {
52337 // X + SETBE --> sbb X, -1
52338 // X - SETBE --> adc X, -1
52339 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52340 // materializing "setae reg".
52341 //
52342 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
52343 // cannot take an immediate as its first operand.
52344 //
52345 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52346 EFLAGS.getValueType().isInteger() &&
52347 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52348 SDValue NewSub =
52349 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52350 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52351 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52352 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52353 DAG.getVTList(VT, MVT::i32), X,
52354 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52355 }
52356 }
52357
52358 if (CC != X86::COND_E && CC != X86::COND_NE)
52359 return SDValue();
52360
52361 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52362 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52363 !EFLAGS.getOperand(0).getValueType().isInteger())
52364 return SDValue();
52365
52366 SDValue Z = EFLAGS.getOperand(0);
52367 EVT ZVT = Z.getValueType();
52368
52369 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52370 // the general case below.
52371 if (ConstantX) {
52372 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52373 // fake operands:
52374 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52375 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52376 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52377 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52378 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52379 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52380 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52381 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52382 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52383 SDValue(Neg.getNode(), 1));
52384 }
52385
52386 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52387 // with fake operands:
52388 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52389 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52390 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52391 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52392 SDValue One = DAG.getConstant(1, DL, ZVT);
52393 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52394 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52395 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52396 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52397 Cmp1.getValue(1));
52398 }
52399 }
52400
52401 // (cmp Z, 1) sets the carry flag if Z is 0.
52402 SDValue One = DAG.getConstant(1, DL, ZVT);
52403 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52404 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52405
52406 // Add the flags type for ADC/SBB nodes.
52407 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52408
52409 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52410 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52411 if (CC == X86::COND_NE)
52412 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52413 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52414
52415 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52416 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52417 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52418 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52419}
52420
52421/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52422/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52423/// with CMP+{ADC, SBB}.
52424 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52425 SelectionDAG &DAG) {
52426 bool IsSub = N->getOpcode() == ISD::SUB;
52427 SDValue X = N->getOperand(0);
52428 SDValue Y = N->getOperand(1);
52429 EVT VT = N->getValueType(0);
52430
52431 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52432 return ADCOrSBB;
52433
52434 // Commute and try again (negate the result for subtracts).
52435 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52436 if (IsSub)
52437 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52438 return ADCOrSBB;
52439 }
52440
52441 return SDValue();
52442}
52443
52444static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52445 SDValue N0, SDValue N1,
52446 SelectionDAG &DAG) {
52447 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52448
52449 // Delegate to combineAddOrSubToADCOrSBB if we have:
52450 //
52451 // (xor/or (zero_extend (setcc)) imm)
52452 //
52453 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52454 // equivalent to a SUB/ADD, respectively.
52455 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52456 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52457 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52458 bool IsSub = Opc == ISD::XOR;
52459 bool N1COdd = N1C->getZExtValue() & 1;
52460 if (IsSub ? N1COdd : !N1COdd)
52461 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52462 return R;
52463 }
52464 }
52465
52466 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52467 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52468 N0.getOperand(0).getOpcode() == ISD::AND &&
52469 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52470 ISD::isBuildVectorAllOnes(N1.getNode()) &&
52471 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52472 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52473 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52474 N0.getOperand(0).getOperand(1));
52475 }
52476
52477 return SDValue();
52478}
52479
52480 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52481 TargetLowering::DAGCombinerInfo &DCI,
52482 const X86Subtarget &Subtarget) {
52483 SDValue N0 = N->getOperand(0);
52484 SDValue N1 = N->getOperand(1);
52485 EVT VT = N->getValueType(0);
52486 SDLoc dl(N);
52487 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52488
52489 // If this is SSE1 only convert to FOR to avoid scalarization.
52490 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52491 return DAG.getBitcast(MVT::v4i32,
52492 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52493 DAG.getBitcast(MVT::v4f32, N0),
52494 DAG.getBitcast(MVT::v4f32, N1)));
52495 }
52496
52497 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52498 // TODO: Support multiple SrcOps.
52499 if (VT == MVT::i1) {
52500 SmallVector<SDValue, 2> SrcOps;
52501 SmallVector<APInt, 2> SrcPartials;
52502 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52503 SrcOps.size() == 1) {
52504 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52505 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52506 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52507 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52508 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52509 if (Mask) {
52510 assert(SrcPartials[0].getBitWidth() == NumElts &&
52511 "Unexpected partial reduction mask");
52512 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52513 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52514 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52515 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52516 }
52517 }
52518 }
52519
52520 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52521 return SetCC;
52522
52523 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52524 return R;
52525
52526 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52527 return R;
52528
52529 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52530 return R;
52531
52532 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52533 DAG, DCI, Subtarget))
52534 return FPLogic;
52535
52536 if (DCI.isBeforeLegalizeOps())
52537 return SDValue();
52538
52539 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52540 return R;
52541
52542 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52543 return R;
52544
52545 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52546 return R;
52547
52548 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52549 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
52550 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52551 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52552 uint64_t Val = CN->getZExtValue();
52553 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52554 Val == 8) {
52555 SDValue NotCond;
52556 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52557 N0.getOperand(1).hasOneUse()) {
52558 X86::CondCode CC = (X86::CondCode)N0.getConstantOperandVal(0);
52559 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
52560 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52561 } else if (N0.getOpcode() == ISD::SUB &&
52562 isNullConstant(N0.getOperand(0))) {
52563 SDValue Cond = N0.getOperand(1);
52564 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52565 Cond = Cond.getOperand(0);
52566 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52567 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52569 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52570 }
52571 }
52572
52573 if (NotCond) {
52574 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52575 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52576 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52577 return R;
52578 }
52579 }
52580 }
52581 }
52582
52583 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52584 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52585 // iff the upper elements of the non-shifted arg are zero.
52586 // KUNPCK requires 16+ bool vector elements.
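// For example, for v32i1: OR(X, KSHIFTL(Y,16)), where the upper 16 lanes of X
// are known zero, becomes a CONCAT_VECTORS of the low halves of X and Y, which
// can be matched as a single KUNPCKWD of the two 16-bit mask registers.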
52587 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52588 unsigned NumElts = VT.getVectorNumElements();
52589 unsigned HalfElts = NumElts / 2;
52590 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52591 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52592 N1.getConstantOperandAPInt(1) == HalfElts &&
52593 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52594 return DAG.getNode(
52595 ISD::CONCAT_VECTORS, dl, VT,
52596 extractSubVector(N0, 0, DAG, dl, HalfElts),
52597 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52598 }
52599 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52600 N0.getConstantOperandAPInt(1) == HalfElts &&
52601 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52602 return DAG.getNode(
52603 ISD::CONCAT_VECTORS, dl, VT,
52604 extractSubVector(N1, 0, DAG, dl, HalfElts),
52605 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52606 }
52607 }
52608
52609 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52610 // Attempt to recursively combine an OR of shuffles.
52611 SDValue Op(N, 0);
52612 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52613 return Res;
52614
52615 // If either operand is a constant mask, then only the elements that aren't
52616 // all-ones are actually demanded by the other operand.
52617 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52618 APInt UndefElts;
52619 SmallVector<APInt> EltBits;
52620 int NumElts = VT.getVectorNumElements();
52621 int EltSizeInBits = VT.getScalarSizeInBits();
52622 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52623 return false;
52624
52625 APInt DemandedElts = APInt::getZero(NumElts);
52626 for (int I = 0; I != NumElts; ++I)
52627 if (!EltBits[I].isAllOnes())
52628 DemandedElts.setBit(I);
52629
52630 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52631 };
52632 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52633 if (N->getOpcode() != ISD::DELETED_NODE)
52634 DCI.AddToWorklist(N);
52635 return SDValue(N, 0);
52636 }
52637 }
52638
52639 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52640 return R;
52641
52642 return SDValue();
52643}
52644
52645/// Try to turn tests against the signbit in the form of:
52646/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52647/// into:
52648/// SETGT(X, -1)
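///
/// This pattern commonly comes from C code such as ((unsigned)x >> 31) ^ 1
/// being used as a boolean: the shift extracts the sign bit and the xor
/// inverts it, so the result is 1 exactly when x >= 0, i.e. when x > -1.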
52650 SelectionDAG &DAG) {
52651 // This is only worth doing if the output type is i8 or i1.
52652 EVT ResultType = N->getValueType(0);
52653 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52654 return SDValue();
52655
52656 SDValue N0 = N->getOperand(0);
52657 SDValue N1 = N->getOperand(1);
52658
52659 // We should be performing an xor against a truncated shift.
52660 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52661 return SDValue();
52662
52663 // Make sure we are performing an xor against one.
52664 if (!isOneConstant(N1))
52665 return SDValue();
52666
52667 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52668 SDValue Shift = N0.getOperand(0);
52669 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52670 return SDValue();
52671
52672 // Make sure we are truncating from one of i16, i32 or i64.
52673 EVT ShiftTy = Shift.getValueType();
52674 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52675 return SDValue();
52676
52677 // Make sure the shift amount extracts the sign bit.
52678 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52679 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52680 return SDValue();
52681
52682 // Create a greater-than comparison against -1.
52683 // N.B. Using SETGE against 0 works, but we want a canonical-looking
52684 // comparison; using SETGT matches up with what TranslateX86CC expects.
52685 SDValue ShiftOp = Shift.getOperand(0);
52686 EVT ShiftOpTy = ShiftOp.getValueType();
52687 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52688 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52689 *DAG.getContext(), ResultType);
52690 SDValue Cond =
52691 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52692 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52693 if (SetCCResultType != ResultType)
52694 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52695 return Cond;
52696}
52697
52698/// Turn vector tests of the signbit in the form of:
52699/// xor (sra X, elt_size(X)-1), -1
52700/// into:
52701/// pcmpgt X, -1
52702///
52703/// This should be called before type legalization because the pattern may not
52704/// persist after that.
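///
/// For example, for v4i32 X, (xor (sra X, 31), -1) is all-ones in each lane
/// where X is non-negative and all-zeros elsewhere; (pcmpgt X, -1) produces
/// the same per-lane mask in a single instruction.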
52706 const X86Subtarget &Subtarget) {
52707 EVT VT = N->getValueType(0);
52708 if (!VT.isSimple())
52709 return SDValue();
52710
52711 switch (VT.getSimpleVT().SimpleTy) {
52712 // clang-format off
52713 default: return SDValue();
52714 case MVT::v16i8:
52715 case MVT::v8i16:
52716 case MVT::v4i32:
52717 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52718 case MVT::v32i8:
52719 case MVT::v16i16:
52720 case MVT::v8i32:
52721 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52722 // clang-format on
52723 }
52724
52725 // There must be a shift right algebraic before the xor, and the xor must be a
52726 // 'not' operation.
52727 SDValue Shift = N->getOperand(0);
52728 SDValue Ones = N->getOperand(1);
52729 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52731 return SDValue();
52732
52733 // The shift should be smearing the sign bit across each vector element.
52734 auto *ShiftAmt =
52735 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52736 if (!ShiftAmt ||
52737 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52738 return SDValue();
52739
52740 // Create a greater-than comparison against -1. We don't use the more obvious
52741 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52742 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52743}
52744
52745/// Detect patterns of truncation with unsigned saturation:
52746///
52747/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52748/// Return the source value x to be truncated or SDValue() if the pattern was
52749/// not matched.
52750///
52751/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52752/// where C1 >= 0 and C2 is unsigned max of destination type.
52753///
52754/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52755/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52756///
52757/// These two patterns are equivalent to:
52758/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52759/// So return the smax(x, C1) value to be truncated or SDValue() if the
52760/// pattern was not matched.
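///
/// For example, when truncating vXi32 to vXi16, pattern 1 is
/// (trunc (umin x, 65535)) and pattern 2 is (trunc (smin (smax (x, 0), 65535)));
/// both clamp each lane to [0, 65535] before the truncate.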
52762 const SDLoc &DL) {
52763 using namespace llvm::SDPatternMatch;
52764 EVT InVT = In.getValueType();
52765
52766 // Saturation with truncation. We truncate from InVT to VT.
52768 "Unexpected types for truncate operation");
52769
52770 APInt C1, C2;
52772
52773 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52774 // the element size of the destination type.
52775 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52776 C2.isMask(VT.getScalarSizeInBits()))
52777 return UMin;
52778
52779 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52780 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52781 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52782 return SMin;
52783
52784 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52785 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52786 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52787 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52788
52789 return SDValue();
52790}
52791
52792/// Detect patterns of truncation with signed saturation:
52793/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52794/// signed_max_of_dest_type)) to dest_type)
52795/// or:
52796/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52797/// signed_min_of_dest_type)) to dest_type).
52798/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52799/// Return the source value to be truncated or SDValue() if the pattern was not
52800/// matched.
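///
/// For example, when truncating vXi16 to vXi8 the matched clamp is
/// (smin (smax (x, -128), 127)) or the commuted smax/smin form; with
/// MatchPackUS the accepted range becomes [0, 255] instead.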
52801static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52802 using namespace llvm::SDPatternMatch;
52803 unsigned NumDstBits = VT.getScalarSizeInBits();
52804 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52805 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52806
52807 APInt SignedMax, SignedMin;
52808 if (MatchPackUS) {
52809 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52810 SignedMin = APInt::getZero(NumSrcBits);
52811 } else {
52812 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52813 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52814 }
52815
52816 SDValue SMin, SMax;
52817 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52818 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52819 return SMax;
52820
52821 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52822 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52823 return SMin;
52824
52825 return SDValue();
52826}
52827
52829 SelectionDAG &DAG,
52830 const X86Subtarget &Subtarget) {
52831 if (!Subtarget.hasSSE2() || !VT.isVector())
52832 return SDValue();
52833
52834 EVT SVT = VT.getVectorElementType();
52835 EVT InVT = In.getValueType();
52836 EVT InSVT = InVT.getVectorElementType();
52837
52838 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52839 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52840 // and concatenate at the same time. Then we can use a final vpmovuswb to
52841 // clip to 0-255.
52842 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52843 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52844 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52845 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52846 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52847 DL, DAG, Subtarget);
52848 assert(Mid && "Failed to pack!");
52849 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52850 }
52851 }
52852
52853 // vXi32 truncate instructions are available with AVX512F.
52854 // vXi16 truncate instructions are only available with AVX512BW.
52855 // For 256-bit or smaller vectors, we require VLX.
52856 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52857 // If the result type is 256 bits or larger and we have disabled 512-bit
52858 // registers, we should go ahead and use the pack instructions if possible.
52859 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52860 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52861 (InVT.getSizeInBits() > 128) &&
52862 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52863 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52864
52865 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52867 (SVT == MVT::i8 || SVT == MVT::i16) &&
52868 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52869 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52870 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52871 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52872 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52873 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52874 DAG, Subtarget);
52875 assert(Mid && "Failed to pack!");
52877 Subtarget);
52878 assert(V && "Failed to pack!");
52879 return V;
52880 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52881 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52882 Subtarget);
52883 }
52884 if (SDValue SSatVal = detectSSatPattern(In, VT))
52885 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52886 Subtarget);
52887 }
52888
52889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52890 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52891 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52892 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52893 unsigned TruncOpc = 0;
52894 SDValue SatVal;
52895 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52896 SatVal = SSatVal;
52897 TruncOpc = X86ISD::VTRUNCS;
52898 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52899 SatVal = USatVal;
52900 TruncOpc = X86ISD::VTRUNCUS;
52901 }
52902 if (SatVal) {
52903 unsigned ResElts = VT.getVectorNumElements();
52904 // If the input type is less than 512 bits and we don't have VLX, we need
52905 // to widen to 512 bits.
52906 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52907 unsigned NumConcats = 512 / InVT.getSizeInBits();
52908 ResElts *= NumConcats;
52909 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52910 ConcatOps[0] = SatVal;
52911 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52912 NumConcats * InVT.getVectorNumElements());
52913 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52914 }
52915 // Widen the result if its narrower than 128 bits.
52916 if (ResElts * SVT.getSizeInBits() < 128)
52917 ResElts = 128 / SVT.getSizeInBits();
52918 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52919 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52920 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52921 DAG.getVectorIdxConstant(0, DL));
52922 }
52923 }
52924
52925 return SDValue();
52926}
52927
52929 SelectionDAG &DAG,
52931 const X86Subtarget &Subtarget) {
52932 auto *Ld = cast<LoadSDNode>(N);
52933 EVT RegVT = Ld->getValueType(0);
52934 SDValue Ptr = Ld->getBasePtr();
52935 SDValue Chain = Ld->getChain();
52936 ISD::LoadExtType Ext = Ld->getExtensionType();
52937
52938 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52939 return SDValue();
52940
52941 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
52942 return SDValue();
52943
52945 if (!LdC)
52946 return SDValue();
52947
52948 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
52949 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
52950 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
52951 if (Undefs[I])
52952 continue;
52953 if (UserUndefs[I] || Bits[I] != UserBits[I])
52954 return false;
52955 }
52956 return true;
52957 };
52958
52959 // Look through all other loads/broadcasts in the chain for another constant
52960 // pool entry.
52961 for (SDNode *User : Chain->users()) {
52962 auto *UserLd = dyn_cast<MemSDNode>(User);
52963 if (User != N && UserLd &&
52964 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
52965 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
52967 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
52968 User->getValueSizeInBits(0).getFixedValue() >
52969 RegVT.getFixedSizeInBits()) {
52970 EVT UserVT = User->getValueType(0);
52971 SDValue UserPtr = UserLd->getBasePtr();
52972 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
52973
52974 // See if we are loading a constant that matches in the lower
52975 // bits of a longer constant (but from a different constant pool ptr).
52976 if (UserC && UserPtr != Ptr) {
52977 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
52978 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
52979 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
52980 APInt Undefs, UserUndefs;
52981 SmallVector<APInt> Bits, UserBits;
52982 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
52983 UserVT.getScalarSizeInBits());
52984 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
52985 Bits) &&
52987 UserUndefs, UserBits)) {
52988 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
52990 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
52991 RegVT.getSizeInBits());
52992 Extract = DAG.getBitcast(RegVT, Extract);
52993 return DCI.CombineTo(N, Extract, SDValue(User, 1));
52994 }
52995 }
52996 }
52997 }
52998 }
52999 }
53000
53001 return SDValue();
53002}
53003
53006 const X86Subtarget &Subtarget) {
53007 auto *Ld = cast<LoadSDNode>(N);
53008 EVT RegVT = Ld->getValueType(0);
53009 EVT MemVT = Ld->getMemoryVT();
53010 SDLoc dl(Ld);
53011 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53012
53013 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53014 // into two 16-byte operations. Also split non-temporal aligned loads on
53015 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
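// For example, a 32-byte load on such a target is rebuilt below as two
// 16-byte loads joined by a CONCAT_VECTORS, so each half can use the faster
// (or genuinely non-temporal) 16-byte form.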
53016 ISD::LoadExtType Ext = Ld->getExtensionType();
53017 unsigned Fast;
53018 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53019 Ext == ISD::NON_EXTLOAD &&
53020 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53021 Ld->getAlign() >= Align(16)) ||
53022 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53023 *Ld->getMemOperand(), &Fast) &&
53024 !Fast))) {
53025 unsigned NumElems = RegVT.getVectorNumElements();
53026 if (NumElems < 2)
53027 return SDValue();
53028
53029 unsigned HalfOffset = 16;
53030 SDValue Ptr1 = Ld->getBasePtr();
53031 SDValue Ptr2 =
53032 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53033 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53034 NumElems / 2);
53035 SDValue Load1 =
53036 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53037 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53038 SDValue Load2 =
53039 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53040 Ld->getPointerInfo().getWithOffset(HalfOffset),
53041 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53042 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53043 Load1.getValue(1), Load2.getValue(1));
53044
53045 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53046 return DCI.CombineTo(N, NewVec, TF, true);
53047 }
53048
53049 // Bool vector load - attempt to cast to an integer, as we have good
53050 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53051 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53052 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53053 unsigned NumElts = RegVT.getVectorNumElements();
53054 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53055 if (TLI.isTypeLegal(IntVT)) {
53056 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53057 Ld->getPointerInfo(), Ld->getBaseAlign(),
53058 Ld->getMemOperand()->getFlags());
53059 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53060 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53061 }
53062 }
53063
53064 // If we also broadcast this vector to a wider type, then just extract the
53065 // lowest subvector.
53066 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53067 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53068 SDValue Ptr = Ld->getBasePtr();
53069 SDValue Chain = Ld->getChain();
53070 for (SDNode *User : Chain->users()) {
53071 auto *UserLd = dyn_cast<MemSDNode>(User);
53072 if (User != N && UserLd &&
53073 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53074 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53075 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53076 User->hasAnyUseOfValue(0) &&
53077 User->getValueSizeInBits(0).getFixedValue() >
53078 RegVT.getFixedSizeInBits()) {
53080 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53081 RegVT.getSizeInBits());
53082 Extract = DAG.getBitcast(RegVT, Extract);
53083 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53084 }
53085 }
53086 }
53087
53088 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53089 return V;
53090
53091 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53092 unsigned AddrSpace = Ld->getAddressSpace();
53093 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53094 AddrSpace == X86AS::PTR32_UPTR) {
53095 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53096 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53097 SDValue Cast =
53098 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53099 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53100 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53101 Ld->getMemOperand()->getFlags());
53102 }
53103 }
53104
53105 return SDValue();
53106}
53107
53108/// If V is a build vector of boolean constants and exactly one of those
53109/// constants is true, return the operand index of that true element.
53110/// Otherwise, return -1.
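///
/// For example, <i1 0, i1 0, i1 1, i1 0> gives 2, while an all-false mask or a
/// mask with more than one true element gives -1; undef elements are skipped.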
53111static int getOneTrueElt(SDValue V) {
53112 // This needs to be a build vector of booleans.
53113 // TODO: Checking for the i1 type matches the IR definition for the mask,
53114 // but the mask check could be loosened to i8 or other types. That might
53115 // also require checking more than 'allOnesValue'; e.g., the x86 HW
53116 // instructions only require that the MSB is set for each mask element.
53117 // The ISD::MSTORE comments/definition do not specify how the mask operand
53118 // is formatted.
53119 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53120 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53121 return -1;
53122
53123 int TrueIndex = -1;
53124 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53125 for (unsigned i = 0; i < NumElts; ++i) {
53126 const SDValue &Op = BV->getOperand(i);
53127 if (Op.isUndef())
53128 continue;
53129 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53130 if (!ConstNode)
53131 return -1;
53132 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53133 // If we already found a one, this is too many.
53134 if (TrueIndex >= 0)
53135 return -1;
53136 TrueIndex = i;
53137 }
53138 }
53139 return TrueIndex;
53140}
53141
53142/// Given a masked memory load/store operation, return true if it has exactly
53143/// one mask bit set. If so, also return the memory address of
53144/// the scalar element to load/store, the vector index to insert/extract that
53145/// scalar element, and the alignment for the scalar memory access.
53147 SelectionDAG &DAG, SDValue &Addr,
53148 SDValue &Index, Align &Alignment,
53149 unsigned &Offset) {
53150 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53151 if (TrueMaskElt < 0)
53152 return false;
53153
53154 // Get the address of the one scalar element that is specified by the mask
53155 // using the appropriate offset from the base pointer.
53156 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53157 Offset = 0;
53158 Addr = MaskedOp->getBasePtr();
53159 if (TrueMaskElt != 0) {
53160 Offset = TrueMaskElt * EltVT.getStoreSize();
53162 SDLoc(MaskedOp));
53163 }
53164
53165 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53166 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53167 return true;
53168}
53169
53170/// If exactly one element of the mask is set for a non-extending masked load,
53171/// it is a scalar load and vector insert.
53172/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53173/// mask have already been optimized in IR, so we don't bother with those here.
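///
/// For example, a v4f32 masked load whose mask is <0,0,1,0> becomes an f32
/// load from the base pointer plus 8 bytes, followed by an insert into lane 2
/// of the pass-through vector.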
53174static SDValue
53177 const X86Subtarget &Subtarget) {
53178 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53179 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53180 // However, some target hooks may need to be added to know when the transform
53181 // is profitable. Endianness would also have to be considered.
53182
53183 SDValue Addr, VecIndex;
53184 Align Alignment;
53185 unsigned Offset;
53186 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53187 return SDValue();
53188
53189 // Load the one scalar element that is specified by the mask using the
53190 // appropriate offset from the base pointer.
53191 SDLoc DL(ML);
53192 EVT VT = ML->getValueType(0);
53193 EVT EltVT = VT.getVectorElementType();
53194
53195 EVT CastVT = VT;
53196 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53197 EltVT = MVT::f64;
53198 CastVT = VT.changeVectorElementType(EltVT);
53199 }
53200
53201 SDValue Load =
53202 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53203 ML->getPointerInfo().getWithOffset(Offset),
53204 Alignment, ML->getMemOperand()->getFlags());
53205
53206 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53207
53208 // Insert the loaded element into the appropriate place in the vector.
53209 SDValue Insert =
53210 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53211 Insert = DAG.getBitcast(VT, Insert);
53212 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53213}
53214
53215static SDValue
53218 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53219 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53220 return SDValue();
53221
53222 SDLoc DL(ML);
53223 EVT VT = ML->getValueType(0);
53224
53225 // If we are loading the first and last elements of a vector, it is safe and
53226 // always faster to load the whole vector. Replace the masked load with a
53227 // vector load and select.
53228 unsigned NumElts = VT.getVectorNumElements();
53229 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53230 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53231 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53232 if (LoadFirstElt && LoadLastElt) {
53233 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53234 ML->getMemOperand());
53235 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53236 ML->getPassThru());
53237 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53238 }
53239
53240 // Convert a masked load with a constant mask into a masked load and a select.
53241 // This allows the select operation to use a faster kind of select instruction
53242 // (for example, vblendvps -> vblendps).
53243
53244 // Don't try this if the pass-through operand is already undefined. That would
53245 // cause an infinite loop because that's what we're about to create.
53246 if (ML->getPassThru().isUndef())
53247 return SDValue();
53248
53249 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53250 return SDValue();
53251
53252 // The new masked load has an undef pass-through operand. The select uses the
53253 // original pass-through operand.
53254 SDValue NewML = DAG.getMaskedLoad(
53255 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53256 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53257 ML->getAddressingMode(), ML->getExtensionType());
53258 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53259 ML->getPassThru());
53260
53261 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53262}
53263
53266 const X86Subtarget &Subtarget) {
53267 auto *Mld = cast<MaskedLoadSDNode>(N);
53268
53269 // TODO: Expanding load with constant mask may be optimized as well.
53270 if (Mld->isExpandingLoad())
53271 return SDValue();
53272
53273 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53274 if (SDValue ScalarLoad =
53275 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53276 return ScalarLoad;
53277
53278 // TODO: Do some AVX512 subsets benefit from this transform?
53279 if (!Subtarget.hasAVX512())
53280 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53281 return Blend;
53282 }
53283
53284 // If the mask value has been legalized to a non-boolean vector, try to
53285 // simplify ops leading up to it. We only demand the MSB of each lane.
53286 SDValue Mask = Mld->getMask();
53287 if (Mask.getScalarValueSizeInBits() != 1) {
53288 EVT VT = Mld->getValueType(0);
53289 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53291 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53292 if (N->getOpcode() != ISD::DELETED_NODE)
53293 DCI.AddToWorklist(N);
53294 return SDValue(N, 0);
53295 }
53296 if (SDValue NewMask =
53298 return DAG.getMaskedLoad(
53299 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53300 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53301 Mld->getAddressingMode(), Mld->getExtensionType());
53302 }
53303
53304 return SDValue();
53305}
53306
53307/// If exactly one element of the mask is set for a non-truncating masked store,
53308/// it is a vector extract and scalar store.
53309/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53310/// mask have already been optimized in IR, so we don't bother with those here.
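///
/// For example, a v4i32 masked store whose mask is <0,1,0,0> becomes an
/// extract of lane 1 followed by a plain i32 store to the base pointer plus 4
/// bytes.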
53312 SelectionDAG &DAG,
53313 const X86Subtarget &Subtarget) {
53314 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53315 // However, some target hooks may need to be added to know when the transform
53316 // is profitable. Endianness would also have to be considered.
53317
53318 SDValue Addr, VecIndex;
53319 Align Alignment;
53320 unsigned Offset;
53321 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53322 return SDValue();
53323
53324 // Extract the one scalar element that is actually being stored.
53325 SDLoc DL(MS);
53326 SDValue Value = MS->getValue();
53327 EVT VT = Value.getValueType();
53328 EVT EltVT = VT.getVectorElementType();
53329 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53330 EltVT = MVT::f64;
53331 EVT CastVT = VT.changeVectorElementType(EltVT);
53332 Value = DAG.getBitcast(CastVT, Value);
53333 }
53334 SDValue Extract =
53335 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53336
53337 // Store that element at the appropriate offset from the base pointer.
53338 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53340 Alignment, MS->getMemOperand()->getFlags());
53341}
53342
53345 const X86Subtarget &Subtarget) {
53346 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53347 if (Mst->isCompressingStore())
53348 return SDValue();
53349
53350 EVT VT = Mst->getValue().getValueType();
53351 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53352
53353 if (Mst->isTruncatingStore())
53354 return SDValue();
53355
53356 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53357 return ScalarStore;
53358
53359 // If the mask value has been legalized to a non-boolean vector, try to
53360 // simplify ops leading up to it. We only demand the MSB of each lane.
53361 SDValue Mask = Mst->getMask();
53362 if (Mask.getScalarValueSizeInBits() != 1) {
53364 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53365 if (N->getOpcode() != ISD::DELETED_NODE)
53366 DCI.AddToWorklist(N);
53367 return SDValue(N, 0);
53368 }
53369 if (SDValue NewMask =
53371 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53372 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53373 Mst->getMemoryVT(), Mst->getMemOperand(),
53374 Mst->getAddressingMode());
53375 }
53376
53377 SDValue Value = Mst->getValue();
53378 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53379 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53380 Mst->getMemoryVT())) {
53381 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53382 Mst->getBasePtr(), Mst->getOffset(), Mask,
53383 Mst->getMemoryVT(), Mst->getMemOperand(),
53384 Mst->getAddressingMode(), true);
53385 }
53386
53387 return SDValue();
53388}
53389
53392 const X86Subtarget &Subtarget) {
53393 StoreSDNode *St = cast<StoreSDNode>(N);
53394 EVT StVT = St->getMemoryVT();
53395 SDLoc dl(St);
53396 SDValue StoredVal = St->getValue();
53397 EVT VT = StoredVal.getValueType();
53398 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53399
53400 // Convert a store of vXi1 into a store of iX and a bitcast.
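// For example, a store of v8i1 becomes a bitcast of the value to i8 followed
// by a plain i8 store.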
53401 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53402 VT.getVectorElementType() == MVT::i1) {
53403
53405 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53406
53407 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53408 St->getPointerInfo(), St->getBaseAlign(),
53409 St->getMemOperand()->getFlags());
53410 }
53411
53412 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53413 // This will avoid a copy to k-register.
53414 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53415 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53416 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53417 SDValue Val = StoredVal.getOperand(0);
53418 // We must store zeros to the unused bits.
53419 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53420 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53421 St->getPointerInfo(), St->getBaseAlign(),
53422 St->getMemOperand()->getFlags());
53423 }
53424
53425 // Widen v2i1/v4i1 stores to v8i1.
53426 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53427 Subtarget.hasAVX512()) {
53428 unsigned NumConcats = 8 / VT.getVectorNumElements();
53429 // We must store zeros to the unused bits.
53430 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53431 Ops[0] = StoredVal;
53432 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53433 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53434 St->getPointerInfo(), St->getBaseAlign(),
53435 St->getMemOperand()->getFlags());
53436 }
53437
53438 // Turn vXi1 stores of constants into a scalar store.
53439 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53440 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53442 // If it's a v64i1 store without 64-bit support, we need two stores.
53443 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53444 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53445 StoredVal->ops().slice(0, 32));
53447 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53448 StoredVal->ops().slice(32, 32));
53450
53451 SDValue Ptr0 = St->getBasePtr();
53452 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53453
53454 SDValue Ch0 =
53455 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53456 St->getBaseAlign(), St->getMemOperand()->getFlags());
53457 SDValue Ch1 = DAG.getStore(
53458 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53459 St->getBaseAlign(), St->getMemOperand()->getFlags());
53460 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53461 }
53462
53463 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53464 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53465 St->getPointerInfo(), St->getBaseAlign(),
53466 St->getMemOperand()->getFlags());
53467 }
53468
53469 // Convert scalar fabs/fneg load-store to integer equivalents.
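// For example, storing (fneg (load f32 p)) is rebuilt with the loaded value
// reinterpreted as i32, the sign bit flipped with an XOR against 0x80000000,
// and a plain i32 store; fabs instead clears the sign bit with an AND against
// 0x7FFFFFFF.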
53470 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53471 (StoredVal.getOpcode() == ISD::FABS ||
53472 StoredVal.getOpcode() == ISD::FNEG) &&
53473 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53474 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53475 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53476 if (TLI.isTypeLegal(IntVT)) {
53478 unsigned SignOp = ISD::XOR;
53479 if (StoredVal.getOpcode() == ISD::FABS) {
53480 SignMask = ~SignMask;
53481 SignOp = ISD::AND;
53482 }
53483 SDValue LogicOp = DAG.getNode(
53484 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53485 DAG.getConstant(SignMask, dl, IntVT));
53486 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53487 St->getPointerInfo(), St->getBaseAlign(),
53488 St->getMemOperand()->getFlags());
53489 }
53490 }
53491
53492 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53493 // Sandy Bridge, perform two 16-byte stores.
53494 unsigned Fast;
53495 if (VT.is256BitVector() && StVT == VT &&
53496 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53497 *St->getMemOperand(), &Fast) &&
53498 !Fast) {
53499 unsigned NumElems = VT.getVectorNumElements();
53500 if (NumElems < 2)
53501 return SDValue();
53502
53503 return splitVectorStore(St, DAG);
53504 }
53505
53506 // Split under-aligned vector non-temporal stores.
53507 if (St->isNonTemporal() && StVT == VT &&
53508 St->getAlign().value() < VT.getStoreSize()) {
53509 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53510 // vectors or the legalizer can scalarize it to use MOVNTI.
53511 if (VT.is256BitVector() || VT.is512BitVector()) {
53512 unsigned NumElems = VT.getVectorNumElements();
53513 if (NumElems < 2)
53514 return SDValue();
53515 return splitVectorStore(St, DAG);
53516 }
53517
53518 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53519 // to use MOVNTI.
53520 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53521 MVT NTVT = Subtarget.hasSSE4A()
53522 ? MVT::v2f64
53523 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53524 return scalarizeVectorStore(St, NTVT, DAG);
53525 }
53526 }
53527
53528 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53529 // supported, but AVX512F is, by extending to v16i32 and truncating.
53530 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53531 St->getValue().getOpcode() == ISD::TRUNCATE &&
53532 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53533 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53534 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53535 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53536 St->getValue().getOperand(0));
53537 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53538 MVT::v16i8, St->getMemOperand());
53539 }
53540
53541 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53542 if (!St->isTruncatingStore() &&
53543 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53544 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53545 StoredVal.hasOneUse() &&
53546 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53547 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53548 return EmitTruncSStore(IsSigned, St->getChain(),
53549 dl, StoredVal.getOperand(0), St->getBasePtr(),
53550 VT, St->getMemOperand(), DAG);
53551 }
53552
53553 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53554 if (!St->isTruncatingStore()) {
53555 auto IsExtractedElement = [](SDValue V) {
53556 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53557 V = V.getOperand(0);
53558 unsigned Opc = V.getOpcode();
53560 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53561 V.getOperand(0).hasOneUse())
53562 return V.getOperand(0);
53563 return SDValue();
53564 };
53565 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53566 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53567 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53568 SDValue Src = Trunc.getOperand(0);
53569 MVT DstVT = Trunc.getSimpleValueType();
53570 MVT SrcVT = Src.getSimpleValueType();
53571 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53572 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53573 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53574 if (NumTruncBits == VT.getSizeInBits() &&
53575 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53576 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53577 TruncVT, St->getMemOperand());
53578 }
53579 }
53580 }
53581 }
53582
53583 // Optimize trunc store (of multiple scalars) to shuffle and store.
53584 // First, pack all of the elements in one place. Next, store to memory
53585 // in fewer chunks.
53586 if (St->isTruncatingStore() && VT.isVector()) {
53587 if (TLI.isTruncStoreLegal(VT, StVT)) {
53588 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53589 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53590 dl, Val, St->getBasePtr(),
53591 St->getMemoryVT(), St->getMemOperand(), DAG);
53592 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53593 DAG, dl))
53594 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53595 dl, Val, St->getBasePtr(),
53596 St->getMemoryVT(), St->getMemOperand(), DAG);
53597 }
53598
53599 return SDValue();
53600 }
53601
53602 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53603 unsigned AddrSpace = St->getAddressSpace();
53604 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53605 AddrSpace == X86AS::PTR32_UPTR) {
53606 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53607 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53608 SDValue Cast =
53609 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53610 return DAG.getTruncStore(
53611 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53612 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53613 }
53614 }
53615
53616 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53617 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
53618 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53619 Subtarget.hasCF() && St->isSimple()) {
53620 SDValue Cmov;
53621 if (StoredVal.getOpcode() == X86ISD::CMOV)
53622 Cmov = StoredVal;
53623 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53624 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53625 Cmov = StoredVal.getOperand(0);
53626 else
53627 return SDValue();
53628
53629 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53630 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53631 return SDValue();
53632
53633 bool InvertCC = false;
53634 SDValue V = SDValue(Ld, 0);
53635 if (V == Cmov.getOperand(1))
53636 InvertCC = true;
53637 else if (V != Cmov.getOperand(0))
53638 return SDValue();
53639
53640 SDVTList Tys = DAG.getVTList(MVT::Other);
53641 SDValue CC = Cmov.getOperand(2);
53642 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53643 if (InvertCC)
53644 CC = DAG.getTargetConstant(
53647 dl, MVT::i8);
53648 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53649 Cmov.getOperand(3)};
53650 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53651 St->getMemOperand());
53652 }
53653
53654 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53655 // the FP state in cases where an emms may be missing.
53656 // A preferable solution to the general problem is to figure out the right
53657 // places to insert EMMS. This qualifies as a quick hack.
53658
53659 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53660 if (VT.getSizeInBits() != 64)
53661 return SDValue();
53662
53663 const Function &F = DAG.getMachineFunction().getFunction();
53664 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53665 bool F64IsLegal =
53666 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53667
53668 if (!F64IsLegal || Subtarget.is64Bit())
53669 return SDValue();
53670
53671 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53672 cast<LoadSDNode>(St->getValue())->isSimple() &&
53673 St->getChain().hasOneUse() && St->isSimple()) {
53674 auto *Ld = cast<LoadSDNode>(St->getValue());
53675
53676 if (!ISD::isNormalLoad(Ld))
53677 return SDValue();
53678
53679 // Avoid the transformation if there are multiple uses of the loaded value.
53680 if (!Ld->hasNUsesOfValue(1, 0))
53681 return SDValue();
53682
53683 SDLoc LdDL(Ld);
53684 SDLoc StDL(N);
53685
53686 // Remove any range metadata as we're converting to f64 load/store.
53687 Ld->getMemOperand()->clearRanges();
53688
53689 // Lower to a single movq load/store pair.
53690 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53691 Ld->getBasePtr(), Ld->getMemOperand());
53692
53693 // Make sure new load is placed in same chain order.
53694 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53695 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53696 St->getMemOperand());
53697 }
53698
53699 // This is similar to the above case, but here we handle a scalar 64-bit
53700 // integer store that is extracted from a vector on a 32-bit target.
53701 // If we have SSE2, then we can treat it like a floating-point double
53702 // to get past legalization. The execution dependencies fixup pass will
53703 // choose the optimal machine instruction for the store if this really is
53704 // an integer or v2f32 rather than an f64.
53705 if (VT == MVT::i64 &&
53707 SDValue OldExtract = St->getOperand(1);
53708 SDValue ExtOp0 = OldExtract.getOperand(0);
53709 unsigned VecSize = ExtOp0.getValueSizeInBits();
53710 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53711 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53712 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53713 BitCast, OldExtract.getOperand(1));
53714 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53715 St->getPointerInfo(), St->getBaseAlign(),
53716 St->getMemOperand()->getFlags());
53717 }
53718
53719 return SDValue();
53720}
53721
53724 const X86Subtarget &Subtarget) {
53725 auto *St = cast<MemIntrinsicSDNode>(N);
53726
53727 SDValue StoredVal = N->getOperand(1);
53728 MVT VT = StoredVal.getSimpleValueType();
53729 EVT MemVT = St->getMemoryVT();
53730
53731 // Figure out which elements we demand.
53732 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53733 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53734
53735 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53736 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53737 if (N->getOpcode() != ISD::DELETED_NODE)
53738 DCI.AddToWorklist(N);
53739 return SDValue(N, 0);
53740 }
53741
53742 return SDValue();
53743}
53744
53745/// Return 'true' if this vector operation is "horizontal"
53746/// and return the operands for the horizontal operation in LHS and RHS. A
53747/// horizontal operation performs the binary operation on successive elements
53748/// of its first operand, then on successive elements of its second operand,
53749/// returning the resulting values in a vector. For example, if
53750/// A = < float a0, float a1, float a2, float a3 >
53751/// and
53752/// B = < float b0, float b1, float b2, float b3 >
53753/// then the result of doing a horizontal operation on A and B is
53754/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53755/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53756/// A horizontal-op B, for some already available A and B, and if so then LHS is
53757/// set to A, RHS to B, and the routine returns 'true'.
53758static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53759 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53760 bool IsCommutative,
53761 SmallVectorImpl<int> &PostShuffleMask,
53762 bool ForceHorizOp) {
53763 // If either operand is undef, bail out. The binop should be simplified.
53764 if (LHS.isUndef() || RHS.isUndef())
53765 return false;
53766
53767 // Look for the following pattern:
53768 // A = < float a0, float a1, float a2, float a3 >
53769 // B = < float b0, float b1, float b2, float b3 >
53770 // and
53771 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53772 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53773 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53774 // which is A horizontal-op B.
53775
53776 MVT VT = LHS.getSimpleValueType();
53777 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53778 "Unsupported vector type for horizontal add/sub");
53779 unsigned NumElts = VT.getVectorNumElements();
53780
53781 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53782 SmallVectorImpl<int> &ShuffleMask) {
53783 bool UseSubVector = false;
53784 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53785 Op.getOperand(0).getValueType().is256BitVector() &&
53786 llvm::isNullConstant(Op.getOperand(1))) {
53787 Op = Op.getOperand(0);
53788 UseSubVector = true;
53789 }
53791 SmallVector<int, 16> SrcMask, ScaledMask;
53793 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53794 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53795 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53796 })) {
53797 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53798 if (!UseSubVector && SrcOps.size() <= 2 &&
53799 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53800 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53801 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53802 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53803 }
53804 if (UseSubVector && SrcOps.size() == 1 &&
53805 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53806 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53807 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53808 ShuffleMask.assign(Mask.begin(), Mask.end());
53809 }
53810 }
53811 };
53812
53813 // View LHS in the form
53814 // LHS = VECTOR_SHUFFLE A, B, LMask
53815 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53816 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53817 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53818 SDValue A, B;
53820 GetShuffle(LHS, A, B, LMask);
53821
53822 // Likewise, view RHS in the form
53823 // RHS = VECTOR_SHUFFLE C, D, RMask
53824 SDValue C, D;
53826 GetShuffle(RHS, C, D, RMask);
53827
53828 // At least one of the operands should be a vector shuffle.
53829 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53830 if (NumShuffles == 0)
53831 return false;
53832
53833 if (LMask.empty()) {
53834 A = LHS;
53835 for (unsigned i = 0; i != NumElts; ++i)
53836 LMask.push_back(i);
53837 }
53838
53839 if (RMask.empty()) {
53840 C = RHS;
53841 for (unsigned i = 0; i != NumElts; ++i)
53842 RMask.push_back(i);
53843 }
53844
53845 // If we have a unary mask, ensure the other op is set to null.
53846 if (isUndefOrInRange(LMask, 0, NumElts))
53847 B = SDValue();
53848 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53849 A = SDValue();
53850
53851 if (isUndefOrInRange(RMask, 0, NumElts))
53852 D = SDValue();
53853 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53854 C = SDValue();
53855
53856 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53857 // RHS operands and shuffle mask.
53858 if (A != C) {
53859 std::swap(C, D);
53861 }
53862 // Check that the shuffles are both shuffling the same vectors.
53863 if (!(A == C && B == D))
53864 return false;
53865
53866 PostShuffleMask.clear();
53867 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53868
53869 // LHS and RHS are now:
53870 // LHS = shuffle A, B, LMask
53871 // RHS = shuffle A, B, RMask
53872 // Check that the masks correspond to performing a horizontal operation.
53873 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53874 // so we just repeat the inner loop if this is a 256-bit op.
53875 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53876 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53877 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53878 assert((NumEltsPer128BitChunk % 2 == 0) &&
53879 "Vector type should have an even number of elements in each lane");
53880 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53881 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53882 // Ignore undefined components.
53883 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53884 if (LIdx < 0 || RIdx < 0 ||
53885 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53886 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53887 continue;
53888
53889 // Check that successive odd/even elements are being operated on. If not,
53890 // this is not a horizontal operation.
53891 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53892 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53893 return false;
53894
53895 // Compute the post-shuffle mask index based on where the element
53896 // is stored in the HOP result, and where it needs to be moved to.
53897 int Base = LIdx & ~1u;
53898 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53899 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53900
53901 // The low half of the 128-bit result must choose from A.
53902 // The high half of the 128-bit result must choose from B,
53903 // unless B is undef. In that case, we are always choosing from A.
53904 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53905 Index += NumEltsPer64BitChunk;
53906 PostShuffleMask[i + j] = Index;
53907 }
53908 }
53909
53910 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53911 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53912
53913 bool IsIdentityPostShuffle =
53914 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53915 if (IsIdentityPostShuffle)
53916 PostShuffleMask.clear();
53917
53918 // Avoid 128-bit multi-lane shuffles if pre-AVX2 and FP (integer will split).
53919 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53920 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53921 return false;
53922
53923 // If the source nodes are already used in HorizOps then always accept this.
53924 // Shuffle folding should merge these back together.
53925 auto FoundHorizUser = [&](SDNode *User) {
53926 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53927 };
53928 ForceHorizOp =
53929 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53930 llvm::any_of(NewRHS->users(), FoundHorizUser));
53931
53932 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53933 // shuffle the result.
53934 if (!ForceHorizOp &&
53935 !shouldUseHorizontalOp(NewLHS == NewRHS &&
53936 (NumShuffles < 2 || !IsIdentityPostShuffle),
53937 DAG, Subtarget))
53938 return false;
53939
53940 LHS = DAG.getBitcast(VT, NewLHS);
53941 RHS = DAG.getBitcast(VT, NewRHS);
53942 return true;
53943}
53944
53945// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
53947 const X86Subtarget &Subtarget) {
53948 EVT VT = N->getValueType(0);
53949 unsigned Opcode = N->getOpcode();
53950 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
53951 SmallVector<int, 8> PostShuffleMask;
53952
53953 auto MergableHorizOp = [N](unsigned HorizOpcode) {
53954 return N->hasOneUse() &&
53955 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
53956 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
53957 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
53958 };
53959
53960 switch (Opcode) {
53961 case ISD::FADD:
53962 case ISD::FSUB:
53963 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
53964 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
53965 SDValue LHS = N->getOperand(0);
53966 SDValue RHS = N->getOperand(1);
53967 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
53968 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53969 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53970 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
53971 if (!PostShuffleMask.empty())
53972 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53973 DAG.getUNDEF(VT), PostShuffleMask);
53974 return HorizBinOp;
53975 }
53976 }
53977 break;
53978 case ISD::ADD:
53979 case ISD::SUB:
53980 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
53981 VT == MVT::v16i16 || VT == MVT::v8i32)) {
53982 SDValue LHS = N->getOperand(0);
53983 SDValue RHS = N->getOperand(1);
53984 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
53985 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53986 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53987 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
53988 ArrayRef<SDValue> Ops) {
53989 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
53990 };
53991 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
53992 {LHS, RHS}, HOpBuilder);
53993 if (!PostShuffleMask.empty())
53994 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53995 DAG.getUNDEF(VT), PostShuffleMask);
53996 return HorizBinOp;
53997 }
53998 }
53999 break;
54000 }
54001
54002 return SDValue();
54003}
54004
54005// Try to combine the following nodes
54006// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54007// <i32 -2147483648[float -0.000000e+00]> 0
54008// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54009// <(load 4 from constant-pool)> t0, t29
54010// [t30: v16i32 = bitcast t27]
54011// t6: v16i32 = xor t7, t27[t30]
54012// t11: v16f32 = bitcast t6
54013// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54014// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54015// t22: v16f32 = bitcast t7
54016// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54017// t24: v32f16 = bitcast t23
54018static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54019 const X86Subtarget &Subtarget) {
54020 EVT VT = N->getValueType(0);
54021 SDValue LHS = N->getOperand(0);
54022 SDValue RHS = N->getOperand(1);
54023 int CombineOpcode =
54024 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54025 auto combineConjugation = [&](SDValue &r) {
54026 if (LHS->getOpcode() == ISD::BITCAST) {
54027 SDValue XOR = LHS.getOperand(0);
54028 if (XOR->getOpcode() == ISD::XOR) {
54029 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54030 if (XORRHS.isConstant()) {
54031 APInt ConjugationInt32 = APInt(32, 0x80000000);
54032 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54033 if ((XORRHS.getBitWidth() == 32 &&
54034 XORRHS.getConstant() == ConjugationInt32) ||
54035 (XORRHS.getBitWidth() == 64 &&
54036 XORRHS.getConstant() == ConjugationInt64)) {
54037 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54038 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54039 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54040 r = DAG.getBitcast(VT, FCMulC);
54041 return true;
54042 }
54043 }
54044 }
54045 }
54046 return false;
54047 };
54048 SDValue Res;
54049 if (combineConjugation(Res))
54050 return Res;
54051 std::swap(LHS, RHS);
54052 if (combineConjugation(Res))
54053 return Res;
54054 return Res;
54055}
54056
54057// Try to combine the following nodes:
54058// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
54059static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54060 const X86Subtarget &Subtarget) {
54061 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54062 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54063 Flags.hasAllowContract();
54064 };
54065
54066 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54067 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54068 Flags.hasNoSignedZeros();
54069 };
54070 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54071 APInt AI = APInt(32, 0x80008000);
54072 KnownBits Bits = DAG.computeKnownBits(Op);
54073 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54074 Bits.getConstant() == AI;
54075 };
54076
54077 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54078 !AllowContract(N->getFlags()))
54079 return SDValue();
54080
54081 EVT VT = N->getValueType(0);
54082 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54083 return SDValue();
54084
54085 SDValue LHS = N->getOperand(0);
54086 SDValue RHS = N->getOperand(1);
54087 bool IsConj;
54088 SDValue FAddOp1, MulOp0, MulOp1;
54089 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54090 &IsVectorAllNegativeZero,
54091 &HasNoSignedZero](SDValue N) -> bool {
54092 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54093 return false;
54094 SDValue Op0 = N.getOperand(0);
54095 unsigned Opcode = Op0.getOpcode();
54096 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54097 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54098 MulOp0 = Op0.getOperand(0);
54099 MulOp1 = Op0.getOperand(1);
54100 IsConj = Opcode == X86ISD::VFCMULC;
54101 return true;
54102 }
54103 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54105 HasNoSignedZero(Op0->getFlags())) ||
54106 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54107 MulOp0 = Op0.getOperand(0);
54108 MulOp1 = Op0.getOperand(1);
54109 IsConj = Opcode == X86ISD::VFCMADDC;
54110 return true;
54111 }
54112 }
54113 return false;
54114 };
54115
54116 if (GetCFmulFrom(LHS))
54117 FAddOp1 = RHS;
54118 else if (GetCFmulFrom(RHS))
54119 FAddOp1 = LHS;
54120 else
54121 return SDValue();
54122
54123 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54124 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54125 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54126 // FIXME: How do we handle when fast math flags of FADD are different from
54127 // CFMUL's?
54128 SDValue CFmul =
54129 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54130 return DAG.getBitcast(VT, CFmul);
54131}
54132
54133/// Do target-specific dag combines on floating-point adds/subs.
54134static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54135 const X86Subtarget &Subtarget) {
54136 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54137 return HOp;
54138
54139 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54140 return COp;
54141
54142 return SDValue();
54143}
54144
54146 const X86Subtarget &Subtarget) {
54147 EVT VT = N->getValueType(0);
54148 SDValue Src = N->getOperand(0);
54149 EVT SrcVT = Src.getValueType();
54150 SDLoc DL(N);
54151
54152 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54153
54154 // Let legalize expand this if it isn't a legal type yet.
54155 if (!TLI.isTypeLegal(VT))
54156 return SDValue();
54157
54158 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54159 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54160 return SDValue();
54161
54162 if (SrcVT == MVT::v2f16) {
54163 SrcVT = MVT::v4f16;
54164 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54165 DAG.getUNDEF(MVT::v2f16));
54166 }
54167
54168 if (SrcVT == MVT::v4f16) {
54169 SrcVT = MVT::v8f16;
54170 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54171 DAG.getUNDEF(MVT::v4f16));
54172 } else if (SrcVT == MVT::v2f32) {
54173 SrcVT = MVT::v4f32;
54174 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54175 DAG.getUNDEF(MVT::v2f32));
54176 } else {
54177 return SDValue();
54178 }
54179
54180 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54181}
54182
54183// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54184// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54185// are able to avoid generating code with MOVABS and large constants in certain
54186// cases.
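// e.g. with i64 X, (i32 (trunc (srl (or X, 0xABCD000000000000), 48))) becomes
// (or (i32 (trunc (srl X, 48))), 0xABCD), needing only a 16-bit immediate.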
54187static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54188 const SDLoc &DL) {
54189 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54190 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54191 if (!ValidSrlConst)
54192 return SDValue();
54193 unsigned SrlConstVal = *ValidSrlConst;
54194
54195 SDValue Op = N.getOperand(0);
54196 unsigned Opcode = Op.getOpcode();
54197 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54198 "Illegal truncation types");
54199
54200 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54201 !isa<ConstantSDNode>(Op.getOperand(1)))
54202 return SDValue();
54203 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54204
54205 if (SrlConstVal <= 32 ||
54206 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54207 return SDValue();
54208
54209 SDValue OpLhsSrl =
54210 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54211 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54212
54213 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54214 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54215 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54216
54217 if (Opcode == ISD::ADD) {
54218 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54219 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54220 }
54221 return NewOpNode;
54222}
54223
54224/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54225/// the codegen.
54226/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54227/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54228/// anything that is guaranteed to be transformed by DAGCombiner.
54229static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54230 const X86Subtarget &Subtarget,
54231 const SDLoc &DL) {
54232 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54233 SDValue Src = N->getOperand(0);
54234 unsigned SrcOpcode = Src.getOpcode();
54235 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54236
54237 EVT VT = N->getValueType(0);
54238 EVT SrcVT = Src.getValueType();
54239
54240 auto IsFreeTruncation = [VT](SDValue Op) {
54241 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54242
54243 // See if this has been extended from a smaller/equal size to
54244 // the truncation size, allowing a truncation to combine with the extend.
54245 unsigned Opcode = Op.getOpcode();
54246 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54247 Opcode == ISD::ZERO_EXTEND) &&
54248 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54249 return true;
54250
54251 // See if this is a single use constant which can be constant folded.
54252 // NOTE: We don't peek through bitcasts here because there is currently
54253 // no support for constant folding truncate+bitcast+vector_of_constants. So
54254 // we'll just end up with a truncate on both operands which will
54255 // get turned back into (truncate (binop)) causing an infinite loop.
54256 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54257 };
54258
54259 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54260 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54261 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54262 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54263 };
54264
54265 // Don't combine if the operation has other uses.
54266 if (!Src.hasOneUse())
54267 return SDValue();
54268
54269 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54270 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54271
54272 if (!VT.isVector())
54273 return SDValue();
54274
54275 // In most cases it's only worth pre-truncating if we're only facing the cost
54276 // of one truncation.
54277 // i.e. if one of the inputs will constant fold or the input is repeated.
54278 switch (SrcOpcode) {
54279 case ISD::MUL:
54280 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54281 // better to truncate if we have the chance.
54282 if (SrcVT.getScalarType() == MVT::i64 &&
54283 TLI.isOperationLegal(SrcOpcode, VT) &&
54284 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54285 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54286 [[fallthrough]];
54287 case ISD::AND:
54288 case ISD::XOR:
54289 case ISD::OR:
54290 case ISD::ADD:
54291 case ISD::SUB: {
54292 SDValue Op0 = Src.getOperand(0);
54293 SDValue Op1 = Src.getOperand(1);
54294 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54295 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54296 return TruncateArithmetic(Op0, Op1);
54297 break;
54298 }
54299 }
54300
54301 return SDValue();
54302}
54303
54304// Try to form a MULHU or MULHS node by looking for
54305// (trunc (srl (mul ext, ext), >= 16))
54306// TODO: This is X86 specific because we want to be able to handle wide types
54307// before type legalization. But we can only do it if the vector will be
54308// legalized via widening/splitting. Type legalization can't handle promotion
54309// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54310// combiner.
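// e.g. (trunc (srl (mul (zext vXi16 A), (zext vXi16 B)), 16)) -> (mulhu A, B);
// the equivalent pattern with sign-extended inputs becomes mulhs.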
54311static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54312 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54313 using namespace llvm::SDPatternMatch;
54314
54315 if (!Subtarget.hasSSE2())
54316 return SDValue();
54317
54318 // Only handle vXi16 types that are at least 128-bits unless they will be
54319 // widened.
54320 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54321 return SDValue();
54322
54323 // Input type should be at least vXi32.
54324 EVT InVT = Src.getValueType();
54325 if (InVT.getVectorElementType().getSizeInBits() < 32)
54326 return SDValue();
54327
54328 // First instruction should be a right shift by 16 of a multiply.
54329 SDValue LHS, RHS;
54330 APInt ShiftAmt;
54331 if (!sd_match(Src,
54332 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54333 return SDValue();
54334
54335 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54336 return SDValue();
54337
54338 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54339
54340 // Count leading sign/zero bits on both inputs - if there are enough then
54341 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54342 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54343 // truncations may actually be free by peeking through to the ext source.
54344 auto IsSext = [&DAG](SDValue V) {
54345 return DAG.ComputeMaxSignificantBits(V) <= 16;
54346 };
54347 auto IsZext = [&DAG](SDValue V) {
54348 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54349 };
54350
54351 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54352 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54353 if (!IsSigned && !IsUnsigned)
54354 return SDValue();
54355
54356 // Check if both inputs are extensions, which will be removed by truncation.
54357 auto isOpTruncateFree = [](SDValue Op) {
54358 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54359 Op.getOpcode() == ISD::ZERO_EXTEND)
54360 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54361 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54362 };
54363 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54364
54365 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54366 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54367 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54368 // will have to split anyway.
54369 unsigned InSizeInBits = InVT.getSizeInBits();
54370 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54371 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54372 (InSizeInBits % 16) == 0) {
54373 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54374 InVT.getSizeInBits() / 16);
54375 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54376 DAG.getBitcast(BCVT, RHS));
54377 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54378 return DAG.getNode(ISD::SRL, DL, VT, Res,
54379 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54380 }
54381
54382 // Truncate back to source type.
54383 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54384 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54385
54386 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54387 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54388 return DAG.getNode(ISD::SRL, DL, VT, Res,
54389 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54390}
54391
54392// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54393// from one vector with signed bytes from another vector, adds together
54394// adjacent pairs of 16-bit products, and saturates the result before
54395// truncating to 16-bits.
54396//
54397// Which looks something like this:
54398// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54399// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
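// i.e. each i16 result lane k computes
// ssat.i16(zext(A[2*k]) * sext(B[2*k]) + zext(A[2*k+1]) * sext(B[2*k+1])).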
54400static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54401 const X86Subtarget &Subtarget,
54402 const SDLoc &DL) {
54403 if (!VT.isVector() || !Subtarget.hasSSSE3())
54404 return SDValue();
54405
54406 unsigned NumElems = VT.getVectorNumElements();
54407 EVT ScalarVT = VT.getVectorElementType();
54408 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54409 return SDValue();
54410
54411 SDValue SSatVal = detectSSatPattern(In, VT);
54412 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54413 return SDValue();
54414
54415 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54416 // of multiplies from even/odd elements.
54417 SDValue N0 = SSatVal.getOperand(0);
54418 SDValue N1 = SSatVal.getOperand(1);
54419
54420 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54421 return SDValue();
54422
54423 SDValue N00 = N0.getOperand(0);
54424 SDValue N01 = N0.getOperand(1);
54425 SDValue N10 = N1.getOperand(0);
54426 SDValue N11 = N1.getOperand(1);
54427
54428 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54429 // Canonicalize zero_extend to LHS.
54430 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54431 std::swap(N00, N01);
54432 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54433 std::swap(N10, N11);
54434
54435 // Ensure we have a zero_extend and a sign_extend.
54436 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54437 N01.getOpcode() != ISD::SIGN_EXTEND ||
54438 N10.getOpcode() != ISD::ZERO_EXTEND ||
54439 N11.getOpcode() != ISD::SIGN_EXTEND)
54440 return SDValue();
54441
54442 // Peek through the extends.
54443 N00 = N00.getOperand(0);
54444 N01 = N01.getOperand(0);
54445 N10 = N10.getOperand(0);
54446 N11 = N11.getOperand(0);
54447
54448 // Ensure the extend is from vXi8.
54449 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54450 N01.getValueType().getVectorElementType() != MVT::i8 ||
54451 N10.getValueType().getVectorElementType() != MVT::i8 ||
54452 N11.getValueType().getVectorElementType() != MVT::i8)
54453 return SDValue();
54454
54455 // All inputs should be build_vectors.
54456 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54457 N01.getOpcode() != ISD::BUILD_VECTOR ||
54458 N10.getOpcode() != ISD::BUILD_VECTOR ||
54459 N11.getOpcode() != ISD::BUILD_VECTOR)
54460 return SDValue();
54461
54462 // N00/N10 are zero extended. N01/N11 are sign extended.
54463
54464 // For each element, we need to ensure we have an odd element from one vector
54465 // multiplied by the odd element of another vector and the even element from
54466 // one of the same vectors being multiplied by the even element from the
54467 // other vector. So we need to make sure for each element i, this operator
54468 // is being performed:
54469 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54470 SDValue ZExtIn, SExtIn;
54471 for (unsigned i = 0; i != NumElems; ++i) {
54472 SDValue N00Elt = N00.getOperand(i);
54473 SDValue N01Elt = N01.getOperand(i);
54474 SDValue N10Elt = N10.getOperand(i);
54475 SDValue N11Elt = N11.getOperand(i);
54476 // TODO: Be more tolerant to undefs.
54477 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54478 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54479 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54480 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54481 return SDValue();
54482 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54483 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54484 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54485 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54486 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54487 return SDValue();
54488 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54489 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54490 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54491 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54492 // Add is commutative so indices can be reordered.
54493 if (IdxN00 > IdxN10) {
54494 std::swap(IdxN00, IdxN10);
54495 std::swap(IdxN01, IdxN11);
54496 }
54497 // N0 indices must be the even element. N1 indices must be the next odd element.
54498 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54499 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54500 return SDValue();
54501 SDValue N00In = N00Elt.getOperand(0);
54502 SDValue N01In = N01Elt.getOperand(0);
54503 SDValue N10In = N10Elt.getOperand(0);
54504 SDValue N11In = N11Elt.getOperand(0);
54505 // First time we find an input capture it.
54506 if (!ZExtIn) {
54507 ZExtIn = N00In;
54508 SExtIn = N01In;
54509 }
54510 if (ZExtIn != N00In || SExtIn != N01In ||
54511 ZExtIn != N10In || SExtIn != N11In)
54512 return SDValue();
54513 }
54514
54515 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54516 EVT ExtVT = Ext.getValueType();
54517 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54518 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54519 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54520 DAG.getVectorIdxConstant(0, DL));
54521 }
54522 };
54523 ExtractVec(ZExtIn);
54524 ExtractVec(SExtIn);
54525
54526 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54527 ArrayRef<SDValue> Ops) {
54528 // Shrink by adding truncate nodes and let DAGCombine fold with the
54529 // sources.
54530 EVT InVT = Ops[0].getValueType();
54531 assert(InVT.getScalarType() == MVT::i8 &&
54532 "Unexpected scalar element type");
54533 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54534 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54535 InVT.getVectorNumElements() / 2);
54536 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54537 };
54538 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54539 PMADDBuilder);
54540}
54541
54542static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54543 const X86Subtarget &Subtarget) {
54544 EVT VT = N->getValueType(0);
54545 SDValue Src = N->getOperand(0);
54546 SDLoc DL(N);
54547
54548 // Attempt to pre-truncate inputs to arithmetic ops instead.
54549 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54550 return V;
54551
54552 // Try to detect PMADD
54553 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54554 return PMAdd;
54555
54556 // Try to combine truncation with signed/unsigned saturation.
54557 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54558 return Val;
54559
54560 // Try to combine PMULHUW/PMULHW for vXi16.
54561 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54562 return V;
54563
54564 // The bitcast source is a direct mmx result.
54565 // Detect bitcasts between i32 and x86mmx.
54566 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54567 SDValue BCSrc = Src.getOperand(0);
54568 if (BCSrc.getValueType() == MVT::x86mmx)
54569 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54570 }
54571
54572 return SDValue();
54573}
54574
54575static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54576 TargetLowering::DAGCombinerInfo &DCI) {
54577 EVT VT = N->getValueType(0);
54578 SDValue In = N->getOperand(0);
54579 SDLoc DL(N);
54580
54581 if (SDValue SSatVal = detectSSatPattern(In, VT))
54582 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54583 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54584 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54585
54586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54587 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54588 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54589 return SDValue(N, 0);
54590
54591 return SDValue();
54592}
54593
54594/// Returns the negated value if the node \p N flips sign of FP value.
54595///
54596/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54597/// or FSUB(0, x)
54598/// AVX512F does not have FXOR, so FNEG is lowered as
54599/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54600/// In this case we go through all bitcasts.
54601/// This also recognizes splat of a negated value and returns the splat of that
54602/// value.
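/// e.g. for v4f32, both (fsub (splat -0.0), X) and
/// (bitcast (xor (bitcast X), (splat i32 0x80000000))) return X.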
54603static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54604 if (N->getOpcode() == ISD::FNEG)
54605 return N->getOperand(0);
54606
54607 // Don't recurse exponentially.
54608 if (Depth > SelectionDAG::MaxRecursionDepth)
54609 return SDValue();
54610
54611 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54612
54613 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
54614 EVT VT = Op->getValueType(0);
54615
54616 // Make sure the element size doesn't change.
54617 if (VT.getScalarSizeInBits() != ScalarSize)
54618 return SDValue();
54619
54620 unsigned Opc = Op.getOpcode();
54621 switch (Opc) {
54622 case ISD::VECTOR_SHUFFLE: {
54623 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54624 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54625 if (!Op.getOperand(1).isUndef())
54626 return SDValue();
54627 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54628 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54629 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54630 cast<ShuffleVectorSDNode>(Op)->getMask());
54631 break;
54632 }
54633 case ISD::INSERT_VECTOR_ELT: {
54634 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54635 // -V, INDEX).
54636 SDValue InsVector = Op.getOperand(0);
54637 SDValue InsVal = Op.getOperand(1);
54638 if (!InsVector.isUndef())
54639 return SDValue();
54640 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54641 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54642 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54643 NegInsVal, Op.getOperand(2));
54644 break;
54645 }
54646 case ISD::FSUB:
54647 case ISD::XOR:
54648 case X86ISD::FXOR: {
54649 SDValue Op1 = Op.getOperand(1);
54650 SDValue Op0 = Op.getOperand(0);
54651
54652 // For XOR and FXOR, we want to check if constant
54653 // bits of Op1 are sign bit masks. For FSUB, we
54654 // have to check if constant bits of Op0 are sign
54655 // bit masks and hence we swap the operands.
54656 if (Opc == ISD::FSUB)
54657 std::swap(Op0, Op1);
54658
54659 APInt UndefElts;
54660 SmallVector<APInt, 16> EltBits;
54661 // Extract constant bits and see if they are all
54662 // sign bit masks. Ignore the undef elements.
54663 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54664 /* AllowWholeUndefs */ true,
54665 /* AllowPartialUndefs */ false)) {
54666 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54667 if (!UndefElts[I] && !EltBits[I].isSignMask())
54668 return SDValue();
54669
54670 // Only allow bitcast from correctly-sized constant.
54671 Op0 = peekThroughBitcasts(Op0);
54672 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54673 return Op0;
54674 }
54675 break;
54676 } // case
54677 } // switch
54678
54679 return SDValue();
54680}
54681
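// Map an FMA-family opcode to the equivalent opcode once the product (NegMul),
// the accumulator (NegAcc) and/or the final result (NegRes) have been negated,
// e.g. with NegAcc, ISD::FMA becomes X86ISD::FMSUB.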
54682static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54683 bool NegRes) {
54684 if (NegMul) {
54685 switch (Opcode) {
54686 // clang-format off
54687 default: llvm_unreachable("Unexpected opcode");
54688 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54689 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54690 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54691 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54692 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54693 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54694 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54695 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54696 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54697 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54698 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54699 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54700 // clang-format on
54701 }
54702 }
54703
54704 if (NegAcc) {
54705 switch (Opcode) {
54706 // clang-format off
54707 default: llvm_unreachable("Unexpected opcode");
54708 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54709 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54710 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54711 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54712 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54713 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54714 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54715 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54716 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54717 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54718 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54719 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54720 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54721 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54722 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54723 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54724 // clang-format on
54725 }
54726 }
54727
54728 if (NegRes) {
54729 switch (Opcode) {
54730 // For accuracy reasons, we never combine fneg and fma under strict FP.
54731 // clang-format off
54732 default: llvm_unreachable("Unexpected opcode");
54733 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54734 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54735 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54736 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54737 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54738 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54739 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54740 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54741 // clang-format on
54742 }
54743 }
54744
54745 return Opcode;
54746}
54747
54748/// Do target-specific dag combines on floating point negations.
54749static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54750 TargetLowering::DAGCombinerInfo &DCI,
54751 const X86Subtarget &Subtarget) {
54752 EVT OrigVT = N->getValueType(0);
54753 SDValue Arg = isFNEG(DAG, N);
54754 if (!Arg)
54755 return SDValue();
54756
54757 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54758 EVT VT = Arg.getValueType();
54759 EVT SVT = VT.getScalarType();
54760 SDLoc DL(N);
54761
54762 // Let legalize expand this if it isn't a legal type yet.
54763 if (!TLI.isTypeLegal(VT))
54764 return SDValue();
54765
54766 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54767 // use of a constant by performing (-0 - A*B) instead.
54768 // FIXME: Check rounding control flags as well once it becomes available.
54769 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54770 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54771 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54772 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54773 Arg.getOperand(1), Zero);
54774 return DAG.getBitcast(OrigVT, NewNode);
54775 }
54776
54777 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54778 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54779 if (SDValue NegArg =
54780 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54781 return DAG.getBitcast(OrigVT, NegArg);
54782
54783 return SDValue();
54784}
54785
54786SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54787 bool LegalOperations,
54788 bool ForCodeSize,
54789 NegatibleCost &Cost,
54790 unsigned Depth) const {
54791 // fneg patterns are removable even if they have multiple uses.
54792 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54793 Cost = NegatibleCost::Cheaper;
54794 return DAG.getBitcast(Op.getValueType(), Arg);
54795 }
54796
54797 EVT VT = Op.getValueType();
54798 EVT SVT = VT.getScalarType();
54799 unsigned Opc = Op.getOpcode();
54800 SDNodeFlags Flags = Op.getNode()->getFlags();
54801 switch (Opc) {
54802 case ISD::FMA:
54803 case X86ISD::FMSUB:
54804 case X86ISD::FNMADD:
54805 case X86ISD::FNMSUB:
54806 case X86ISD::FMADD_RND:
54807 case X86ISD::FMSUB_RND:
54808 case X86ISD::FNMADD_RND:
54809 case X86ISD::FNMSUB_RND: {
54810 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54811 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54813 break;
54814
54815 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54816 // if it may have signed zeros.
54817 if (!Flags.hasNoSignedZeros())
54818 break;
54819
54820 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54821 // keep temporary nodes alive.
54822 std::list<HandleSDNode> Handles;
54823
54824 // This is always negatible for free but we might be able to remove some
54825 // extra operand negations as well.
54826 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54827 for (int i = 0; i != 3; ++i) {
54828 NewOps[i] = getCheaperNegatedExpression(
54829 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54830 if (!!NewOps[i])
54831 Handles.emplace_back(NewOps[i]);
54832 }
54833
54834 bool NegA = !!NewOps[0];
54835 bool NegB = !!NewOps[1];
54836 bool NegC = !!NewOps[2];
54837 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54838
54839 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54840 : NegatibleCost::Neutral;
54841
54842 // Fill in the non-negated ops with the original values.
54843 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54844 if (!NewOps[i])
54845 NewOps[i] = Op.getOperand(i);
54846 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54847 }
54848 case X86ISD::FRCP:
54849 if (SDValue NegOp0 =
54850 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54851 ForCodeSize, Cost, Depth + 1))
54852 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54853 break;
54854 }
54855
54856 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54857 ForCodeSize, Cost, Depth);
54858}
54859
54860static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54861 const X86Subtarget &Subtarget) {
54862 MVT VT = N->getSimpleValueType(0);
54863 // If we have integer vector types available, use the integer opcodes.
54864 if (!VT.isVector() || !Subtarget.hasSSE2())
54865 return SDValue();
54866
54867 SDLoc dl(N);
54869 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54870 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54871 unsigned IntOpcode;
54872 switch (N->getOpcode()) {
54873 // clang-format off
54874 default: llvm_unreachable("Unexpected FP logic op");
54875 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54876 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54877 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54878 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54879 // clang-format on
54880 }
54881 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54882 return DAG.getBitcast(VT, IntOp);
54883}
54884
54885/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54887 if (N->getOpcode() != ISD::XOR)
54888 return SDValue();
54889
54890 SDValue LHS = N->getOperand(0);
54891 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54892 return SDValue();
54893
54894 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54895 X86::CondCode(LHS->getConstantOperandVal(0)));
54896 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54897}
54898
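// Fold (xor (ctlz_zero_undef X), BitWidth - 1) and (sub BitWidth - 1,
// (ctlz_zero_undef X)) into X86ISD::BSR, since BSR(X) == BitWidth - 1 - CTLZ(X)
// for any non-zero X.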
54899static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54900 const X86Subtarget &Subtarget) {
54901 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54902 "Invalid opcode for combing with CTLZ");
54903 if (Subtarget.hasFastLZCNT())
54904 return SDValue();
54905
54906 EVT VT = N->getValueType(0);
54907 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54908 (VT != MVT::i64 || !Subtarget.is64Bit()))
54909 return SDValue();
54910
54911 SDValue N0 = N->getOperand(0);
54912 SDValue N1 = N->getOperand(1);
54913
54914 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54915 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54916 return SDValue();
54917
54918 SDValue OpCTLZ;
54919 SDValue OpSizeTM1;
54920
54921 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54922 OpCTLZ = N1;
54923 OpSizeTM1 = N0;
54924 } else if (N->getOpcode() == ISD::SUB) {
54925 return SDValue();
54926 } else {
54927 OpCTLZ = N0;
54928 OpSizeTM1 = N1;
54929 }
54930
54931 if (!OpCTLZ.hasOneUse())
54932 return SDValue();
54933 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54934 if (!C)
54935 return SDValue();
54936
54937 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54938 return SDValue();
54939 EVT OpVT = VT;
54940 SDValue Op = OpCTLZ.getOperand(0);
54941 if (VT == MVT::i8) {
54942 // Zero extend to i32 since there is not an i8 bsr.
54943 OpVT = MVT::i32;
54944 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
54945 }
54946
54947 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
54948 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
54949 if (VT == MVT::i8)
54950 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
54951
54952 return Op;
54953}
54954
54955static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
54956 TargetLowering::DAGCombinerInfo &DCI,
54957 const X86Subtarget &Subtarget) {
54958 SDValue N0 = N->getOperand(0);
54959 SDValue N1 = N->getOperand(1);
54960 EVT VT = N->getValueType(0);
54961 SDLoc DL(N);
54962
54963 // If this is SSE1 only convert to FXOR to avoid scalarization.
54964 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
54965 return DAG.getBitcast(MVT::v4i32,
54966 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
54967 DAG.getBitcast(MVT::v4f32, N0),
54968 DAG.getBitcast(MVT::v4f32, N1)));
54969 }
54970
54971 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
54972 return Cmp;
54973
54974 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
54975 return R;
54976
54977 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
54978 return R;
54979
54980 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
54981 return R;
54982
54983 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
54984 DAG, DCI, Subtarget))
54985 return FPLogic;
54986
54987 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
54988 return R;
54989
54990 if (DCI.isBeforeLegalizeOps())
54991 return SDValue();
54992
54993 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
54994 return SetCC;
54995
54996 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
54997 return R;
54998
54999 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55000 return RV;
55001
55002 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55003 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55004 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55005 N0.getOperand(0).getValueType().isVector() &&
55006 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55007 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55008 return DAG.getBitcast(
55009 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55010 }
55011
55012 // Handle AVX512 mask widening.
55013 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55014 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55015 VT.getVectorElementType() == MVT::i1 &&
55016 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55017 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55018 return DAG.getNode(
55019 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55020 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55021 N0.getOperand(2));
55022 }
55023
55024 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55025 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55026 // TODO: Under what circumstances could this be performed in DAGCombine?
55027 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55028 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55029 SDValue TruncExtSrc = N0.getOperand(0);
55030 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55031 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55032 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55033 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55034 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55035 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55036 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55037 }
55038 }
55039
55040 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55041 return R;
55042
55043 return combineFneg(N, DAG, DCI, Subtarget);
55044}
55045
55048 const X86Subtarget &Subtarget) {
55049 SDValue N0 = N->getOperand(0);
55050 EVT VT = N->getValueType(0);
55051
55052 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55053 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55054 SDValue Src = N0.getOperand(0);
55055 EVT SrcVT = Src.getValueType();
55056 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55057 (DCI.isBeforeLegalize() ||
55058 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55059 Subtarget.hasSSSE3()) {
55060 unsigned NumElts = SrcVT.getVectorNumElements();
55061 SmallVector<int, 32> ReverseMask(NumElts);
55062 for (unsigned I = 0; I != NumElts; ++I)
55063 ReverseMask[I] = (NumElts - 1) - I;
55064 SDValue Rev =
55065 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55066 return DAG.getBitcast(VT, Rev);
55067 }
55068 }
55069
55070 return SDValue();
55071}
55072
55073// Various combines to try to convert to avgceilu.
55076 const X86Subtarget &Subtarget) {
55077 unsigned Opcode = N->getOpcode();
55078 SDValue N0 = N->getOperand(0);
55079 SDValue N1 = N->getOperand(1);
55080 EVT VT = N->getValueType(0);
55081 EVT SVT = VT.getScalarType();
55082 SDLoc DL(N);
55083
55084 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55085 // Only useful on vXi8 which doesn't have good SRA handling.
55086 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55088 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55089 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55090 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55091 return DAG.getNode(ISD::XOR, DL, VT,
55092 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55093 }
55094
55095 return SDValue();
55096}
55097
55100 const X86Subtarget &Subtarget) {
55101 EVT VT = N->getValueType(0);
55102 unsigned NumBits = VT.getSizeInBits();
55103
55104 // TODO - Constant Folding.
55105
55106 // Simplify the inputs.
55107 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55108 APInt DemandedMask(APInt::getAllOnes(NumBits));
55109 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55110 return SDValue(N, 0);
55111
55112 return SDValue();
55113}
55114
55115static bool isNullFPScalarOrVectorConst(SDValue V) {
55116 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55117}
55118
55119/// If a value is a scalar FP zero or a vector FP zero (potentially including
55120/// undefined elements), return a zero constant that may be used to fold away
55121/// that value. In the case of a vector, the returned constant will not contain
55122/// undefined elements even if the input parameter does. This makes it suitable
55123/// to be used as a replacement operand with operations (e.g., bitwise-and) where
55124/// an undef should not propagate.
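/// e.g. FAND(<0.0, undef, 0.0, 0.0>, X) can be replaced by a fully-defined
/// all-zeros constant instead of a vector that still carries the undef lane.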
55125static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55126 const X86Subtarget &Subtarget) {
55127 if (!isNullFPScalarOrVectorConst(V))
55128 return SDValue();
55129
55130 if (V.getValueType().isVector())
55131 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55132
55133 return V;
55134}
55135
55136static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55137 const X86Subtarget &Subtarget) {
55138 SDValue N0 = N->getOperand(0);
55139 SDValue N1 = N->getOperand(1);
55140 EVT VT = N->getValueType(0);
55141 SDLoc DL(N);
55142
55143 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55144 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55145 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55146 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55147 return SDValue();
55148
55149 auto isAllOnesConstantFP = [](SDValue V) {
55150 if (V.getSimpleValueType().isVector())
55151 return ISD::isBuildVectorAllOnes(V.getNode());
55152 auto *C = dyn_cast<ConstantFPSDNode>(V);
55153 return C && C->getConstantFPValue()->isAllOnesValue();
55154 };
55155
55156 // fand (fxor X, -1), Y --> fandn X, Y
55157 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55158 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55159
55160 // fand X, (fxor Y, -1) --> fandn Y, X
55161 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55162 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55163
55164 return SDValue();
55165}
55166
55167/// Do target-specific dag combines on X86ISD::FAND nodes.
55168static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55169 const X86Subtarget &Subtarget) {
55170 // FAND(0.0, x) -> 0.0
55171 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55172 return V;
55173
55174 // FAND(x, 0.0) -> 0.0
55175 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55176 return V;
55177
55178 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55179 return V;
55180
55181 return lowerX86FPLogicOp(N, DAG, Subtarget);
55182}
55183
55184/// Do target-specific dag combines on X86ISD::FANDN nodes.
55185static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55186 const X86Subtarget &Subtarget) {
55187 // FANDN(0.0, x) -> x
55188 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55189 return N->getOperand(1);
55190
55191 // FANDN(x, 0.0) -> 0.0
55192 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55193 return V;
55194
55195 return lowerX86FPLogicOp(N, DAG, Subtarget);
55196}
55197
55198/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55199static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55200 TargetLowering::DAGCombinerInfo &DCI,
55201 const X86Subtarget &Subtarget) {
55202 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55203
55204 // F[X]OR(0.0, x) -> x
55205 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55206 return N->getOperand(1);
55207
55208 // F[X]OR(x, 0.0) -> x
55209 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55210 return N->getOperand(0);
55211
55212 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55213 return NewVal;
55214
55215 return lowerX86FPLogicOp(N, DAG, Subtarget);
55216}
55217
55218/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55219static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55220 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55221
55222 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55223 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55224 !DAG.getTarget().Options.NoSignedZerosFPMath)
55225 return SDValue();
55226
55227 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55228 // into FMAXC and FMINC, which are commutative operations.
55229 unsigned NewOp = 0;
55230 switch (N->getOpcode()) {
55231 default: llvm_unreachable("unknown opcode");
55232 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55233 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55234 }
55235
55236 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55237 N->getOperand(0), N->getOperand(1));
55238}
55239
55241 const X86Subtarget &Subtarget) {
55242 EVT VT = N->getValueType(0);
55243 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55244 return SDValue();
55245
55246 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55247
55248 auto IsMinMaxLegal = [&](EVT VT) {
55249 if (!TLI.isTypeLegal(VT))
55250 return false;
55251 return VT.getScalarType() != MVT::f16 ||
55252 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55253 };
55254
55255 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55256 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55257 (Subtarget.hasFP16() && VT == MVT::f16) ||
55258 (VT.isVector() && IsMinMaxLegal(VT))))
55259 return SDValue();
55260
55261 SDValue Op0 = N->getOperand(0);
55262 SDValue Op1 = N->getOperand(1);
55263 SDLoc DL(N);
55264 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55265
55266 // If we don't have to respect NaN inputs, this is a direct translation to x86
55267 // min/max instructions.
55268 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55269 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55270
55271 // If one of the operands is known non-NaN use the native min/max instructions
55272 // with the non-NaN input as second operand.
55273 if (DAG.isKnownNeverNaN(Op1))
55274 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55275 if (DAG.isKnownNeverNaN(Op0))
55276 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55277
55278 // If we have to respect NaN inputs, this takes at least 3 instructions.
55279 // Favor a library call when operating on a scalar and minimizing code size.
55280 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55281 return SDValue();
55282
55283 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55284 VT);
55285
55286 // There are 4 possibilities involving NaN inputs, and these are the required
55287 // outputs:
55288 // Op1
55289 // Num NaN
55290 // ----------------
55291 // Num | Max | Op0 |
55292 // Op0 ----------------
55293 // NaN | Op1 | NaN |
55294 // ----------------
55295 //
55296 // The SSE FP max/min instructions were not designed for this case, but rather
55297 // to implement:
55298 // Min = Op1 < Op0 ? Op1 : Op0
55299 // Max = Op1 > Op0 ? Op1 : Op0
55300 //
55301 // So they always return Op0 if either input is a NaN. However, we can still
55302 // use those instructions for fmaxnum by selecting away a NaN input.
55303
55304 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55305 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55306 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55307
55308 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55309 // are NaN, the NaN value of Op1 is the result.
55310 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55311}
55312
55315 EVT VT = N->getValueType(0);
55316 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55317
55318 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55319 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55320 return SDValue(N, 0);
55321
55322 // Convert a full vector load into vzload when not all bits are needed.
55323 SDValue In = N->getOperand(0);
55324 MVT InVT = In.getSimpleValueType();
55325 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55326 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55327 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55328 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55329 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55330 MVT MemVT = MVT::getIntegerVT(NumBits);
55331 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55332 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55333 SDLoc dl(N);
55334 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55335 DAG.getBitcast(InVT, VZLoad));
55336 DCI.CombineTo(N, Convert);
55337 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55338 DCI.recursivelyDeleteUnusedNodes(LN);
55339 return SDValue(N, 0);
55340 }
55341 }
55342
55343 return SDValue();
55344}
55345
55349 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55350 EVT VT = N->getValueType(0);
55351
55352 // Convert a full vector load into vzload when not all bits are needed.
55353 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55354 MVT InVT = In.getSimpleValueType();
55355 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55356 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55357 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55358 LoadSDNode *LN = cast<LoadSDNode>(In);
55359 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55360 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55361 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55362 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55363 SDLoc dl(N);
55364 if (IsStrict) {
55365 SDValue Convert =
55366 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55367 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55368 DCI.CombineTo(N, Convert, Convert.getValue(1));
55369 } else {
55370 SDValue Convert =
55371 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55372 DCI.CombineTo(N, Convert);
55373 }
55374 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55375 DCI.recursivelyDeleteUnusedNodes(LN);
55376 return SDValue(N, 0);
55377 }
55378 }
55379
55380 return SDValue();
55381}
55382
55383/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55384static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55385 TargetLowering::DAGCombinerInfo &DCI,
55386 const X86Subtarget &Subtarget) {
55387 SDValue N0 = N->getOperand(0);
55388 SDValue N1 = N->getOperand(1);
55389 MVT VT = N->getSimpleValueType(0);
55390 int NumElts = VT.getVectorNumElements();
55391 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55392 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55393 SDLoc DL(N);
55394
55395 // ANDNP(undef, x) -> 0
55396 // ANDNP(x, undef) -> 0
55397 if (N0.isUndef() || N1.isUndef())
55398 return DAG.getConstant(0, DL, VT);
55399
55400 // ANDNP(0, x) -> x
55401 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55402 return N1;
55403
55404 // ANDNP(x, 0) -> 0
55405 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55406 return DAG.getConstant(0, DL, VT);
55407
55408 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55409 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55410 return DAG.getNOT(DL, N0, VT);
55411
55412 // Turn ANDNP back to AND if input is inverted.
55413 if (SDValue Not = IsNOT(N0, DAG))
55414 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55415
55416 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
55417 // to make use of predicated selects.
55418 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55419 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55420 SDValue Src = N0.getOperand(0);
55421 EVT SrcVT = Src.getValueType();
55422 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55423 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55424 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55425 getZeroVector(VT, Subtarget, DAG, DL));
55426 }
55427
55428 // Constant Folding
55429 APInt Undefs0, Undefs1;
55430 SmallVector<APInt> EltBits0, EltBits1;
55431 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55432 /*AllowWholeUndefs*/ true,
55433 /*AllowPartialUndefs*/ true)) {
55434 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55435 /*AllowWholeUndefs*/ true,
55436 /*AllowPartialUndefs*/ true)) {
55437 SmallVector<APInt> ResultBits;
55438 for (int I = 0; I != NumElts; ++I)
55439 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55440 return getConstVector(ResultBits, VT, DAG, DL);
55441 }
55442
55443 // Constant fold NOT(N0) to allow us to use AND.
55444 // Ensure this is only performed if we can confirm that the bitcasted source
55445 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55446 if (N0->hasOneUse()) {
55447 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55448 if (BC0.getOpcode() != ISD::BITCAST) {
55449 for (APInt &Elt : EltBits0)
55450 Elt = ~Elt;
55451 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55452 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55453 }
55454 }
55455 }
55456
55457 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55458 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55459 SDValue Op(N, 0);
55460 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55461 return Res;
55462
55463 // If either operand is a constant mask, then only the elements that aren't
55464 // zero are actually demanded by the other operand.
55465 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55466 APInt UndefElts;
55467 SmallVector<APInt> EltBits;
55468 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55469 APInt DemandedElts = APInt::getAllOnes(NumElts);
55470 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55471 EltBits)) {
55472 DemandedBits.clearAllBits();
55473 DemandedElts.clearAllBits();
55474 for (int I = 0; I != NumElts; ++I) {
55475 if (UndefElts[I]) {
55476 // We can't assume an undef src element gives an undef dst - the
55477 // other src might be zero.
55478 DemandedBits.setAllBits();
55479 DemandedElts.setBit(I);
55480 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55481 (!Invert && !EltBits[I].isZero())) {
55482 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55483 DemandedElts.setBit(I);
55484 }
55485 }
55486 }
55487 return std::make_pair(DemandedBits, DemandedElts);
55488 };
55489 APInt Bits0, Elts0;
55490 APInt Bits1, Elts1;
55491 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55492 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55493
55494 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55495 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55496 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55497 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55498 if (N->getOpcode() != ISD::DELETED_NODE)
55499 DCI.AddToWorklist(N);
55500 return SDValue(N, 0);
55501 }
55502 }
55503
55504 // Folds for better commutativity:
55505 if (N1->hasOneUse()) {
55506 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55507 if (SDValue Not = IsNOT(N1, DAG))
55508 return DAG.getNOT(
55509 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55510
55511 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55512 // Zero out elements by setting the PSHUFB mask value to 0xFF.
55513 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55514 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55515 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55516 EVT ShufVT = BC1.getValueType();
55517 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55518 DAG.getBitcast(ShufVT, N0));
55519 SDValue NewShuf =
55520 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55521 return DAG.getBitcast(VT, NewShuf);
55522 }
55523 }
55524 }
55525
55526 return SDValue();
55527}
55528
55531 SDValue N1 = N->getOperand(1);
55532
55533 // BT ignores high bits in the bit index operand.
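  // For example (illustrative values): with a 32-bit operand, bit indices 5
  // and 37 select the same bit, so an explicit (and %idx, 31) feeding the BT
  // becomes redundant once only the low bits of the index are demanded.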
55534 unsigned BitWidth = N1.getValueSizeInBits();
55535   APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55536   if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55537 if (N->getOpcode() != ISD::DELETED_NODE)
55538 DCI.AddToWorklist(N);
55539 return SDValue(N, 0);
55540 }
55541
55542 return SDValue();
55543}
55544
55547 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55548 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55549
55550 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55552 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55553 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55554 if (N->getOpcode() != ISD::DELETED_NODE)
55555 DCI.AddToWorklist(N);
55556 return SDValue(N, 0);
55557 }
55558
55559 // Convert a full vector load into vzload when not all bits are needed.
55560 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55561 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55562 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55563 SDLoc dl(N);
55564 if (IsStrict) {
55565 SDValue Convert = DAG.getNode(
55566 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55567 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55568 DCI.CombineTo(N, Convert, Convert.getValue(1));
55569 } else {
55570 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55571 DAG.getBitcast(MVT::v8i16, VZLoad));
55572 DCI.CombineTo(N, Convert);
55573 }
55574
55575 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55576         DCI.recursivelyDeleteUnusedNodes(LN);
55577         return SDValue(N, 0);
55578 }
55579 }
55580 }
55581
55582 return SDValue();
55583}
55584
55585// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55586 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55587   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55588
55589 EVT DstVT = N->getValueType(0);
55590
55591 SDValue N0 = N->getOperand(0);
55592 SDValue N1 = N->getOperand(1);
55593 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55594
55595 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55596 return SDValue();
55597
55598 // Look through single use any_extends / truncs.
55599 SDValue IntermediateBitwidthOp;
55600 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55601 N0.hasOneUse()) {
55602 IntermediateBitwidthOp = N0;
55603 N0 = N0.getOperand(0);
55604 }
55605
55606 // See if we have a single use cmov.
55607 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55608 return SDValue();
55609
55610 SDValue CMovOp0 = N0.getOperand(0);
55611 SDValue CMovOp1 = N0.getOperand(1);
55612
55613 // Make sure both operands are constants.
55614 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55615 !isa<ConstantSDNode>(CMovOp1.getNode()))
55616 return SDValue();
55617
55618 SDLoc DL(N);
55619
55620   // If we looked through an any_extend/trunc above, apply it to the constants.
55621 if (IntermediateBitwidthOp) {
55622 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55623 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55624 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55625 }
55626
55627 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55628 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55629
55630 EVT CMovVT = DstVT;
55631 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55632 if (DstVT == MVT::i16) {
55633 CMovVT = MVT::i32;
55634 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55635 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55636 }
55637
55638 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55639 N0.getOperand(2), N0.getOperand(3));
55640
55641 if (CMovVT != DstVT)
55642 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55643
55644 return CMov;
55645}
55646
55647 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55648                                       const X86Subtarget &Subtarget) {
55649 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55650
55651 if (SDValue V = combineSextInRegCmov(N, DAG))
55652 return V;
55653
55654 EVT VT = N->getValueType(0);
55655 SDValue N0 = N->getOperand(0);
55656 SDValue N1 = N->getOperand(1);
55657 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55658 SDLoc dl(N);
55659
55660   // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
55661   // SSE and AVX2 since there is no sign-extended shift right
55662   // operation on a vector with 64-bit elements.
55663 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55664 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
55665 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55666 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55667 SDValue N00 = N0.getOperand(0);
55668
55669 // EXTLOAD has a better solution on AVX2,
55670 // it may be replaced with X86ISD::VSEXT node.
55671 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55672 if (!ISD::isNormalLoad(N00.getNode()))
55673 return SDValue();
55674
55675 // Attempt to promote any comparison mask ops before moving the
55676 // SIGN_EXTEND_INREG in the way.
55677 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55678 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55679
55680 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55681 SDValue Tmp =
55682 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55683 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55684 }
55685 }
55686 return SDValue();
55687}
55688
55689/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55690/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55691/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55692/// opportunities to combine math ops, use an LEA, or use a complex addressing
55693/// mode. This can eliminate extend, add, and shift instructions.
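/// A minimal sketch (types and constants chosen for illustration):
///   (i64 sext (i32 add nsw X, 5)) --> (i64 add nsw (i64 sext X), 5)
/// If another add or shl uses the result, the promoted form can then fold
/// into a single LEA-style address computation.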
55694 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55695                                    const X86Subtarget &Subtarget) {
55696 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55697 Ext->getOpcode() != ISD::ZERO_EXTEND)
55698 return SDValue();
55699
55700 // TODO: This should be valid for other integer types.
55701 EVT VT = Ext->getValueType(0);
55702 if (VT != MVT::i64)
55703 return SDValue();
55704
55705 SDValue Add = Ext->getOperand(0);
55706 if (Add.getOpcode() != ISD::ADD)
55707 return SDValue();
55708
55709 SDValue AddOp0 = Add.getOperand(0);
55710 SDValue AddOp1 = Add.getOperand(1);
55711 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55712 bool NSW = Add->getFlags().hasNoSignedWrap();
55713 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55714 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55715 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55716
55717 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55718 // into the 'zext'
55719 if ((Sext && !NSW) || (!Sext && !NUW))
55720 return SDValue();
55721
55722 // Having a constant operand to the 'add' ensures that we are not increasing
55723 // the instruction count because the constant is extended for free below.
55724 // A constant operand can also become the displacement field of an LEA.
55725 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55726 if (!AddOp1C)
55727 return SDValue();
55728
55729 // Don't make the 'add' bigger if there's no hope of combining it with some
55730 // other 'add' or 'shl' instruction.
55731 // TODO: It may be profitable to generate simpler LEA instructions in place
55732 // of single 'add' instructions, but the cost model for selecting an LEA
55733 // currently has a high threshold.
55734 bool HasLEAPotential = false;
55735 for (auto *User : Ext->users()) {
55736 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55737 HasLEAPotential = true;
55738 break;
55739 }
55740 }
55741 if (!HasLEAPotential)
55742 return SDValue();
55743
55744 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55745 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55746 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55747 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55748
55749   // The wider add is guaranteed to not wrap because both operands are
55750   // sign- or zero-extended from narrower values, matching the flags below.
55751 SDNodeFlags Flags;
55752 Flags.setNoSignedWrap(NSW);
55753 Flags.setNoUnsignedWrap(NUW);
55754 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55755}
55756
55757// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55758// operands and the result of CMOV is not used anywhere else - promote CMOV
55759// itself instead of promoting its result. This could be beneficial, because:
55760// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55761// (or more) pseudo-CMOVs only when they go one-after-another and
55762// getting rid of result extension code after CMOV will help that.
55763// 2) Promotion of constant CMOV arguments is free, hence the
55764// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55765 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
55766 // promotion is also good in terms of code size.
55767 // (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
55768// promotion).
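// For example (illustrative constants):
//   (i32 zext (i16 cmov 7, 12, cond)) --> (i32 cmov 7, 12, cond)
// The constants are still immediates after extension, so the zext is free and
// the shorter 32-bit CMOV encoding is used.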
55769 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55770   SDValue CMovN = Extend->getOperand(0);
55771 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55772 return SDValue();
55773
55774 EVT TargetVT = Extend->getValueType(0);
55775 unsigned ExtendOpcode = Extend->getOpcode();
55776 SDLoc DL(Extend);
55777
55778 EVT VT = CMovN.getValueType();
55779 SDValue CMovOp0 = CMovN.getOperand(0);
55780 SDValue CMovOp1 = CMovN.getOperand(1);
55781
55782 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55783 !isa<ConstantSDNode>(CMovOp1.getNode()))
55784 return SDValue();
55785
55786 // Only extend to i32 or i64.
55787 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55788 return SDValue();
55789
55790   // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
55791   // are free.
55792 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55793 return SDValue();
55794
55795   // If this is a zero extend to i64, we should only extend to i32 and use a
55796   // free zero extend to finish.
55797 EVT ExtendVT = TargetVT;
55798 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55799 ExtendVT = MVT::i32;
55800
55801 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55802 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55803
55804 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55805 CMovN.getOperand(2), CMovN.getOperand(3));
55806
55807 // Finish extending if needed.
55808 if (ExtendVT != TargetVT)
55809 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55810
55811 return Res;
55812}
55813
55814// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55815// result type.
55816 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55817                                const X86Subtarget &Subtarget) {
55818 SDValue N0 = N->getOperand(0);
55819 EVT VT = N->getValueType(0);
55820 SDLoc dl(N);
55821
55822 // Only do this combine with AVX512 for vector extends.
55823 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55824 return SDValue();
55825
55826 // Only combine legal element types.
55827 EVT SVT = VT.getVectorElementType();
55828 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55829 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55830 return SDValue();
55831
55832   // We don't have a CMPP instruction for vXf16.
55833 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55834 return SDValue();
55835   // We can only do this if the vector size is 256 bits or less.
55836 unsigned Size = VT.getSizeInBits();
55837 if (Size > 256 && Subtarget.useAVX512Regs())
55838 return SDValue();
55839
55840 EVT N00VT = N0.getOperand(0).getValueType();
55841
55842   // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55843   // those are the only integer compares we have.
55844 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55845 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55846 return SDValue();
55847
55848 // Only do this combine if the extension will be fully consumed by the setcc.
55849 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55850 if (Size != MatchingVecType.getSizeInBits())
55851 return SDValue();
55852
55853 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55854
55855 if (N->getOpcode() == ISD::ZERO_EXTEND)
55856 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55857
55858 return Res;
55859}
55860
55863 const X86Subtarget &Subtarget) {
55864 SDValue N0 = N->getOperand(0);
55865 EVT VT = N->getValueType(0);
55866 SDLoc DL(N);
55867
55868 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55869 if (!DCI.isBeforeLegalizeOps() &&
55870       N0.getOpcode() == X86ISD::SETCC_CARRY) {
55871     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55872 N0->getOperand(1));
55873 bool ReplaceOtherUses = !N0.hasOneUse();
55874 DCI.CombineTo(N, Setcc);
55875 // Replace other uses with a truncate of the widened setcc_carry.
55876 if (ReplaceOtherUses) {
55877 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55878 N0.getValueType(), Setcc);
55879 DCI.CombineTo(N0.getNode(), Trunc);
55880 }
55881
55882 return SDValue(N, 0);
55883 }
55884
55885 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55886 return NewCMov;
55887
55888 if (!DCI.isBeforeLegalizeOps())
55889 return SDValue();
55890
55891 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55892 return V;
55893
55894 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55895 DAG, DCI, Subtarget))
55896 return V;
55897
55898 if (VT.isVector()) {
55899 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55900 return R;
55901
55903 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55904 }
55905
55906 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55907 return NewAdd;
55908
55909 return SDValue();
55910}
55911
55912// Inverting a constant vector is profitable if it can be eliminated and the
55913// inverted vector is already present in DAG. Otherwise, it will be loaded
55914// anyway.
55915//
55916// We determine which of the values can be completely eliminated and invert it.
55917// If both are eliminable, select a vector with the first negative element.
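// For example (illustrative): if V = <2.0, -3.0> and <-2.0, 3.0> both exist in
// the DAG and both are used only by FMAs, the vector whose first element is
// negative is chosen, so repeated calls converge on a single form.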
55918 static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
55919   assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
55920          "ConstantFP build vector expected");
55921   // Check if we can eliminate V. We assume that if a value is only used in
55922   // FMAs, we can eliminate it, since this function is invoked for each FMA
55923   // with this vector.
55924 auto IsNotFMA = [](SDNode *User) {
55925 return User->getOpcode() != ISD::FMA &&
55926 User->getOpcode() != ISD::STRICT_FMA;
55927 };
55928 if (llvm::any_of(V->users(), IsNotFMA))
55929 return SDValue();
55930
55931   SmallVector<SDValue, 8> Ops;
55932   EVT VT = V.getValueType();
55933 EVT EltVT = VT.getVectorElementType();
55934 for (const SDValue &Op : V->op_values()) {
55935 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55936 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55937 } else {
55938 assert(Op.isUndef());
55939 Ops.push_back(DAG.getUNDEF(EltVT));
55940 }
55941 }
55942
55943 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
55944 if (!NV)
55945 return SDValue();
55946
55947 // If an inverted version cannot be eliminated, choose it instead of the
55948 // original version.
55949 if (llvm::any_of(NV->users(), IsNotFMA))
55950 return SDValue(NV, 0);
55951
55952   // If the inverted version can also be eliminated, we have to consistently
55953   // prefer one of the values. We prefer the constant whose first element is
55954   // negative.
55955 // N.B. We need to skip undefs that may precede a value.
55956 for (const SDValue &Op : V->op_values()) {
55957 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55958 if (Cst->isNegative())
55959 return SDValue();
55960 break;
55961 }
55962 }
55963 return SDValue(NV, 0);
55964}
55965
55968 const X86Subtarget &Subtarget) {
55969 SDLoc dl(N);
55970 EVT VT = N->getValueType(0);
55971   const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55972   bool IsStrict = N->isTargetOpcode()
55973 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
55974 : N->isStrictFPOpcode();
55975
55976 // Let legalize expand this if it isn't a legal type yet.
55977 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55978 if (!TLI.isTypeLegal(VT))
55979 return SDValue();
55980
55981 SDValue A = N->getOperand(IsStrict ? 1 : 0);
55982 SDValue B = N->getOperand(IsStrict ? 2 : 1);
55983 SDValue C = N->getOperand(IsStrict ? 3 : 2);
55984
55985 // If the operation allows fast-math and the target does not support FMA,
55986 // split this into mul+add to avoid libcall(s).
55987 SDNodeFlags Flags = N->getFlags();
55988 if (!IsStrict && Flags.hasAllowReassociation() &&
55989 TLI.isOperationExpand(ISD::FMA, VT)) {
55990 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
55991 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
55992 }
55993
55994 EVT ScalarVT = VT.getScalarType();
55995 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
55996 !Subtarget.hasAnyFMA()) &&
55997 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
55998 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
55999 return SDValue();
56000
56001 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56002     bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56003     bool LegalOperations = !DCI.isBeforeLegalizeOps();
56004 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56005 CodeSize)) {
56006 V = NegV;
56007 return true;
56008 }
56009 // Look through extract_vector_elts. If it comes from an FNEG, create a
56010 // new extract from the FNEG input.
56011 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56012 isNullConstant(V.getOperand(1))) {
56013 SDValue Vec = V.getOperand(0);
56014 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56015 Vec, DAG, LegalOperations, CodeSize)) {
56016 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56017 NegV, V.getOperand(1));
56018 return true;
56019 }
56020 }
56021 // Lookup if there is an inverted version of constant vector V in DAG.
56022 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56023 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56024 V = NegV;
56025 return true;
56026 }
56027 }
56028 return false;
56029 };
56030
56031 // Do not convert the passthru input of scalar intrinsics.
56032 // FIXME: We could allow negations of the lower element only.
56033 bool NegA = invertIfNegative(A);
56034 // Create a dummy use for A so that in the process of negating B or C
56035 // recursively, it is not deleted.
56036 HandleSDNode NegAHandle(A);
56037 bool NegB = invertIfNegative(B);
56038 // Similar to A, get a handle on B.
56039 HandleSDNode NegBHandle(B);
56040 bool NegC = invertIfNegative(C);
56041
56042 if (!NegA && !NegB && !NegC)
56043 return SDValue();
56044
56045 unsigned NewOpcode =
56046 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56047
56048 // Propagate fast-math-flags to new FMA node.
56049 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56050 if (IsStrict) {
56051 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56052 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56053 {N->getOperand(0), A, B, C});
56054 } else {
56055 if (N->getNumOperands() == 4)
56056 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56057 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56058 }
56059}
56060
56061// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56062// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
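// In each lane, subtracting -C is the same as adding C (and vice versa), so,
// for example (illustrative), FMADDSUB(A, B, FNEG(C)) produces lane-for-lane
// the same result as FMSUBADD(A, B, C) and the FNEG can be dropped.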
56065 SDLoc dl(N);
56066 EVT VT = N->getValueType(0);
56067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56068   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56069   bool LegalOperations = !DCI.isBeforeLegalizeOps();
56070
56071 SDValue N2 = N->getOperand(2);
56072
56073 SDValue NegN2 =
56074 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56075 if (!NegN2)
56076 return SDValue();
56077 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56078
56079 if (N->getNumOperands() == 4)
56080 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56081 NegN2, N->getOperand(3));
56082 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56083 NegN2);
56084}
56085
56086 // Try to widen the build vector and bitcast it to the type of the zext.
56087 // This is a special case for the 128-bit vector types. The intention is to
56088 // remove the zext and replace it with a bitcast to the wider type. While
56089 // lowering, the bitcast is removed and the extra computation due to the zext
56090 // is avoided. For example:
56091 // zext v4i16 ( v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 ( v8i8
56092 // build_vector (x, 0, y, 0, z, 0, w, 0))
56093 static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56094
56095 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56096 return SDValue();
56097
56098 EVT ExtendVT = Extend->getValueType(0);
56099
56100 SDValue BV = Extend->getOperand(0);
56101 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56102 return SDValue();
56103
56104 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56105 // If the build vector has undef elements, we cannot widen it.
56106 // The widening would create a vector with more undef elements, which
56107 // is not valid.
56108 return SDValue();
56109 }
56110
56111 if (!all_of(BV->op_values(),
56112 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56113     // If the build vector has any element other than an ISD::LOAD, we cannot
56114     // widen it.
56115 return SDValue();
56116 }
56117
56118 SDLoc dl(BV);
56119 EVT VT = BV.getValueType();
56120 EVT EltVT = BV.getOperand(0).getValueType();
56121 unsigned NumElts = VT.getVectorNumElements();
56122
56123 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56124
56125 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56126       TargetLowering::TypeWidenVector)
56127     return SDValue();
56128
56129 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56130 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56131
56132 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56133 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56134 // Fill the new elements with Zero.
56135 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56136 // Compute the step to place the elements in the right place and control the
56137 // iteration.
56138 unsigned step = WidenNumElts / NumElts;
56139 if (WidenVT.is128BitVector()) {
56140 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56141 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56142 i--, j -= step) {
56143 SDValue temp = NewOps[i];
56144 NewOps[i] = NewOps[j];
56145 NewOps[j] = temp;
56146 }
56147 // Create new build vector with WidenVT and NewOps
56148 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56149 // Replace the old build vector with the new one. Bitcast the
56150 // new build vector to the type of the zext.
56151 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56152 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56153 return NewBV;
56154 }
56155 }
56156 return SDValue();
56157}
56158
56161 const X86Subtarget &Subtarget) {
56162 SDLoc dl(N);
56163 SDValue N0 = N->getOperand(0);
56164 EVT VT = N->getValueType(0);
56165
56166 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56167 // FIXME: Is this needed? We don't seem to have any tests for it.
56168 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56169       N0.getOpcode() == X86ISD::SETCC_CARRY) {
56170     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56171 N0->getOperand(1));
56172 bool ReplaceOtherUses = !N0.hasOneUse();
56173 DCI.CombineTo(N, Setcc);
56174 // Replace other uses with a truncate of the widened setcc_carry.
56175 if (ReplaceOtherUses) {
56176 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56177 N0.getValueType(), Setcc);
56178 DCI.CombineTo(N0.getNode(), Trunc);
56179 }
56180
56181 return SDValue(N, 0);
56182 }
56183
56184 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56185 return NewCMov;
56186
56187 if (DCI.isBeforeLegalizeOps())
56188 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56189 return V;
56190
56191 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56192 DAG, DCI, Subtarget))
56193 return V;
56194
56195 if (VT.isVector())
56196 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56197 return R;
56198
56199 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56200 return NewAdd;
56201
56202 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56203 return R;
56204
56205 // TODO: Combine with any target/faux shuffle.
56206 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56208 SDValue N00 = N0.getOperand(0);
56209 SDValue N01 = N0.getOperand(1);
56210 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56211 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56212 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56213 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56214 return concatSubVectors(N00, N01, DAG, dl);
56215 }
56216 }
56217
56218 if (SDValue V = widenBuildVec(N, DAG))
56219 return V;
56220
56221 return SDValue();
56222}
56223
56224/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56225/// pre-promote its result type since vXi1 vectors don't get promoted
56226/// during type legalization.
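/// A minimal sketch (illustrative types): on AVX512F without BWI,
///   (v16i1 setcc (v16i8 X), (v16i8 Y), cc)
/// is rewritten as (v16i1 trunc (v16i8 setcc X, Y, cc)), so the compare is
/// performed in the wider byte type before the i1 result is formed.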
56227 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56228                                         SDValue RHS, ISD::CondCode CC,
56229 const SDLoc &DL, SelectionDAG &DAG,
56230 const X86Subtarget &Subtarget) {
56231 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56232 VT.getVectorElementType() == MVT::i1 &&
56233 (OpVT.getVectorElementType() == MVT::i8 ||
56234 OpVT.getVectorElementType() == MVT::i16)) {
56235 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56236 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56237 }
56238 return SDValue();
56239}
56240
56241// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56242// eq/ne) is generated when using an integer as a mask. Instead of generating a
56243// broadcast + vptest, we can directly move the integer to a mask register.
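// For example (illustrative): with an i8 mask value m broadcast and ANDed with
// <1, 2, 4, ..., 128>, a compare against zero just tests the individual bits
// of m, so m can be moved straight into a k-register instead of materializing
// the broadcast and a vector compare.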
56244 static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56245                                         const SDLoc &DL, SelectionDAG &DAG,
56246 const X86Subtarget &Subtarget) {
56247 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56248 return SDValue();
56249
56250 if (!Subtarget.hasAVX512())
56251 return SDValue();
56252
56253 if (Op0.getOpcode() != ISD::AND)
56254 return SDValue();
56255
56256 SDValue Broadcast = Op0.getOperand(0);
56257 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56258 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56259 return SDValue();
56260
56261 SDValue Load = Op0.getOperand(1);
56262 EVT LoadVT = Load.getSimpleValueType();
56263
56264 APInt UndefElts;
56265 SmallVector<APInt, 32> EltBits;
56266   if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56267                                      UndefElts, EltBits,
56268 /*AllowWholeUndefs*/ true,
56269 /*AllowPartialUndefs*/ false) ||
56270 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56271 return SDValue();
56272
56273 // Check if the constant pool contains only powers of 2 starting from some
56274 // 2^N. The table may also contain undefs because of widening of vector
56275 // operands.
56276 unsigned N = EltBits[0].logBase2();
56277 unsigned Len = UndefElts.getBitWidth();
56278 for (unsigned I = 1; I != Len; ++I) {
56279 if (UndefElts[I]) {
56280 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56281 return SDValue();
56282 break;
56283 }
56284
56285 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56286 return SDValue();
56287 }
56288
56289 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56290 SDValue BroadcastOp;
56291 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56292 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56293 Broadcast, DAG.getVectorIdxConstant(0, DL));
56294 } else {
56295 BroadcastOp = Broadcast.getOperand(0);
56296 if (BroadcastOp.getValueType().isVector())
56297 return SDValue();
56298 }
56299
56300 SDValue Masked = BroadcastOp;
56301 if (N != 0) {
56302 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56303 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56304
56305 if (NumDefinedElts > BroadcastOpBitWidth)
56306 return SDValue();
56307
56308 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56309 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56310 DAG.getConstant(N, DL, BroadcastOpVT));
56311 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56312 DAG.getConstant(Mask, DL, BroadcastOpVT));
56313 }
56314 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56315 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56316 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56317 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56318
56319 if (CC == ISD::SETEQ)
56320 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56321
56322 if (VT != MVT::v16i1)
56323 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56324 DAG.getVectorIdxConstant(0, DL));
56325
56326 return Bitcast;
56327}
56328
56331 const X86Subtarget &Subtarget) {
56332 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56333 const SDValue LHS = N->getOperand(0);
56334 const SDValue RHS = N->getOperand(1);
56335 EVT VT = N->getValueType(0);
56336 EVT OpVT = LHS.getValueType();
56337 SDLoc DL(N);
56338
56339 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56340 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56341 Subtarget))
56342 return V;
56343 }
56344
56345 if (VT == MVT::i1) {
56346 X86::CondCode X86CC;
56347 if (SDValue V =
56348 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56349 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56350 }
56351
56352 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56353 if (OpVT.isScalarInteger()) {
56354 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56355 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56356 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56357 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56358 if (N0.getOperand(0) == N1)
56359 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56360 N0.getOperand(1));
56361 if (N0.getOperand(1) == N1)
56362 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56363 N0.getOperand(0));
56364 }
56365 return SDValue();
56366 };
56367 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56368 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56369 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56370 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56371
56372 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56373 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56374 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56375 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56376 if (N0.getOperand(0) == N1)
56377 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56378 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56379 if (N0.getOperand(1) == N1)
56380 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56381 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56382 }
56383 return SDValue();
56384 };
56385 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56386 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56387 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56388 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56389
56390 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56391 // cmpne(trunc(x),C) --> cmpne(x,C)
56392 // iff x upper bits are zero.
56393 if (LHS.getOpcode() == ISD::TRUNCATE &&
56394 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56395 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
56396 EVT SrcVT = LHS.getOperand(0).getValueType();
56397         APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56398                                                 OpVT.getScalarSizeInBits());
56399 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56400 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56401 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56402 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56403 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56404 }
56405
56406 // With C as a power of 2 and C != 0 and C != INT_MIN:
56407 // icmp eq Abs(X) C ->
56408 // (icmp eq A, C) | (icmp eq A, -C)
56409 // icmp ne Abs(X) C ->
56410 // (icmp ne A, C) & (icmp ne A, -C)
56411 // Both of these patterns can be better optimized in
56412 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56413 // integers which is checked above.
56414 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56415 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56416 const APInt &CInt = C->getAPIntValue();
56417 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56418 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56419 SDValue BaseOp = LHS.getOperand(0);
56420 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56421 SDValue SETCC1 = DAG.getSetCC(
56422 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56423 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56424 SETCC0, SETCC1);
56425 }
56426 }
56427 }
56428 }
56429 }
56430
56431 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56432 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56433 // Using temporaries to avoid messing up operand ordering for later
56434 // transformations if this doesn't work.
56435 SDValue Op0 = LHS;
56436 SDValue Op1 = RHS;
56437 ISD::CondCode TmpCC = CC;
56438 // Put build_vector on the right.
56439 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56440 std::swap(Op0, Op1);
56441 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56442 }
56443
56444 bool IsSEXT0 =
56445 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56446 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56447 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56448
56449 if (IsSEXT0 && IsVZero1) {
56450 assert(VT == Op0.getOperand(0).getValueType() &&
56451 "Unexpected operand type");
56452 if (TmpCC == ISD::SETGT)
56453 return DAG.getConstant(0, DL, VT);
56454 if (TmpCC == ISD::SETLE)
56455 return DAG.getConstant(1, DL, VT);
56456 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56457 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56458
56459 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56460 "Unexpected condition code!");
56461 return Op0.getOperand(0);
56462 }
56463
56464 if (IsVZero1)
56465 if (SDValue V =
56466 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56467 return V;
56468 }
56469
56470   // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
56471   // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
56472   // better to use `PCMPGT` if the result is meant to stay in a vector (and if
56473   // it's going to a mask, there are signed AVX512 comparisons).
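  // For example (illustrative): if X is known non-negative,
  // (setult X, splat(7)) can simply become (setlt X, splat(7)); SETULE/SETUGE
  // are first nudged by +/-1 via incDecVectorConstant below so they also map
  // onto the signed predicates.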
56474 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56475 bool CanMakeSigned = false;
56476 if (ISD::isUnsignedIntSetCC(CC)) {
56477       KnownBits CmpKnown =
56478           DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56479 // If we know LHS/RHS share the same sign bit at each element we can
56480 // make this signed.
56481 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56482 // across all lanes. So a pattern where the sign varies from lane to
56483 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56484 // missed. We could get around this by demanding each lane
56485 // independently, but this isn't the most important optimization and
56486 // that may eat into compile time.
56487 CanMakeSigned =
56488 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56489 }
56490 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56491 SDValue LHSOut = LHS;
56492 SDValue RHSOut = RHS;
56493 ISD::CondCode NewCC = CC;
56494 switch (CC) {
56495 case ISD::SETGE:
56496 case ISD::SETUGE:
56497 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56498 /*NSW*/ true))
56499 LHSOut = NewLHS;
56500 else if (SDValue NewRHS = incDecVectorConstant(
56501 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56502 RHSOut = NewRHS;
56503 else
56504 break;
56505
56506 [[fallthrough]];
56507 case ISD::SETUGT:
56508 NewCC = ISD::SETGT;
56509 break;
56510
56511 case ISD::SETLE:
56512 case ISD::SETULE:
56513 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56514 /*NSW*/ true))
56515 LHSOut = NewLHS;
56516 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56517 /*NSW*/ true))
56518 RHSOut = NewRHS;
56519 else
56520 break;
56521
56522 [[fallthrough]];
56523 case ISD::SETULT:
56524 // Will be swapped to SETGT in LowerVSETCC*.
56525 NewCC = ISD::SETLT;
56526 break;
56527 default:
56528 break;
56529 }
56530 if (NewCC != CC) {
56531 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56532 NewCC, DL, DAG, Subtarget))
56533 return R;
56534 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56535 }
56536 }
56537 }
56538
56539 if (SDValue R =
56540 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56541 return R;
56542
56543   // The middle end transforms:
56544 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56545 // -> `(icmp ult (add x, -C), 2)`
56546 // Likewise inverted cases with `ugt`.
56547 //
56548   // Since x86, pre-AVX512, doesn't have unsigned vector compares, this results
56549   // in worse codegen. So, undo the middle-end transform and go back to `(or
56550 // (icmp eq), (icmp eq))` form.
56551 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56552 // the xmm approach.
56553 //
56554   // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56555   // ne))` as it doesn't end up being a net win in instruction count.
56556 // TODO: We might want to do this for avx512 as well if we `sext` the result.
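  // For example (illustrative): (or (icmp eq X, 5), (icmp eq X, 6)) arrives
  // here as (icmp ult (add X, -5), 2); the code below recovers C0 = 5 and
  // C1 = 6 and emits two PCMPEQs joined by an OR.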
56557 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56558 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56559 !Subtarget.hasAVX512() &&
56560 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56561 Subtarget.hasAVX2()) &&
56562 LHS.hasOneUse()) {
56563
56564 APInt CmpC;
56565 SDValue AddC = LHS.getOperand(1);
56566 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56568 // See which form we have depending on the constant/condition.
56569 SDValue C0 = SDValue();
56570 SDValue C1 = SDValue();
56571
56572       // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56573       // we will end up generating an additional constant. Keeping it in the
56574       // current form has a slight latency cost, but it's probably worth saving
56575       // a constant.
56578 // Pass
56579 }
56580 // Normal Cases
56581 else if ((CC == ISD::SETULT && CmpC == 2) ||
56582 (CC == ISD::SETULE && CmpC == 1)) {
56583 // These will constant fold.
56584 C0 = DAG.getNegative(AddC, DL, OpVT);
56585 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56586 DAG.getAllOnesConstant(DL, OpVT));
56587 }
56588 // Inverted Cases
56589 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56590 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56591 // These will constant fold.
56592 C0 = DAG.getNOT(DL, AddC, OpVT);
56593 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56594 DAG.getAllOnesConstant(DL, OpVT));
56595 }
56596 if (C0 && C1) {
56597 SDValue NewLHS =
56598 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56599 SDValue NewRHS =
56600 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56601 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56602 }
56603 }
56604 }
56605
56606 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56607 // to avoid scalarization via legalization because v4i32 is not a legal type.
56608 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56609 LHS.getValueType() == MVT::v4f32)
56610 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56611
56612 // X pred 0.0 --> X pred -X
56613 // If the negation of X already exists, use it in the comparison. This removes
56614 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56615 // instructions in patterns with a 'select' node.
56617 SDVTList FNegVT = DAG.getVTList(OpVT);
56618 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56619 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56620 }
56621
56622 return SDValue();
56623}
56624
56627 const X86Subtarget &Subtarget) {
56628 SDValue Src = N->getOperand(0);
56629 MVT SrcVT = Src.getSimpleValueType();
56630 MVT VT = N->getSimpleValueType(0);
56631 unsigned NumBits = VT.getScalarSizeInBits();
56632 unsigned NumElts = SrcVT.getVectorNumElements();
56633 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56634 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56635
56636 // Perform constant folding.
56637 APInt UndefElts;
56638 SmallVector<APInt, 32> EltBits;
56639 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56640 /*AllowWholeUndefs*/ true,
56641 /*AllowPartialUndefs*/ true)) {
56642 APInt Imm(32, 0);
56643 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56644 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56645 Imm.setBit(Idx);
56646
56647 return DAG.getConstant(Imm, SDLoc(N), VT);
56648 }
56649
56650 // Look through int->fp bitcasts that don't change the element width.
56651 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56652 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56653 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56654 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56655
56656 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56657 // with scalar comparisons.
56658 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56659 SDLoc DL(N);
56660 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56661 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56662 return DAG.getNode(ISD::XOR, DL, VT,
56663 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56664 DAG.getConstant(NotMask, DL, VT));
56665 }
56666
56667 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56668 // results with scalar comparisons.
56669 if (Src.getOpcode() == X86ISD::PCMPGT &&
56670 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56671 SDLoc DL(N);
56672 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56673 return DAG.getNode(ISD::XOR, DL, VT,
56674 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56675 DAG.getConstant(NotMask, DL, VT));
56676 }
56677
56678 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56679 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56680 // iff pow2splat(c1).
56681 // Use KnownBits to determine if only a single bit is non-zero
56682 // in each element (pow2 or zero), and shift that bit to the msb.
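  // For example (illustrative, vXi8 lanes): for
  //   movmsk(pcmpeq(and(x, splat(0x10)), splat(0x10)))
  // only bit 4 of each lane matters, so shifting it into the sign bit
  // (a left shift by 3) lets MOVMSK read it without the AND/PCMPEQ pair.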
56683 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56684 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56685 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56686 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56687 if (KnownLHS.countMaxPopulation() == 1 &&
56688 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56689 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56690 SDLoc DL(N);
56691 MVT ShiftVT = SrcVT;
56692 SDValue ShiftLHS = Src.getOperand(0);
56693 SDValue ShiftRHS = Src.getOperand(1);
56694 if (ShiftVT.getScalarType() == MVT::i8) {
56695 // vXi8 shifts - we only care about the signbit so can use PSLLW.
56696 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56697 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56698 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56699 }
56700 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56701 ShiftLHS, ShiftAmt, DAG);
56702 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56703 ShiftRHS, ShiftAmt, DAG);
56704 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56705 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56706 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56707 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56708 }
56709 }
56710
56711 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56712 if (N->isOnlyUserOf(Src.getNode())) {
56714 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56715 APInt UndefElts;
56716 SmallVector<APInt, 32> EltBits;
56717 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56718 UndefElts, EltBits)) {
56719 APInt Mask = APInt::getZero(NumBits);
56720 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56721 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56722 Mask.setBit(Idx);
56723 }
56724 SDLoc DL(N);
56725 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56726 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56727 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56728 DAG.getConstant(Mask, DL, VT));
56729 }
56730 }
56731 }
56732
56733 // Simplify the inputs.
56734 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56735 APInt DemandedMask(APInt::getAllOnes(NumBits));
56736 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56737 return SDValue(N, 0);
56738
56739 return SDValue();
56740}
56741
56744 const X86Subtarget &Subtarget) {
56745 MVT VT = N->getSimpleValueType(0);
56746 unsigned NumBits = VT.getScalarSizeInBits();
56747
56748 // Simplify the inputs.
56749 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56750 APInt DemandedMask(APInt::getAllOnes(NumBits));
56751 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56752 return SDValue(N, 0);
56753
56754 return SDValue();
56755}
56756
56759 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
56760 SDValue Mask = MemOp->getMask();
56761
56762 // With vector masks we only demand the upper bit of the mask.
56763 if (Mask.getScalarValueSizeInBits() != 1) {
56764 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56765 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56766 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56767 if (N->getOpcode() != ISD::DELETED_NODE)
56768 DCI.AddToWorklist(N);
56769 return SDValue(N, 0);
56770 }
56771 }
56772
56773 return SDValue();
56774}
56775
56776 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
56777                                     SDValue Index, SDValue Base, SDValue Scale,
56778 SelectionDAG &DAG) {
56779 SDLoc DL(GorS);
56780
56781 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56782 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56783 Gather->getMask(), Base, Index, Scale } ;
56784 return DAG.getMaskedGather(Gather->getVTList(),
56785 Gather->getMemoryVT(), DL, Ops,
56786 Gather->getMemOperand(),
56787 Gather->getIndexType(),
56788 Gather->getExtensionType());
56789 }
56790 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56791 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56792 Scatter->getMask(), Base, Index, Scale };
56793 return DAG.getMaskedScatter(Scatter->getVTList(),
56794 Scatter->getMemoryVT(), DL,
56795 Ops, Scatter->getMemOperand(),
56796 Scatter->getIndexType(),
56797 Scatter->isTruncatingStore());
56798}
56799
56802 SDLoc DL(N);
56803 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56804 SDValue Index = GorS->getIndex();
56805 SDValue Base = GorS->getBasePtr();
56806 SDValue Scale = GorS->getScale();
56807 EVT IndexVT = Index.getValueType();
56808 EVT IndexSVT = IndexVT.getVectorElementType();
56809 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56811 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56812
56813 if (DCI.isBeforeLegalize()) {
56814 // Attempt to move shifted index into the address scale, allows further
56815 // index truncation below.
56816 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56817 isa<ConstantSDNode>(Scale)) {
56818 unsigned ScaleAmt = Scale->getAsZExtVal();
56819 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56820 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56821 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56822 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56823 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56824 if (N->getOpcode() != ISD::DELETED_NODE)
56825 DCI.AddToWorklist(N);
56826 return SDValue(N, 0);
56827 }
56828 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56829 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56830 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56831 SDValue ShAmt = Index.getOperand(1);
56832 SDValue NewShAmt =
56833 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56834 DAG.getConstant(1, DL, ShAmt.getValueType()));
56835 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56836 Index.getOperand(0), NewShAmt);
56837 SDValue NewScale =
56838 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56839 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56840 }
56841 }
56842 }
56843
56844     // Shrink indices if they are larger than 32 bits.
56845 // Only do this before legalize types since v2i64 could become v2i32.
56846 // FIXME: We could check that the type is legal if we're after legalize
56847 // types, but then we would need to construct test cases where that happens.
56848 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56849 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56850
56851     // FIXME: We could support more than just constant fold, but we need to be
56852     // careful with costing. A truncate that can be optimized out would be
56853 // fine. Otherwise we might only want to create a truncate if it avoids
56854 // a split.
56855 if (SDValue TruncIndex =
56856 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56857 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56858
56859 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
56860 // there are sufficient sign bits. Only do this before legalize types to
56861 // avoid creating illegal types in truncate.
56862 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56863 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56864 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56865 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56866 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56867 }
56868
56869 // Shrink if we remove an illegal type.
56870 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56871 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56872 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56873 }
56874 }
56875 }
56876
56877   // Try to move splat adders from the index operand to the base pointer
56878   // operand, taking care to multiply by the scale. We can only do
56879 // this when index element type is the same as the pointer type.
56880 // Otherwise we need to be sure the math doesn't wrap before the scale.
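  // For example (illustrative): gather(Base, Index + splat(16), Scale = 4) can
  // instead be addressed as gather(Base + 64, Index, Scale = 4), folding the
  // splat add into the scalar base pointer.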
56881 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56882 isa<ConstantSDNode>(Scale)) {
56883 uint64_t ScaleAmt = Scale->getAsZExtVal();
56884
56885 for (unsigned I = 0; I != 2; ++I)
56886 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56887 BitVector UndefElts;
56888 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56889 if (UndefElts.none()) {
56890 // If the splat value is constant we can add the scaled splat value
56891 // to the existing base.
56892 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56893 APInt Adder = C->getAPIntValue() * ScaleAmt;
56894 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56895 DAG.getConstant(Adder, DL, PtrVT));
56896 SDValue NewIndex = Index.getOperand(1 - I);
56897 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56898 }
56899 // For non-constant cases, limit this to non-scaled cases.
56900 if (ScaleAmt == 1) {
56901 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56902 SDValue NewIndex = Index.getOperand(1 - I);
56903 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56904 }
56905 }
56906 }
56907 // It's also possible base is just a constant. In that case, just
56908 // replace it with 0 and move the displacement into the index.
56909 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56910 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56911 // Combine the constant build_vector and the constant base.
56912 Splat =
56913 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56914 // Add to the other half of the original Index add.
56915 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56916 Index.getOperand(1 - I), Splat);
56917 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56918 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56919 }
56920 }
56921 }
56922
56923 if (DCI.isBeforeLegalizeOps()) {
56924 // Make sure the index is either i32 or i64
56925 if (IndexWidth != 32 && IndexWidth != 64) {
56926 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56927 IndexVT = IndexVT.changeVectorElementType(EltVT);
56928 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56929 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56930 }
56931 }
56932
56933 // With vector masks we only demand the upper bit of the mask.
56934 SDValue Mask = GorS->getMask();
56935 if (Mask.getScalarValueSizeInBits() != 1) {
56936 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56937 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56938 if (N->getOpcode() != ISD::DELETED_NODE)
56939 DCI.AddToWorklist(N);
56940 return SDValue(N, 0);
56941 }
56942 }
56943
56944 return SDValue();
56945}
56946
56947// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
56949 const X86Subtarget &Subtarget) {
56950 SDLoc DL(N);
56951 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
56952 SDValue EFLAGS = N->getOperand(1);
56953
56954 // Try to simplify the EFLAGS and condition code operands.
56955 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
56956 return getSETCC(CC, Flags, DL, DAG);
56957
56958 return SDValue();
56959}
56960
56961/// Optimize branch condition evaluation.
56963 const X86Subtarget &Subtarget) {
56964 SDLoc DL(N);
56965 SDValue EFLAGS = N->getOperand(3);
56966 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
56967
56968 // Try to simplify the EFLAGS and condition code operands.
56969 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
56970 // RAUW them under us.
56971 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
56972 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
56973 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
56974 N->getOperand(1), Cond, Flags);
56975 }
56976
56977 return SDValue();
56978}
56979
56980// TODO: Could we move this to DAGCombine?
56982 SelectionDAG &DAG) {
56983 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
56984   // to optimize away the operation when its input comes from a constant.
56985 //
56986 // The general transformation is:
56987 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
56988 // AND(VECTOR_CMP(x,y), constant2)
56989 // constant2 = UNARYOP(constant)
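  // For example (illustrative): (v4f32 sint_to_fp (and (vector_cmp X, Y),
  // splat(1))) can become (and (vector_cmp X, Y), splat(1.0f)) (with the
  // appropriate bitcasts), since each compare lane is all-zeros or all-ones
  // and the conversion of the constant folds away.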
56990
56991 // Early exit if this isn't a vector operation, the operand of the
56992 // unary operation isn't a bitwise AND, or if the sizes of the operations
56993 // aren't the same.
56994 EVT VT = N->getValueType(0);
56995 bool IsStrict = N->isStrictFPOpcode();
56996 unsigned NumEltBits = VT.getScalarSizeInBits();
56997 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56998 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
56999 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57000 VT.getSizeInBits() != Op0.getValueSizeInBits())
57001 return SDValue();
57002
57003 // Now check that the other operand of the AND is a constant. We could
57004 // make the transformation for non-constant splats as well, but it's unclear
57005 // that would be a benefit as it would not eliminate any operations, just
57006 // perform one more step in scalar code before moving to the vector unit.
57007 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57008 // Bail out if the vector isn't a constant.
57009 if (!BV->isConstant())
57010 return SDValue();
57011
57012 // Everything checks out. Build up the new and improved node.
57013 SDLoc DL(N);
57014 EVT IntVT = BV->getValueType(0);
57015 // Create a new constant of the appropriate type for the transformed
57016 // DAG.
57017 SDValue SourceConst;
57018 if (IsStrict)
57019 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57020 {N->getOperand(0), SDValue(BV, 0)});
57021 else
57022 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57023 // The AND node needs bitcasts to/from an integer vector type around it.
57024 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57025 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57026 MaskConst);
57027 SDValue Res = DAG.getBitcast(VT, NewAnd);
57028 if (IsStrict)
57029 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57030 return Res;
57031 }
57032
57033 return SDValue();
57034}
57035
57036/// If we are converting a value to floating-point, try to replace scalar
57037/// truncate of an extracted vector element with a bitcast. This tries to keep
57038/// the sequence on XMM registers rather than moving between vector and GPRs.
57039static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
57040 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57041 // to allow being called by any similar cast opcode.
57042 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57043 SDValue Trunc = N->getOperand(0);
57044 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57045 return SDValue();
57046
57047 SDValue ExtElt = Trunc.getOperand(0);
57048 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57049 !isNullConstant(ExtElt.getOperand(1)))
57050 return SDValue();
57051
57052 EVT TruncVT = Trunc.getValueType();
57053 EVT SrcVT = ExtElt.getValueType();
57054 unsigned DestWidth = TruncVT.getSizeInBits();
57055 unsigned SrcWidth = SrcVT.getSizeInBits();
57056 if (SrcWidth % DestWidth != 0)
57057 return SDValue();
57058
57059 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
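  // For illustration: with X : v2i64 and an i64->i32 truncate, this becomes
  //   (sitofp (extelt (bitcast X to v4i32), 0))
  // so the value stays in an XMM register.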
57060 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57061 unsigned VecWidth = SrcVecVT.getSizeInBits();
57062 unsigned NumElts = VecWidth / DestWidth;
57063 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57064 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57065 SDLoc DL(N);
57066 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57067 BitcastVec, ExtElt.getOperand(1));
57068 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57069}
57070
57071static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
57072 const X86Subtarget &Subtarget) {
57073 bool IsStrict = N->isStrictFPOpcode();
57074 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57075 EVT VT = N->getValueType(0);
57076 EVT InVT = Op0.getValueType();
57077
57078 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57079 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
57080 // if hasFP16 support:
57081 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57082 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57083 // else
57084 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57085 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57086 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57087 unsigned ScalarSize = InVT.getScalarSizeInBits();
57088 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57089 ScalarSize >= 64)
57090 return SDValue();
57091 SDLoc dl(N);
57092 EVT DstVT =
57093        EVT::getVectorVT(*DAG.getContext(),
57094 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57095 : ScalarSize < 32 ? MVT::i32
57096 : MVT::i64,
57097 InVT.getVectorNumElements());
57098 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57099 if (IsStrict)
57100 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57101 {N->getOperand(0), P});
57102 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57103 }
57104
57105 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57106 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57107 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57108 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57109 VT.getScalarType() != MVT::f16) {
57110 SDLoc dl(N);
57111 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57112 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57113
57114 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57115 if (IsStrict)
57116 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57117 {N->getOperand(0), P});
57118 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57119 }
57120
57121 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57122 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57123 // the optimization here.
57124 SDNodeFlags Flags = N->getFlags();
57125 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57126 if (IsStrict)
57127 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57128 {N->getOperand(0), Op0});
57129 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57130 }
57131
57132 return SDValue();
57133}
57134
57135static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
57136                               TargetLowering::DAGCombinerInfo &DCI,
57137 const X86Subtarget &Subtarget) {
57138 // First try to optimize away the conversion entirely when it's
57139 // conditionally from a constant. Vectors only.
57140 bool IsStrict = N->isStrictFPOpcode();
57141  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
57142 return Res;
57143
57144 // Now move on to more general possibilities.
57145 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57146 EVT VT = N->getValueType(0);
57147 EVT InVT = Op0.getValueType();
57148
57149 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57150 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
57151 // if hasFP16 support:
57152 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57153 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57154 // else
57155  //   SINT_TO_FP(vXi1~31)  -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
57156 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57157 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57158 unsigned ScalarSize = InVT.getScalarSizeInBits();
57159 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57160 ScalarSize >= 64)
57161 return SDValue();
57162 SDLoc dl(N);
57163 EVT DstVT =
57164        EVT::getVectorVT(*DAG.getContext(),
57165 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57166 : ScalarSize < 32 ? MVT::i32
57167 : MVT::i64,
57168 InVT.getVectorNumElements());
57169 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57170 if (IsStrict)
57171 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57172 {N->getOperand(0), P});
57173 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57174 }
57175
57176 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57177 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57178 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57179 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57180 VT.getScalarType() != MVT::f16) {
57181 SDLoc dl(N);
57182 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57183 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57184 if (IsStrict)
57185 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57186 {N->getOperand(0), P});
57187 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57188 }
57189
57190 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57191 // vectors and scalars, see if we know that the upper bits are all the sign
57192 // bit, in which case we can truncate the input to i32 and convert from that.
57193 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57194 unsigned BitWidth = InVT.getScalarSizeInBits();
57195 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57196 if (NumSignBits >= (BitWidth - 31)) {
57197 EVT TruncVT = MVT::i32;
57198 if (InVT.isVector())
57199 TruncVT = InVT.changeVectorElementType(TruncVT);
57200 SDLoc dl(N);
57201 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57202 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57203 if (IsStrict)
57204 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57205 {N->getOperand(0), Trunc});
57206 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57207 }
57208 // If we're after legalize and the type is v2i32 we need to shuffle and
57209 // use CVTSI2P.
57210 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57211 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57212 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57213 { 0, 2, -1, -1 });
57214 if (IsStrict)
57215 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57216 {N->getOperand(0), Shuf});
57217 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57218 }
57219 }
57220
57221 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57222 // a 32-bit target where SSE doesn't support i64->FP operations.
57223 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57224 Op0.getOpcode() == ISD::LOAD) {
57225 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57226
57227 // This transformation is not supported if the result type is f16 or f128.
57228 if (VT == MVT::f16 || VT == MVT::f128)
57229 return SDValue();
57230
57231 // If we have AVX512DQ we can use packed conversion instructions unless
57232 // the VT is f80.
57233 if (Subtarget.hasDQI() && VT != MVT::f80)
57234 return SDValue();
57235
57236 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57237 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57238 std::pair<SDValue, SDValue> Tmp =
57239 Subtarget.getTargetLowering()->BuildFILD(
57240 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57241 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57242 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57243 return Tmp.first;
57244 }
57245 }
57246
57247 if (IsStrict)
57248 return SDValue();
57249
57250 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57251 return V;
57252
57253 return SDValue();
57254}
57255
57256static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
57257 const X86Subtarget &Subtarget) {
57258 EVT VT = N->getValueType(0);
57259 SDValue Src = N->getOperand(0);
57260 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57261 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57262 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57263
57264 return SDValue();
57265}
57266
57267// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57268static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG,
57269 const X86Subtarget &Subtarget) {
57270 if (!Subtarget.hasAVX10_2())
57271 return SDValue();
57272
57273 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57274 EVT SrcVT = N->getOperand(0).getValueType();
57275 EVT DstVT = N->getValueType(0);
57276 SDLoc dl(N);
57277
57278 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57279 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57280
57281 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57282 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57283 N->getOperand(0), V2F32Value);
57284
57285 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57286 if (IsSigned)
57287 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57288
57289 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57290 }
57291 return SDValue();
57292}
57293
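// Returns true if any user of these EFLAGS needs the carry or overflow flag
// (A/AE/B/BE, O/NO, or the signed G/GE/L/LE conditions, which read OF);
// unknown users are conservatively treated as needing them.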
57294static bool needCarryOrOverflowFlag(SDValue Flags) {
57295  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57296
57297 for (const SDNode *User : Flags->users()) {
57298 X86::CondCode CC;
57299 switch (User->getOpcode()) {
57300 default:
57301 // Be conservative.
57302 return true;
57303 case X86ISD::SETCC:
57304    case X86ISD::SETCC_CARRY:
57305 CC = (X86::CondCode)User->getConstantOperandVal(0);
57306 break;
57307 case X86ISD::BRCOND:
57308 case X86ISD::CMOV:
57309 CC = (X86::CondCode)User->getConstantOperandVal(2);
57310 break;
57311 }
57312
57313 switch (CC) {
57314 // clang-format off
57315 default: break;
57316 case X86::COND_A: case X86::COND_AE:
57317 case X86::COND_B: case X86::COND_BE:
57318 case X86::COND_O: case X86::COND_NO:
57319 case X86::COND_G: case X86::COND_GE:
57320 case X86::COND_L: case X86::COND_LE:
57321 return true;
57322 // clang-format on
57323 }
57324 }
57325
57326 return false;
57327}
57328
57329static bool onlyZeroFlagUsed(SDValue Flags) {
57330 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57331
57332 for (const SDNode *User : Flags->users()) {
57333 unsigned CCOpNo;
57334 switch (User->getOpcode()) {
57335 default:
57336 // Be conservative.
57337 return false;
57338 case X86ISD::SETCC:
57339    case X86ISD::SETCC_CARRY:
57340 CCOpNo = 0;
57341 break;
57342 case X86ISD::BRCOND:
57343 case X86ISD::CMOV:
57344 CCOpNo = 2;
57345 break;
57346 }
57347
57348 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57349 if (CC != X86::COND_E && CC != X86::COND_NE)
57350 return false;
57351 }
57352
57353 return true;
57354}
57355
57356static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
57357                          TargetLowering::DAGCombinerInfo &DCI,
57358 const X86Subtarget &Subtarget) {
57359 // Only handle test patterns.
57360 if (!isNullConstant(N->getOperand(1)))
57361 return SDValue();
57362
57363 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57364 // and use its flags directly.
57365 // TODO: Maybe we should try promoting compares that only use the zero flag
57366 // first if we can prove the upper bits with computeKnownBits?
57367 SDLoc dl(N);
57368 SDValue Op = N->getOperand(0);
57369 EVT VT = Op.getValueType();
57370 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57371
57372 if (SDValue CMP =
57373 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57374 return CMP;
57375
57376 // If we have a constant logical shift that's only used in a comparison
57377 // against zero turn it into an equivalent AND. This allows turning it into
57378 // a TEST instruction later.
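  // For illustration on i32: (cmp (srl X, 8), 0) has the same zero flag as
  // (cmp (and X, 0xFFFFFF00), 0), which isel can lower as a TEST.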
57379 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57380 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57381 onlyZeroFlagUsed(SDValue(N, 0))) {
57382 unsigned BitWidth = VT.getSizeInBits();
57383 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57384 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57385 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57386 APInt Mask = Op.getOpcode() == ISD::SRL
57387 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57388 : APInt::getLowBitsSet(BitWidth, MaskBits);
57389 if (Mask.isSignedIntN(32)) {
57390 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57391 DAG.getConstant(Mask, dl, VT));
57392 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57393 DAG.getConstant(0, dl, VT));
57394 }
57395 }
57396 }
57397
57398  // If we're extracting from an avx512 bool vector and comparing against zero,
57399 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57400 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57401 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57402 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57403 SDValue Src = Op.getOperand(0);
57404 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57405 isNullConstant(Src.getOperand(1)) &&
57406 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57407 SDValue BoolVec = Src.getOperand(0);
57408 unsigned ShAmt = 0;
57409 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57410 ShAmt = BoolVec.getConstantOperandVal(1);
57411 BoolVec = BoolVec.getOperand(0);
57412 }
57413 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57414 EVT VecVT = BoolVec.getValueType();
57415 unsigned BitWidth = VecVT.getVectorNumElements();
57416 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57417 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57418 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57419 Op = DAG.getBitcast(BCVT, BoolVec);
57420 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57421 DAG.getConstant(Mask, dl, BCVT));
57422 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57423 DAG.getConstant(0, dl, BCVT));
57424 }
57425 }
57426 }
57427
57428 // Peek through any zero-extend if we're only testing for a zero result.
57429 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57430 SDValue Src = Op.getOperand(0);
57431 EVT SrcVT = Src.getValueType();
57432 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57433 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57434 DAG.getConstant(0, dl, SrcVT));
57435 }
57436
57437 // Look for a truncate.
57438 if (Op.getOpcode() != ISD::TRUNCATE)
57439 return SDValue();
57440
57441 SDValue Trunc = Op;
57442 Op = Op.getOperand(0);
57443
57444 // See if we can compare with zero against the truncation source,
57445 // which should help using the Z flag from many ops. Only do this for
57446 // i32 truncated op to prevent partial-reg compares of promoted ops.
57447 EVT OpVT = Op.getValueType();
57448 APInt UpperBits =
57449      APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
57450 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57451 onlyZeroFlagUsed(SDValue(N, 0))) {
57452 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57453 DAG.getConstant(0, dl, OpVT));
57454 }
57455
57456 // After this the truncate and arithmetic op must have a single use.
57457 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57458 return SDValue();
57459
57460 unsigned NewOpc;
57461 switch (Op.getOpcode()) {
57462 default: return SDValue();
57463 case ISD::AND:
57464 // Skip and with constant. We have special handling for and with immediate
57465 // during isel to generate test instructions.
57466 if (isa<ConstantSDNode>(Op.getOperand(1)))
57467 return SDValue();
57468 NewOpc = X86ISD::AND;
57469 break;
57470 case ISD::OR: NewOpc = X86ISD::OR; break;
57471 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57472 case ISD::ADD:
57473 // If the carry or overflow flag is used, we can't truncate.
57474    if (needCarryOrOverflowFlag(SDValue(N, 0)))
57475 return SDValue();
57476 NewOpc = X86ISD::ADD;
57477 break;
57478 case ISD::SUB:
57479 // If the carry or overflow flag is used, we can't truncate.
57480    if (needCarryOrOverflowFlag(SDValue(N, 0)))
57481 return SDValue();
57482 NewOpc = X86ISD::SUB;
57483 break;
57484 }
57485
57486 // We found an op we can narrow. Truncate its inputs.
57487 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57488 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57489
57490 // Use a X86 specific opcode to avoid DAG combine messing with it.
57491 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57492 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57493
57494 // For AND, keep a CMP so that we can match the test pattern.
57495 if (NewOpc == X86ISD::AND)
57496 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57497 DAG.getConstant(0, dl, VT));
57498
57499 // Return the flags.
57500 return Op.getValue(1);
57501}
57502
57503static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
57504                                TargetLowering::DAGCombinerInfo &DCI,
57505 const X86Subtarget &ST) {
57506 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57507 "Expected X86ISD::ADD or X86ISD::SUB");
57508
57509 SDLoc DL(N);
57510 SDValue LHS = N->getOperand(0);
57511 SDValue RHS = N->getOperand(1);
57512 MVT VT = LHS.getSimpleValueType();
57513 bool IsSub = X86ISD::SUB == N->getOpcode();
57514 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57515
57516 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57517 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57518 return CMP;
57519
57520 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57521 if (!N->hasAnyUseOfValue(1)) {
57522 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57523 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57524 }
57525
57526 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57527 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57528 SDValue Ops[] = {N0, N1};
57529 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57530 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57531 SDValue Op(N, 0);
57532 if (Negate) {
57533 // Bail if this is only used by a user of the x86 add/sub.
57534 if (GenericAddSub->hasOneUse() &&
57535 GenericAddSub->user_begin()->isOnlyUserOf(N))
57536 return;
57537 Op = DAG.getNegative(Op, DL, VT);
57538 }
57539 DCI.CombineTo(GenericAddSub, Op);
57540 }
57541 };
57542 MatchGeneric(LHS, RHS, false);
57543 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57544
57545 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57546 // EFLAGS result doesn't change.
57547 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57548 /*ZeroSecondOpOnly*/ true);
57549}
57550
57551static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
57552 SDValue LHS = N->getOperand(0);
57553 SDValue RHS = N->getOperand(1);
57554 SDValue BorrowIn = N->getOperand(2);
57555
57556 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57557 MVT VT = N->getSimpleValueType(0);
57558 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57559 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57560 }
57561
57562 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57563 // iff the flag result is dead.
57564 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57565 !N->hasAnyUseOfValue(1))
57566 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57567 LHS.getOperand(1), BorrowIn);
57568
57569 return SDValue();
57570}
57571
57572// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57573static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
57574                          TargetLowering::DAGCombinerInfo &DCI) {
57575 SDValue LHS = N->getOperand(0);
57576 SDValue RHS = N->getOperand(1);
57577 SDValue CarryIn = N->getOperand(2);
57578 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57579 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57580
57581 // Canonicalize constant to RHS.
57582 if (LHSC && !RHSC)
57583 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57584 CarryIn);
57585
57586 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57587 // the result is either zero or one (depending on the input carry bit).
57588 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57589 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57590 // We don't have a good way to replace an EFLAGS use, so only do this when
57591 // dead right now.
57592 SDValue(N, 1).use_empty()) {
57593 SDLoc DL(N);
57594 EVT VT = N->getValueType(0);
57595 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57596 SDValue Res1 = DAG.getNode(
57597 ISD::AND, DL, VT,
57598        DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
57599 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57600 DAG.getConstant(1, DL, VT));
57601 return DCI.CombineTo(N, Res1, CarryOut);
57602 }
57603
57604 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57605 // iff the flag result is dead.
57606 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57607 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57608 SDLoc DL(N);
57609 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57610 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57611 DAG.getConstant(0, DL, LHS.getValueType()),
57612 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57613 }
57614
57615 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57616 MVT VT = N->getSimpleValueType(0);
57617 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57618 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57619 }
57620
57621 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57622 // iff the flag result is dead.
57623 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57624 !N->hasAnyUseOfValue(1))
57625 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57626 LHS.getOperand(1), CarryIn);
57627
57628 return SDValue();
57629}
57630
57631static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
57632 const SDLoc &DL, EVT VT,
57633 const X86Subtarget &Subtarget) {
57634 using namespace SDPatternMatch;
57635
57636 // Example of pattern we try to detect:
57637 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57638 //(add (build_vector (extract_elt t, 0),
57639 // (extract_elt t, 2),
57640 // (extract_elt t, 4),
57641 // (extract_elt t, 6)),
57642 // (build_vector (extract_elt t, 1),
57643 // (extract_elt t, 3),
57644 // (extract_elt t, 5),
57645 // (extract_elt t, 7)))
57646
57647 if (!Subtarget.hasSSE2())
57648 return SDValue();
57649
57650 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57651 VT.getVectorNumElements() < 4 ||
57652      !isPowerOf2_32(VT.getVectorNumElements()))
57653 return SDValue();
57654
57655 SDValue Op0, Op1, Accum;
57656 if (!sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
57657 m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))) &&
57658 !sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
57659 m_Add(m_Value(Accum), m_AllOf(m_Opc(ISD::BUILD_VECTOR),
57660 m_Value(Op1))))))
57661 return SDValue();
57662
57663 // Check if one of Op0,Op1 is of the form:
57664 // (build_vector (extract_elt Mul, 0),
57665 // (extract_elt Mul, 2),
57666 // (extract_elt Mul, 4),
57667 // ...
57668 // the other is of the form:
57669 // (build_vector (extract_elt Mul, 1),
57670 // (extract_elt Mul, 3),
57671 // (extract_elt Mul, 5),
57672 // ...
57673 // and identify Mul.
57674 SDValue Mul;
57675 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57676 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57677 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57678 // TODO: Be more tolerant to undefs.
57679 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57680 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57681 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57682 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57683 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57684 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57685 return SDValue();
57686 // Commutativity of mul allows factors of a product to reorder.
57687 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57688 std::swap(Idx0L, Idx1L);
57689 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57690 std::swap(Idx0H, Idx1H);
57691 // Commutativity of add allows pairs of factors to reorder.
57692 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57693 std::swap(Idx0L, Idx0H);
57694 std::swap(Idx1L, Idx1H);
57695 }
57696 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57697 Idx1H != 2 * i + 3)
57698 return SDValue();
57699 if (!Mul) {
57700 // First time an extract_elt's source vector is visited. Must be a MUL
57701      // with 2X the number of vector elements of the BUILD_VECTOR.
57702      // Both extracts must be from the same MUL.
57703 Mul = Vec0L;
57704 if (Mul.getOpcode() != ISD::MUL ||
57705 Mul.getValueType().getVectorNumElements() != 2 * e)
57706 return SDValue();
57707 }
57708 // Check that the extract is from the same MUL previously seen.
57709 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57710 return SDValue();
57711 }
57712
57713 // Check if the Mul source can be safely shrunk.
57714 ShrinkMode Mode;
57715 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57716 Mode == ShrinkMode::MULU16)
57717 return SDValue();
57718
57719 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57720 VT.getVectorNumElements() * 2);
57721 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57722 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57723
57724 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57725 ArrayRef<SDValue> Ops) {
57726 EVT InVT = Ops[0].getValueType();
57727 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57728 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57729 InVT.getVectorNumElements() / 2);
57730 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57731 };
57732 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57733 if (Accum)
57734 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57735 return R;
57736}
57737
57738// Attempt to turn this pattern into PMADDWD.
57739// (add (mul (sext (build_vector)), (sext (build_vector))),
57740// (mul (sext (build_vector)), (sext (build_vector)))
57741static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
57742 const SDLoc &DL, EVT VT,
57743 const X86Subtarget &Subtarget) {
57744 using namespace SDPatternMatch;
57745
57746 if (!Subtarget.hasSSE2())
57747 return SDValue();
57748
57749 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57750 VT.getVectorNumElements() < 4 ||
57751      !isPowerOf2_32(VT.getVectorNumElements()))
57752 return SDValue();
57753
57754 // All inputs need to be sign extends.
57755 // TODO: Support ZERO_EXTEND from known positive?
57756 SDValue N00, N01, N10, N11;
57757 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57758 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57759 return SDValue();
57760
57761 // Must be extending from vXi16.
57762 EVT InVT = N00.getValueType();
57763 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57764 N10.getValueType() != InVT || N11.getValueType() != InVT)
57765 return SDValue();
57766
57767 // All inputs should be build_vectors.
57768 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57769 N01.getOpcode() != ISD::BUILD_VECTOR ||
57770 N10.getOpcode() != ISD::BUILD_VECTOR ||
57771      N11.getOpcode() != ISD::BUILD_VECTOR)
57772 return SDValue();
57773
57774 // For each element, we need to ensure we have an odd element from one vector
57775 // multiplied by the odd element of another vector and the even element from
57776 // one of the same vectors being multiplied by the even element from the
57777 // other vector. So we need to make sure for each element i, this operator
57778 // is being performed:
57779 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57780 SDValue In0, In1;
57781 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57782 SDValue N00Elt = N00.getOperand(i);
57783 SDValue N01Elt = N01.getOperand(i);
57784 SDValue N10Elt = N10.getOperand(i);
57785 SDValue N11Elt = N11.getOperand(i);
57786 // TODO: Be more tolerant to undefs.
57787 SDValue N00In, N01In, N10In, N11In;
57788 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57789 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57790 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57791 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57792 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57793 return SDValue();
57794 // Add is commutative so indices can be reordered.
57795 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57796 std::swap(IdxN00, IdxN10);
57797 std::swap(IdxN01, IdxN11);
57798 }
57799    // N0 indices must be the even element, N1 indices the next odd element.
57800 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57801 IdxN11 != 2 * i + 1)
57802 return SDValue();
57803
57804 // First time we find an input capture it.
57805 if (!In0) {
57806 In0 = N00In;
57807 In1 = N01In;
57808
57809 // The input vectors must be at least as wide as the output.
57810 // If they are larger than the output, we extract subvector below.
57811 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57812 In1.getValueSizeInBits() < VT.getSizeInBits())
57813 return SDValue();
57814 }
57815 // Mul is commutative so the input vectors can be in any order.
57816 // Canonicalize to make the compares easier.
57817 if (In0 != N00In)
57818 std::swap(N00In, N01In);
57819 if (In0 != N10In)
57820 std::swap(N10In, N11In);
57821 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57822 return SDValue();
57823 }
57824
57825 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57826 ArrayRef<SDValue> Ops) {
57827 EVT OpVT = Ops[0].getValueType();
57828 assert(OpVT.getScalarType() == MVT::i16 &&
57829 "Unexpected scalar element type");
57830 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57831 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57832 OpVT.getVectorNumElements() / 2);
57833 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57834 };
57835
57836 // If the output is narrower than an input, extract the low part of the input
57837 // vector.
57838 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57839 VT.getVectorNumElements() * 2);
57840 if (OutVT16.bitsLT(In0.getValueType())) {
57841 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57842 DAG.getVectorIdxConstant(0, DL));
57843 }
57844 if (OutVT16.bitsLT(In1.getValueType())) {
57845 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57846 DAG.getVectorIdxConstant(0, DL));
57847 }
57848 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57849 PMADDBuilder);
57850}
57851
57852// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57853// If the upper element in each pair of both VPMADDWD operands is zero then
57854// we can merge the operand elements and use the implicit add of VPMADDWD.
57855// TODO: Add support for VPMADDUBSW (which isn't commutable).
57856static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
57857 const SDLoc &DL, EVT VT) {
57858 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57859 return SDValue();
57860
57861 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57862 if (VT.getSizeInBits() > 128)
57863 return SDValue();
57864
57865 unsigned NumElts = VT.getVectorNumElements();
57866 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57867  APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
57868 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57869
57870 bool Op0HiZero =
57871 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57872 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57873 bool Op1HiZero =
57874 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57875 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57876
57877 // TODO: Check for zero lower elements once we have actual codegen that
57878 // creates them.
57879 if (!Op0HiZero || !Op1HiZero)
57880 return SDValue();
57881
57882 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57883 SmallVector<int> Mask;
57884 for (int i = 0; i != (int)NumElts; ++i) {
57885 Mask.push_back(2 * i);
57886 Mask.push_back(2 * (i + NumElts));
57887 }
57888
57889 SDValue LHS =
57890 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57891 SDValue RHS =
57892 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57893 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57894}
57895
57896/// CMOV of constants requires materializing constant operands in registers.
57897/// Try to fold those constants into an 'add' instruction to reduce instruction
57898/// count. We do this with CMOV rather than the generic 'select' because there are
57899/// earlier folds that may be used to turn select-of-constants into logic hacks.
57900static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
57901 SelectionDAG &DAG,
57902 const X86Subtarget &Subtarget) {
57903 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57904 // better because we eliminate 1-2 instructions. This transform is still
57905 // an improvement without zero operands because we trade 2 move constants and
57906 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57907 // immediate asm operands (fit in 32-bits).
57908 auto isSuitableCmov = [](SDValue V) {
57909 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57910 return false;
57911 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57912 !isa<ConstantSDNode>(V.getOperand(1)))
57913 return false;
57914 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57915 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57916 V.getConstantOperandAPInt(1).isSignedIntN(32));
57917 };
57918
57919 // Match an appropriate CMOV as the first operand of the add.
57920 SDValue Cmov = N->getOperand(0);
57921 SDValue OtherOp = N->getOperand(1);
57922 if (!isSuitableCmov(Cmov))
57923 std::swap(Cmov, OtherOp);
57924 if (!isSuitableCmov(Cmov))
57925 return SDValue();
57926
57927 // Don't remove a load folding opportunity for the add. That would neutralize
57928 // any improvements from removing constant materializations.
57929 if (X86::mayFoldLoad(OtherOp, Subtarget))
57930 return SDValue();
57931
57932 EVT VT = N->getValueType(0);
57933 SDValue FalseOp = Cmov.getOperand(0);
57934 SDValue TrueOp = Cmov.getOperand(1);
57935
57936 // We will push the add through the select, but we can potentially do better
57937 // if we know there is another add in the sequence and this is pointer math.
57938 // In that case, we can absorb an add into the trailing memory op and avoid
57939 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57940 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
57941 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
57942 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
57943 all_of(N->users(), [&](SDNode *Use) {
57944 auto *MemNode = dyn_cast<MemSDNode>(Use);
57945 return MemNode && MemNode->getBasePtr().getNode() == N;
57946 })) {
57947 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
57948 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
57949 // it is possible that choosing op1 might be better.
57950 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
57951 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
57952 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
57953 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
57954 Cmov.getOperand(2), Cmov.getOperand(3));
57955 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
57956 }
57957
57958 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
57959 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
57960 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
57961 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
57962 Cmov.getOperand(3));
57963}
57964
57965static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
57966                          TargetLowering::DAGCombinerInfo &DCI,
57967 const X86Subtarget &Subtarget) {
57968 using namespace SDPatternMatch;
57969 EVT VT = N->getValueType(0);
57970 SDValue Op0 = N->getOperand(0);
57971 SDValue Op1 = N->getOperand(1);
57972 SDLoc DL(N);
57973
57974 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
57975 return Select;
57976
57977 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
57978 return MAdd;
57979 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
57980 return MAdd;
57981 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
57982 return MAdd;
57983
57984 // Try to synthesize horizontal adds from adds of shuffles.
57985 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
57986 return V;
57987
57988 // Canonicalize hidden LEA pattern:
57989 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
57990 // iff c < 4
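  // For illustration: ((x << 2) - y) + z --> ((x << 2) + z) - y, so the
  // shift+add half can be selected as a scaled-index LEA (scales 1/2/4/8 are
  // why c must be below 4).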
57991 if (VT == MVT::i32 || VT == MVT::i64) {
57992 SDValue Y, Z, Shift;
57993 APInt Amt;
57994 if (sd_match(
57995 N, m_Add(m_OneUse(m_Sub(m_AllOf(m_Value(Shift),
57996 m_Shl(m_Value(), m_ConstInt(Amt))),
57997 m_Value(Y))),
57998 m_Value(Z))) &&
57999 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58000 return DAG.getNode(ISD::SUB, DL, VT,
58001 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58002 }
58003 }
58004
58005 SDValue X, Y;
58006
58007 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58008 // iff X and Y won't overflow.
58009 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58010 sd_match(Op1, m_c_BinOp(X86ISD::PSADBW, m_Value(Y), m_Zero())) &&
58011 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58012 MVT OpVT = X.getSimpleValueType();
58013 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58014 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58015 getZeroVector(OpVT, Subtarget, DAG, DL));
58016 }
58017
58018 if (VT.isVector()) {
58019 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58020                                  VT.getVectorNumElements());
58021
58022 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58023 // (sub Y, (sext (vXi1 X))).
58024 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58025 // in generic DAG combine without a legal type check, but adding this there
58026 // caused regressions.
58027 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58028 sd_match(N, m_Add(m_ZExt(m_AllOf(m_SpecificVT(BoolVT), m_Value(X))),
58029 m_Value(Y)))) {
58030 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58031 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58032 }
58033
58034 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58035 // canonicalisation as we don't have good vXi8 shifts.
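    // (srl Y, 7) is just the sign bit of each i8 lane, and sext(0 > Y) is its
    // negation, so X + (Y >>u 7) == X - sext(0 > Y).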
58036 if (VT.getScalarType() == MVT::i8 &&
58037 sd_match(N, m_Add(m_Value(X), m_Srl(m_Value(Y), m_SpecificInt(7))))) {
58038 SDValue Cmp =
58039 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58040 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58041 }
58042 }
58043
58044 // Peephole for 512-bit VPDPBSSD on non-VLX targets.
58045 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58046 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58047 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58048 if (sd_match(N, m_Add(m_Value(Accum),
58049 m_Node(ISD::CONCAT_VECTORS,
58050                                 m_BinOp(X86ISD::VPMADDWD, m_Value(Lo0),
58051 m_Value(Lo1)),
58052                                 m_BinOp(X86ISD::VPMADDWD, m_Value(Hi0),
58053 m_Value(Hi1)))))) {
58054 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58055 concatSubVectors(Lo0, Hi0, DAG, DL),
58056 concatSubVectors(Lo1, Hi1, DAG, DL));
58057 }
58058 }
58059
58060 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58061 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58062 X86::isZeroNode(Op0.getOperand(1))) {
58063 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58064 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58065 Op0.getOperand(0), Op0.getOperand(2));
58066 }
58067
58068 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58069}
58070
58071// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58072// condition comes from the subtract node that produced -X. This matches the
58073// cmov expansion for absolute value. By swapping the operands we convert abs
58074// to nabs.
58075static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58076 SelectionDAG &DAG) {
58077 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58078 return SDValue();
58079
58080 SDValue Cond = N1.getOperand(3);
58081 if (Cond.getOpcode() != X86ISD::SUB)
58082 return SDValue();
58083 assert(Cond.getResNo() == 1 && "Unexpected result number");
58084
58085 SDValue FalseOp = N1.getOperand(0);
58086 SDValue TrueOp = N1.getOperand(1);
58087  X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
58088
58089 // ABS condition should come from a negate operation.
58090 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58091 isNullConstant(Cond.getOperand(0))) {
58092 // Get the X and -X from the negate.
58093 SDValue NegX = Cond.getValue(0);
58094 SDValue X = Cond.getOperand(1);
58095
58096 // Cmov operands should be X and NegX. Order doesn't matter.
58097 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58098 return SDValue();
58099
58100 // Build a new CMOV with the operands swapped.
58101 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58102 N1.getOperand(2), Cond);
58103 // Convert sub to add.
58104 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58105 }
58106
58107 // Handle ABD special case:
58108 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58109 // ABD condition should come from a pair of matching subtracts.
58110 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58111 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58112 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58113 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58114 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58115 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58116 // Build a new CMOV with the operands swapped.
58117 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58118 Cond);
58119 }
58120
58121 return SDValue();
58122}
58123
58124static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58125 SDValue Op0 = N->getOperand(0);
58126 SDValue Op1 = N->getOperand(1);
58127
58128 // (sub C (zero_extend (setcc)))
58129 // =>
58130 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
58131 // Don't disturb (sub 0 setcc), which is easily done with neg.
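  // For illustration: (sub 5, (zext (setcc e, f))) --> (add (zext (setcc ne, f)), 4).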
58132 EVT VT = N->getValueType(0);
58133 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58134 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58135 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58136 Op1.getOperand(0).hasOneUse()) {
58137 SDValue SetCC = Op1.getOperand(0);
58138    X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
58139    X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
58140 APInt NewImm = Op0C->getAPIntValue() - 1;
58141 SDLoc DL(Op1);
58142 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58143 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58144 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58145 DAG.getConstant(NewImm, DL, VT));
58146 }
58147
58148 return SDValue();
58149}
58150
58151static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58152 if (N->getConstantOperandVal(3) != X86::COND_NE)
58153 return SDValue();
58154
58155 SDValue Sub = N->getOperand(4);
58156 if (Sub.getOpcode() != X86ISD::SUB)
58157 return SDValue();
58158
58159 SDValue Op1 = Sub.getOperand(1);
58160
58161 if (!X86::isZeroNode(Sub.getOperand(0)))
58162 return SDValue();
58163
58164 SDLoc DL(N);
58165 SmallVector<SDValue, 5> Ops(N->op_values());
58166 if (Op1.getOpcode() == X86ISD::SETCC) {
58167 // res, flags2 = sub 0, (setcc cc, flag)
58168 // cload/cstore ..., cond_ne, flag2
58169 // ->
58170 // cload/cstore cc, flag
58171 Ops[3] = Op1.getOperand(0);
58172 Ops[4] = Op1.getOperand(1);
58173 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58174 SDValue Src = Op1;
58175 SDValue Op10 = Op1.getOperand(0);
58176 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58177 // res, flags2 = sub 0, (and (xor X, -1), Y)
58178 // cload/cstore ..., cond_ne, flag2
58179 // ->
58180 // res, flags2 = sub 0, (and X, Y)
58181 // cload/cstore ..., cond_e, flag2
58182 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58183 Op1.getOperand(1));
58184 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58185 }
58186 // res, flags2 = sub 0, (and X, Y)
58187 // cload/cstore ..., cc, flag2
58188 // ->
58189 // res, flags2 = cmp (and X, Y), 0
58190 // cload/cstore ..., cc, flag2
58191 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58192 } else {
58193 return SDValue();
58194 }
58195
58196 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58197 cast<MemSDNode>(N)->getMemoryVT(),
58198 cast<MemSDNode>(N)->getMemOperand());
58199}
58200
58201static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58202                          TargetLowering::DAGCombinerInfo &DCI,
58203 const X86Subtarget &Subtarget) {
58204 EVT VT = N->getValueType(0);
58205 SDValue Op0 = N->getOperand(0);
58206 SDValue Op1 = N->getOperand(1);
58207 SDLoc DL(N);
58208
58209 auto IsNonOpaqueConstant = [&](SDValue Op) {
58210    return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58211 /*AllowOpaques*/ false);
58212 };
58213
58214 // X86 can't encode an immediate LHS of a sub. See if we can push the
58215 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58216 // one use and a constant, invert the immediate, saving one register.
58217 // However, ignore cases where C1 is 0, as those will become a NEG.
58218 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
58219 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58220 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58221 Op1->hasOneUse()) {
58222 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58223 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58224 SDValue NewAdd =
58225 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58226 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58227 }
58228
58229 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58230 return V;
58231
58232 // Try to synthesize horizontal subs from subs of shuffles.
58233 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58234 return V;
58235
58236 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58237 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58238 X86::isZeroNode(Op1.getOperand(1))) {
58239 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58240 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58241 Op1.getOperand(0), Op1.getOperand(2));
58242 }
58243
58244 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58245 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58246 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58247 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58248 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58249 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58250 Op1.getOperand(1), Op1.getOperand(2));
58251 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58252 }
58253
58254 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58255 return V;
58256
58257 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58258 return V;
58259
58260 return combineSubSetcc(N, DAG);
58261}
58262
58263static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58264 const X86Subtarget &Subtarget) {
58265 unsigned Opcode = N->getOpcode();
58266 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58267 "Unknown PCMP opcode");
58268
58269 SDValue LHS = N->getOperand(0);
58270 SDValue RHS = N->getOperand(1);
58271 MVT VT = N->getSimpleValueType(0);
58272 unsigned EltBits = VT.getScalarSizeInBits();
58273 unsigned NumElts = VT.getVectorNumElements();
58274 SDLoc DL(N);
58275
58276 if (LHS == RHS)
58277 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58278 : DAG.getConstant(0, DL, VT);
58279
58280 // Constant Folding.
58281 // PCMPEQ(X,UNDEF) -> UNDEF
58282 // PCMPGT(X,UNDEF) -> 0
58283 // PCMPGT(UNDEF,X) -> 0
58284 APInt LHSUndefs, RHSUndefs;
58285 SmallVector<APInt> LHSBits, RHSBits;
58286 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58287 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58288 APInt Ones = APInt::getAllOnes(EltBits);
58289 APInt Zero = APInt::getZero(EltBits);
58290 SmallVector<APInt> Results(NumElts);
58291 for (unsigned I = 0; I != NumElts; ++I) {
58292 if (Opcode == X86ISD::PCMPEQ) {
58293 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58294 } else {
58295 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58296 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58297 }
58298 }
58299 if (Opcode == X86ISD::PCMPEQ)
58300 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58301 return getConstVector(Results, VT, DAG, DL);
58302 }
58303
58304 return SDValue();
58305}
58306
58307// Helper to determine if we can convert an integer comparison to a float
58308// comparison by casting the operands.
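// For illustration: two v8i32 operands each known to have at most 24
// significant bits can be converted with sint_to_fp and compared as f32,
// since f32 carries 24 bits of precision.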
58309static std::optional<unsigned>
58310CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58311 unsigned NumSignificantBitsRHS) {
58312 MVT SVT = VT.getScalarType();
58313 assert(SVT == MVT::f32 && "Only tested for float so far");
58314 const fltSemantics &Sem = SVT.getFltSemantics();
58315 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58316 "Only PCMPEQ/PCMPGT currently supported");
58317
58318 // TODO: Handle bitcastable integers.
58319
58320 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58321 // a fp value.
58322 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58323 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58324 return ISD::SINT_TO_FP;
58325
58326 return std::nullopt;
58327}
58328
58329/// Helper that combines an array of subvector ops as if they were the operands
58330/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58331/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58332static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58333                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58334 const X86Subtarget &Subtarget,
58335 unsigned Depth) {
58336 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58337 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58338
58339 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58340 return DAG.getUNDEF(VT);
58341
58342 if (llvm::all_of(Ops, [](SDValue Op) {
58343 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58344 }))
58345 return getZeroVector(VT, Subtarget, DAG, DL);
58346
58347  if (Depth >= SelectionDAG::MaxRecursionDepth)
58348 return SDValue(); // Limit search depth.
58349
58350 SDValue Op0 = Ops[0];
58351 bool IsSplat = llvm::all_equal(Ops);
58352 unsigned NumOps = Ops.size();
58353 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58354 LLVMContext &Ctx = *DAG.getContext();
58355
58356 // Repeated subvectors.
58357 if (IsSplat &&
58358 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58359 // If this broadcast is inserted into both halves, use a larger broadcast.
58360 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58361 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58362
58363 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58364 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58365 (Subtarget.hasAVX2() ||
58366         X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
58367 VT.getScalarType(), Subtarget)))
58368 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58369 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58370 Op0.getOperand(0),
58371 DAG.getVectorIdxConstant(0, DL)));
58372
58373 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58374 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58375 (Subtarget.hasAVX2() ||
58376 (EltSizeInBits >= 32 &&
58377 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58378 Op0.getOperand(0).getValueType() == VT.getScalarType())
58379 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58380
58381 // concat_vectors(extract_subvector(splat(x)),
58382 // extract_subvector(splat(x))) -> splat(x)
58383 // concat_vectors(extract_subvector(subv_broadcast(x)),
58384 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58385 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58386 Op0.getOperand(0).getValueType() == VT) {
58387 SDValue SrcVec = Op0.getOperand(0);
58388 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58389 return SrcVec;
58390 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58391 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58392 return SrcVec;
58393 }
58394
58395 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58396 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58397 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58398 return DAG.getNode(Op0.getOpcode(), DL, VT,
58399                         DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
58400 Op0.getOperand(0), Op0.getOperand(0)),
58401 Op0.getOperand(1));
58402 }
58403
58404 // TODO: This should go in combineX86ShufflesRecursively eventually.
58405 if (NumOps == 2) {
58406 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58407 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58408 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58409        Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
58410 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58411 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58412 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58413 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58414 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58415 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58416 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58417        // Only concat subvector high halves, which vperm2x128 is best at, or
58418        // cases where it should fold into a subvector broadcast.
58419 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58420 SrcVT1.is256BitVector()) {
58421 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58422 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58423 "Bad subvector index");
58424 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58425 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
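          // vperm2x128 immediate: bits[1:0] choose the 128-bit half for the
          // low result (0 = LHS low, 1 = LHS high) and bits[5:4] the half for
          // the high result (2 = RHS low, 3 = RHS high).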
58426 unsigned Index = 0;
58427 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58428 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58429 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58430 DAG.getBitcast(VT, Src0.getOperand(0)),
58431 DAG.getBitcast(VT, Src1.getOperand(0)),
58432 DAG.getTargetConstant(Index, DL, MVT::i8));
58433 }
58434 }
58435 // Widen extract_subvector
58436 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58437 // --> extract_subvector(x,lo)
58438 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58439 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58440 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58441 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58442 return DAG.getBitcast(VT,
58443                              extractSubVector(Src0.getOperand(0),
58444 Src0.getConstantOperandVal(1),
58445 DAG, DL, VT.getSizeInBits()));
58446 }
58447 }
58448 }
58449
58450 // Repeated opcode.
58451 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58452 // but it currently struggles with different vector widths.
58453 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58454 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58455 })) {
58456 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58457      SmallVector<SDValue> Subs;
58458 for (SDValue SubOp : SubOps)
58459 Subs.push_back(SubOp.getOperand(I));
58460 // Attempt to peek through bitcasts and concat the original subvectors.
58461 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58462 if (SubVT.isSimple() && SubVT.isVector()) {
58463 MVT ConcatVT =
58464 MVT::getVectorVT(SubVT.getSimpleVT().getScalarType(),
58465 SubVT.getVectorElementCount() * Subs.size());
58466 for (SDValue &Sub : Subs)
58467 Sub = DAG.getBitcast(SubVT, Sub);
58468 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58469 Subtarget, Depth + 1))
58470 return DAG.getBitcast(VT, ConcatSrc);
58471 return DAG.getBitcast(
58472 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58473 }
58474 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58475 };
58476 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58477 bool AllConstants = true;
58478 bool AllSubs = true;
58479 unsigned VecSize = VT.getSizeInBits();
58480 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58481 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58482 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58483 }))
58484 return true;
58485 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58486 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58487 unsigned SubSize = BC.getValueSizeInBits();
58488 unsigned EltSize = BC.getScalarValueSizeInBits();
58489 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58490 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58491 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58492 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58493 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58494 }
58495 return AllConstants || AllSubs;
58496 };
58497 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58498 bool AllConstants = true;
58499 SmallVector<SDValue> Subs;
58500 for (SDValue SubOp : SubOps) {
58501 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58502 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58503 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58504 Subs.push_back(SubOp.getOperand(I));
58505 }
58506 if (AllConstants)
58507 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58508 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58509 };
58510
58511 unsigned Opcode = Op0.getOpcode();
58512 switch (Opcode) {
58513 case ISD::BITCAST: {
58514 // TODO: Support AVX1/AVX2 bitcasts.
58515 SmallVector<SDValue, 4> SubOps;
58516 for (SDValue SubOp : Ops)
58517 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58518 EVT InnerVT = SubOps[0].getValueType();
58519 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58520 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58521 (Subtarget.hasBWI() ||
58522 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58523 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58524 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58525 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58526 return Op.getValueType() == InnerVT;
58527 })) {
58528 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58529 MVT ConcatVT = MVT::getVectorVT(
58530 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58531 if (SDValue ConcatSrc = combineConcatVectorOps(
58532 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58533 return DAG.getBitcast(VT, ConcatSrc);
58534 }
58535 break;
58536 }
58537 case ISD::VECTOR_SHUFFLE: {
58538 // TODO: Generalize NumOps support.
58539 if (!IsSplat && NumOps == 2 &&
58540 ((VT.is256BitVector() &&
58541 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58542 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58543 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58544 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58545 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58546 if (Concat0 || Concat1 ||
58547 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58548 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58549 Subtarget.hasVBMI())) {
58550 int NumSubElts = Op0.getValueType().getVectorNumElements();
58551 SmallVector<int> NewMask;
58552 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58553 M = M >= NumSubElts ? M + NumSubElts : M;
58554 NewMask.push_back(M);
58555 }
58556 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58557 if (0 <= M)
58558 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58559 NewMask.push_back(M);
58560 }
58561 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58562 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58563 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58564 }
58565 }
58566 break;
58567 }
58568 case X86ISD::VBROADCAST: {
58569 // TODO: 512-bit VBROADCAST concatenation.
58570 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58571 return Op.getOperand(0).getValueType().is128BitVector();
58572 })) {
58573 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58574 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58575 ConcatSubOperand(VT, Ops, 0),
58576 ConcatSubOperand(VT, Ops, 0));
58577 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58578 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58579 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58580 : X86ISD::PSHUFD,
58581 DL, VT, ConcatSubOperand(VT, Ops, 0),
58582 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58583 }
58584 break;
58585 }
58586 case X86ISD::MOVDDUP:
58587 case X86ISD::MOVSHDUP:
58588 case X86ISD::MOVSLDUP: {
58589 if (!IsSplat && (VT.is256BitVector() ||
58590 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58591 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58592 break;
58593 }
58594 case X86ISD::SHUFP: {
58595 if (!IsSplat &&
58596 (VT == MVT::v8f32 ||
58597 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58598 llvm::all_of(Ops, [Op0](SDValue Op) {
58599 return Op.getOperand(2) == Op0.getOperand(2);
58600 })) {
58601 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58602 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58603 if (Concat0 || Concat1)
58604 return DAG.getNode(Opcode, DL, VT,
58605 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58606 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58607 Op0.getOperand(2));
58608 }
58609 break;
58610 }
58611 case X86ISD::UNPCKH:
58612 case X86ISD::UNPCKL: {
58613 // TODO: UNPCK should use CombineSubOperand
58614 // Don't concatenate build_vector patterns.
58615 if (!IsSplat &&
58616 ((VT.is256BitVector() &&
58617 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58618 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58619 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58620 none_of(Ops, [](SDValue Op) {
58621 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58622 ISD::BUILD_VECTOR ||
58623 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58624 ISD::BUILD_VECTOR;
58625 })) {
58626 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58627 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58628 if (Concat0 || Concat1 ||
58629 (Subtarget.hasInt256() && EltSizeInBits == 64))
58630 return DAG.getNode(Opcode, DL, VT,
58631 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58632 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58633 }
58634 break;
58635 }
58636 case X86ISD::PSHUFHW:
58637 case X86ISD::PSHUFLW:
58638 case X86ISD::PSHUFD:
58639 if (!IsSplat &&
58640 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58641 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58642 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58643 llvm::all_of(Ops, [Op0](SDValue Op) {
58644 return Op.getOperand(1) == Op0.getOperand(1);
58645 })) {
58646 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58647 Op0.getOperand(1));
58648 }
58649 [[fallthrough]];
58650 case X86ISD::VPERMILPI:
58651 if (!IsSplat && EltSizeInBits == 32 &&
58652 (VT.is256BitVector() ||
58653 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58654 all_of(Ops, [&Op0](SDValue Op) {
58655 return Op0.getOperand(1) == Op.getOperand(1);
58656 })) {
58657 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58658 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58659 Res =
58660 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58661 return DAG.getBitcast(VT, Res);
58662 }
58663 break;
58664 case X86ISD::VPERMILPV:
58665 if (!IsSplat && (VT.is256BitVector() ||
58666 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58667 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58668 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58669 if (Concat0 || Concat1)
58670 return DAG.getNode(Opcode, DL, VT,
58671 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58672 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58673 }
58674 break;
58675 case X86ISD::PSHUFB:
58676 case X86ISD::PSADBW:
58677 case X86ISD::VPMADDUBSW:
58678 case X86ISD::VPMADDWD:
58679 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58680 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58681 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58682 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58683 NumOps * SrcVT.getVectorNumElements());
58684 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58685 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58686 if (Concat0 || Concat1)
58687 return DAG.getNode(
58688 Opcode, DL, VT,
58689 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58690 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58691 }
58692 break;
58693 case X86ISD::VPERMV:
58694 // TODO: Handle 256-bit and NumOps == 4 cases.
58695 if (!IsSplat && NumOps == 2 &&
58696 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58697 MVT OpVT = Op0.getSimpleValueType();
58698 int NumSrcElts = OpVT.getVectorNumElements();
58699 SmallVector<int, 64> ConcatMask;
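// Merge the per-op VPERMV shuffle masks into a single wide mask, offsetting
// the second op's indices by the source element count.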
58700 for (unsigned i = 0; i != NumOps; ++i) {
58701 SmallVector<int, 64> SubMask;
58702 SmallVector<SDValue, 2> SubOps;
58703 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58704 break;
58705 for (int M : SubMask) {
58706 if (0 <= M)
58707 M += i * NumSrcElts;
58708 ConcatMask.push_back(M);
58709 }
58710 }
58711 if (ConcatMask.size() == (NumOps * NumSrcElts))
58712 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58713 ConcatSubOperand(VT, Ops, 1),
58714 DAG.getUNDEF(VT), Subtarget, DAG);
58715 }
58716 break;
58717 case X86ISD::VPERMV3:
58718 // TODO: Handle 256-bit and NumOps == 4 cases.
58719 if (!IsSplat && NumOps == 2 &&
58720 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58721 MVT OpVT = Op0.getSimpleValueType();
58722 int NumSrcElts = OpVT.getVectorNumElements();
58723 SmallVector<int, 64> ConcatMask;
58724 for (unsigned i = 0; i != NumOps; ++i) {
58725 SmallVector<int, 64> SubMask;
58726 SmallVector<SDValue, 2> SubOps;
58727 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58728 break;
58729 for (int M : SubMask) {
58730 if (0 <= M) {
58731 int Src = M < NumSrcElts ? 0 : 2;
58732 M += M < NumSrcElts ? 0 : NumSrcElts;
58733
58734 // Reference the lowest sub if the upper sub is the same.
58735 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58736 M += i * NumSrcElts;
58737 }
58738 ConcatMask.push_back(M);
58739 }
58740 }
58741 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58742 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58743 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58744 if (Concat0 || Concat1)
58745 return lowerShuffleWithPERMV(
58746 DL, VT, ConcatMask,
58747 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58748 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58749 DAG);
58750 }
58751 }
58752 break;
58753 case X86ISD::VPERM2X128: {
58754 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58755 assert(NumOps == 2 && "Bad concat_vectors operands");
58756 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58757 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58758 // TODO: Handle zero'd subvectors.
58759 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
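// Bits 3 and 7 of a VPERM2X128 immediate zero the corresponding result lane,
// which SHUF128 cannot express, so only non-zeroing immediates are handled.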
58760 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58761 (int)((Imm1 >> 4) & 0x3)};
58762 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58763 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58764 Ops[0].getOperand(1), DAG, DL);
58765 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58766 Ops[1].getOperand(1), DAG, DL);
58767 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58768 DAG.getBitcast(ShuffleVT, LHS),
58769 DAG.getBitcast(ShuffleVT, RHS),
58770 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58771 return DAG.getBitcast(VT, Res);
58772 }
58773 }
58774 break;
58775 }
58776 case X86ISD::SHUF128: {
58777 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58778 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58779 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58780 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58781 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58782 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58783 Ops[0].getOperand(1), DAG, DL);
58784 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58785 Ops[1].getOperand(1), DAG, DL);
58786 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58787 DAG.getTargetConstant(Imm, DL, MVT::i8));
58788 }
58789 break;
58790 }
58791 case ISD::TRUNCATE:
58792 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58793 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58794 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58795 SrcVT == Ops[1].getOperand(0).getValueType() &&
58796 Subtarget.useAVX512Regs() &&
58797 Subtarget.getPreferVectorWidth() >= 512 &&
58798 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58799 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58800 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58801 ConcatSubOperand(NewSrcVT, Ops, 0));
58802 }
58803 }
58804 break;
58805 case ISD::ANY_EXTEND:
58806 case ISD::SIGN_EXTEND:
58807 case ISD::ZERO_EXTEND:
58808 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58809 if (!IsSplat && NumOps == 2 &&
58810 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58811 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58812 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58813 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58814 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58815 SrcVT == Ops[1].getOperand(0).getValueType()) {
58816 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58817 return DAG.getNode(Opcode, DL, VT,
58818 ConcatSubOperand(NewSrcVT, Ops, 0));
58819 }
58820 }
58821 break;
58822 case ISD::ANY_EXTEND_VECTOR_INREG:
58823 case ISD::SIGN_EXTEND_VECTOR_INREG:
58824 case ISD::ZERO_EXTEND_VECTOR_INREG: {
58825 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58826 if (!IsSplat && NumOps == 2 &&
58827 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58828 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58829 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58830 Op0.getOperand(0).getValueType().is128BitVector() &&
58831 Op0.getOperand(0).getValueType() ==
58832 Ops[0].getOperand(0).getValueType()) {
58833 EVT SrcVT = Op0.getOperand(0).getValueType();
58834 unsigned NumElts = VT.getVectorNumElements();
58835 MVT UnpackSVT =
58836 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58837 MVT UnpackVT =
58838 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58839 SDValue Unpack =
58840 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58841 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58842 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58843 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58844 DAG.getBitcast(SrcVT, Unpack), DAG);
58845 }
58846 break;
58847 }
58848 case X86ISD::VSHLI:
58849 case X86ISD::VSRLI:
58850 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58851 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58852 llvm::all_of(Ops, [](SDValue Op) {
58853 return Op.getConstantOperandAPInt(1) == 32;
58854 })) {
58855 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
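// A v2i64 shift left/right by 32 just moves one i32 half and zeroes the
// other, so it can be rebuilt as a v8i32 shuffle with a zero vector.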
58856 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58857 Res = DAG.getBitcast(MVT::v8i32, Res);
58858 if (Opcode == X86ISD::VSHLI) {
58859 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58860 {8, 0, 8, 2, 8, 4, 8, 6});
58861 } else {
58862 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58863 {1, 8, 3, 8, 5, 8, 7, 8});
58864 }
58865 return DAG.getBitcast(VT, Res);
58866 }
58867 }
58868 [[fallthrough]];
58869 case X86ISD::VSRAI:
58870 case X86ISD::VSHL:
58871 case X86ISD::VSRL:
58872 case X86ISD::VSRA:
58873 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
58874 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58875 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58876 llvm::all_of(Ops, [Op0](SDValue Op) {
58877 return Op0.getOperand(1) == Op.getOperand(1);
58878 })) {
58879 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58880 Op0.getOperand(1));
58881 }
58882 break;
58883 case X86ISD::VPERMI:
58884 case X86ISD::VROTLI:
58885 case X86ISD::VROTRI:
58886 if (!IsSplat &&
58887 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58888 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58889 llvm::all_of(Ops, [Op0](SDValue Op) {
58890 return Op0.getOperand(1) == Op.getOperand(1);
58891 })) {
58892 assert(!(Opcode == X86ISD::VPERMI &&
58893 Op0.getValueType().is128BitVector()) &&
58894 "Illegal 128-bit X86ISD::VPERMI nodes");
58895 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58896 Op0.getOperand(1));
58897 }
58898 break;
58899 case ISD::AND:
58900 case ISD::OR:
58901 case ISD::XOR:
58902 case X86ISD::ANDNP:
58903 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
58904 if (!IsSplat && (VT.is256BitVector() ||
58905 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58906 // Don't concatenate root AVX1 NOT patterns.
58907 // TODO: Allow NOT folding if Concat0 succeeds.
58908 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
58909 llvm::all_of(Ops, [](SDValue X) {
58910 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
58911 }))
58912 break;
58913 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58914 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58915 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
58916 return DAG.getNode(Opcode, DL, VT,
58917 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58918 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58919 }
58920 break;
58921 case X86ISD::PCMPEQ:
58922 case X86ISD::PCMPGT:
58923 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
58924 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
58925 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58926 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58927 if (Concat0 || Concat1)
58928 return DAG.getNode(Opcode, DL, VT,
58929 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58930 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58931 break;
58932 }
58933
58934 if (!IsSplat && VT == MVT::v8i32) {
58935 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
58936 // TODO: Handle v4f64 as well?
58937 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
58938 for (unsigned I = 0; I != NumOps; ++I) {
58939 MaxSigBitsLHS =
58940 std::max(MaxSigBitsLHS,
58941 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
58942 MaxSigBitsRHS =
58943 std::max(MaxSigBitsRHS,
58944 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
58945 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
58946 break;
58947 }
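// The integer compare can only be rewritten as a float compare if every
// input converts losslessly to the FP type; the significant-bit counts
// gathered above let CastIntSETCCtoFP verify that.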
58948
58949 ISD::CondCode ICC =
58950 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
58951 ISD::CondCode FCC =
58952 Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
58953
58954 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
58955 MVT FpVT = VT.changeVectorElementType(FpSVT);
58956
58957 if (std::optional<unsigned> CastOpc =
58958 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
58959 SDValue LHS = CombineSubOperand(VT, Ops, 0);
58960 SDValue RHS = CombineSubOperand(VT, Ops, 1);
58961 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
58962 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
58963 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
58964 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
58965
58966 bool IsAlwaysSignaling;
58967 unsigned FSETCC =
58968 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
58969 return DAG.getBitcast(
58970 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
58971 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
58972 }
58973 }
58974 break;
58975 case ISD::CTPOP:
58976 case ISD::CTTZ:
58977 case ISD::CTLZ:
58978 case ISD::CTTZ_ZERO_UNDEF:
58979 case ISD::CTLZ_ZERO_UNDEF:
58980 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58981 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58982 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58983 }
58984 break;
58985 case X86ISD::GF2P8AFFINEQB:
58986 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
58987 if (!IsSplat &&
58988 (VT.is256BitVector() ||
58989 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58990 llvm::all_of(Ops, [Op0](SDValue Op) {
58991 return Op0.getOperand(2) == Op.getOperand(2);
58992 })) {
58993 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58994 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
58995 }
58996 break;
58997 case ISD::ADD:
58998 case ISD::SUB:
58999 case ISD::MUL:
59000 // TODO: Add more integer binops?
59001 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59002 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59003 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59004 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59005 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59006 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59007 return Op.getOperand(0) == Op.getOperand(1);
59008 }))
59009 return DAG.getNode(Opcode, DL, VT,
59010 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59011 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59012 }
59013 break;
59014 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
59015 // their latencies are short, we don't replace them here unless doing so
59016 // won't introduce extra VINSERTs.
59017 case ISD::FADD:
59018 case ISD::FSUB:
59019 case ISD::FMUL:
59020 if (!IsSplat && (VT.is256BitVector() ||
59021 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59022 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59023 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59024 if (Concat0 || Concat1)
59025 return DAG.getNode(Opcode, DL, VT,
59026 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59027 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59028 }
59029 break;
59030 // Always prefer to concatenate high latency FDIV instructions.
59031 case ISD::FDIV:
59032 if (!IsSplat && (VT.is256BitVector() ||
59033 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59034 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59035 ConcatSubOperand(VT, Ops, 1));
59036 }
59037 break;
59038 case X86ISD::HADD:
59039 case X86ISD::HSUB:
59040 case X86ISD::FHADD:
59041 case X86ISD::FHSUB:
59042 if (!IsSplat && VT.is256BitVector() &&
59043 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59044 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59045 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59046 if (Concat0 || Concat1)
59047 return DAG.getNode(Opcode, DL, VT,
59048 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59049 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59050 }
59051 break;
59052 case X86ISD::PACKSS:
59053 case X86ISD::PACKUS:
59054 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59055 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59056 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59057 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59058 NumOps * SrcVT.getVectorNumElements());
59059 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59060 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59061 if (Concat0 || Concat1)
59062 return DAG.getNode(
59063 Opcode, DL, VT,
59064 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59065 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59066 }
59067 break;
59068 case X86ISD::VSHLD:
59069 case X86ISD::VSHRD:
59070 case X86ISD::PALIGNR:
59071 if (!IsSplat &&
59072 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59073 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59074 llvm::all_of(Ops, [Op0](SDValue Op) {
59075 return Op0.getOperand(2) == Op.getOperand(2);
59076 })) {
59077 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59078 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59079 if (Concat0 || Concat1)
59080 return DAG.getNode(Opcode, DL, VT,
59081 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59082 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59083 Op0.getOperand(2));
59084 }
59085 break;
59086 case X86ISD::BLENDI:
59087 if (VT.is256BitVector() && NumOps == 2 &&
59088 (EltSizeInBits >= 32 ||
59089 (Subtarget.hasInt256() &&
59090 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59091 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59092 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59093 if (Concat0 || Concat1) {
59094 unsigned NumElts = VT.getVectorNumElements();
59095 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59096 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59097 Mask = Mask.zextOrTrunc(8);
59098 return DAG.getNode(Opcode, DL, VT,
59099 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59100 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59101 DAG.getTargetConstant(Mask, DL, MVT::i8));
59102 }
59103 }
59104 // TODO: BWI targets should only use CombineSubOperand.
59105 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59106 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59107 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59108 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59109 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59110 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59111 unsigned NumElts = VT.getVectorNumElements();
59112 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59113 for (unsigned I = 1; I != NumOps; ++I)
59114 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59115 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59116 Mask = Mask.zextOrTrunc(NumMaskBits);
59117 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59118 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
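// Rebuild the combined blend as a predicated select: the concatenated
// immediates become a vXi1 constant mask that AVX512 consumes directly.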
59119 SDValue Sel =
59120 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59121 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59122 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59123 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59124 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59125 }
59126 }
59127 break;
59128 case ISD::VSELECT:
59129 // TODO: VSELECT should use CombineSubOperand.
59130 if (!IsSplat && Subtarget.hasAVX512() &&
59131 (VT.is256BitVector() ||
59132 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59133 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59134 EVT SelVT = Ops[0].getOperand(0).getValueType();
59135 if (SelVT.getVectorElementType() == MVT::i1) {
59136 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59137 NumOps * SelVT.getVectorNumElements());
59138 if (TLI.isTypeLegal(SelVT))
59139 return DAG.getNode(
59140 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59141 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59142 }
59143 }
59144 [[fallthrough]];
59145 case X86ISD::BLENDV:
59146 // TODO: BLENDV should use CombineSubOperand.
59147 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59148 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59149 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59150 EVT SelVT = Ops[0].getOperand(0).getValueType();
59151 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59152 if (TLI.isTypeLegal(SelVT))
59153 return DAG.getNode(
59154 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59155 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59156 }
59157 break;
59158 }
59159 }
59160
59161 // Fold subvector loads into one.
59162 // If needed, look through bitcasts to get to the load.
59163 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59164 unsigned Fast;
59165 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59166 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59167 *FirstLd->getMemOperand(), &Fast) &&
59168 Fast) {
59169 if (SDValue Ld =
59170 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59171 return Ld;
59172 }
59173 }
59174
59175 // Attempt to fold target constant loads.
59176 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59177 SmallVector<APInt> EltBits;
59178 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59179 for (unsigned I = 0; I != NumOps; ++I) {
59180 APInt OpUndefElts;
59181 SmallVector<APInt> OpEltBits;
59182 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59183 OpEltBits, /*AllowWholeUndefs*/ true,
59184 /*AllowPartialUndefs*/ false))
59185 break;
59186 EltBits.append(OpEltBits);
59187 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59188 }
59189 if (EltBits.size() == VT.getVectorNumElements()) {
59190 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59191 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59192 SDValue CV = DAG.getConstantPool(C, PVT);
59193 MachinePointerInfo MPI =
59194 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
59195 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59196 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59197 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
59198 return Ld;
59199 }
59200 }
59201
59202 // If this simple subvector or scalar/subvector broadcast_load is inserted
59203 // into both halves, use a larger broadcast_load. Update other uses to use
59204 // an extracted subvector.
59205 if (IsSplat &&
59206 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59207 if (ISD::isNormalLoad(Op0.getNode()) ||
59208 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
59209 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59210 auto *Mem = cast<MemSDNode>(Op0);
59211 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59212 ? X86ISD::VBROADCAST_LOAD
59213 : X86ISD::SUBV_BROADCAST_LOAD;
59214 if (SDValue BcastLd =
59215 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59216 SDValue BcastSrc =
59217 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59218 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59219 return BcastLd;
59220 }
59221 }
59222 }
59223
59224 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59225 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59226 Subtarget.useAVX512Regs()) {
59227 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59228 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59229 Res = DAG.getBitcast(ShuffleVT, Res);
59230 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59231 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59232 return DAG.getBitcast(VT, Res);
59233 }
59234
59235 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59236 if (!IsSplat &&
59237 ((NumOps == 2 && VT == MVT::v4f64) ||
59238 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59239 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59240 // Collect the individual per-lane v2f64/v4f64 shuffles.
59241 MVT OpVT = Ops[0].getSimpleValueType();
59242 unsigned NumOpElts = OpVT.getVectorNumElements();
59243 SmallVector<SmallVector<SDValue, 2>, 4> SrcOps(NumOps);
59244 SmallVector<SmallVector<int, 8>, 4> SrcMasks(NumOps);
59245 if (all_of(seq<int>(NumOps), [&](int I) {
59246 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59247 Depth + 1) &&
59248 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59249 none_of(SrcMasks[I], isUndefOrZero) &&
59250 SrcMasks[I].size() == NumOpElts &&
59251 all_of(SrcOps[I], [&OpVT](SDValue V) {
59252 return V.getValueType() == OpVT;
59253 });
59254 })) {
59255 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59256 bool Unary = true;
59257 unsigned SHUFPDMask = 0;
59258 SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
59259 for (unsigned I = 0; I != NumOps; ++I) {
59260 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59261 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59262 Unary &= LHS[I] == RHS[I];
59263 for (unsigned J = 0; J != NumOpElts; ++J)
59264 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59265 }
59266 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59267 // PERMILPD mask and we can always profitably concatenate them.
59268 SDValue Concat0 =
59269 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59270 SDValue Concat1 =
59271 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59272 if (Unary || Concat0 || Concat1) {
59273 Concat0 =
59274 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59275 Concat1 =
59276 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59277 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59278 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59279 }
59280 }
59281 }
59282
59283 return SDValue();
59284}
59285
59286 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59287 TargetLowering::DAGCombinerInfo &DCI,
59288 const X86Subtarget &Subtarget) {
59289 EVT VT = N->getValueType(0);
59290 EVT SrcVT = N->getOperand(0).getValueType();
59291 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59292 SmallVector<SDValue, 4> Ops(N->ops());
59293
59294 if (VT.getVectorElementType() == MVT::i1) {
59295 // Attempt to constant fold.
59296 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59297 APInt Constant = APInt::getZero(VT.getSizeInBits());
59298 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59299 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
59300 if (!C) break;
59301 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59302 if (I == (E - 1)) {
59303 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59304 if (TLI.isTypeLegal(IntVT))
59305 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59306 }
59307 }
59308
59309 // Don't do anything else for i1 vectors.
59310 return SDValue();
59311 }
59312
59313 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59314 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59315 Subtarget))
59316 return R;
59317 }
59318
59319 return SDValue();
59320}
59321
59322 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59323 TargetLowering::DAGCombinerInfo &DCI,
59324 const X86Subtarget &Subtarget) {
59325 if (DCI.isBeforeLegalizeOps())
59326 return SDValue();
59327
59328 MVT OpVT = N->getSimpleValueType(0);
59329
59330 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59331
59332 SDLoc dl(N);
59333 SDValue Vec = N->getOperand(0);
59334 SDValue SubVec = N->getOperand(1);
59335
59336 uint64_t IdxVal = N->getConstantOperandVal(2);
59337 MVT SubVecVT = SubVec.getSimpleValueType();
59338 int VecNumElts = OpVT.getVectorNumElements();
59339 int SubVecNumElts = SubVecVT.getVectorNumElements();
59340
59341 if (Vec.isUndef() && SubVec.isUndef())
59342 return DAG.getUNDEF(OpVT);
59343
59344 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59345 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59346 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59347 return getZeroVector(OpVT, Subtarget, DAG, dl);
59348
59349 if (Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) {
59350 // If we're inserting into a zero vector and then into a larger zero vector,
59351 // just insert into the larger zero vector directly.
59352 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59353 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
59354 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59355 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59356 getZeroVector(OpVT, Subtarget, DAG, dl),
59357 SubVec.getOperand(1),
59358 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59359 }
59360
59361 // If we're inserting into a zero vector and our input was extracted from an
59362 // insert into a zero vector of the same type and the extraction was at
59363 // least as large as the original insertion. Just insert the original
59364 // subvector into a zero vector.
59365 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59366 isNullConstant(SubVec.getOperand(1)) &&
59367 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
59368 SDValue Ins = SubVec.getOperand(0);
59369 if (isNullConstant(Ins.getOperand(2)) &&
59370 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59371 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59372 SubVecVT.getFixedSizeInBits())
59373 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59374 getZeroVector(OpVT, Subtarget, DAG, dl),
59375 Ins.getOperand(1), N->getOperand(2));
59376 }
59377 }
59378
59379 // Stop here if this is an i1 vector.
59380 if (IsI1Vector)
59381 return SDValue();
59382
59383 // Eliminate an intermediate vector widening:
59384 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59385 // insert_subvector X, Y, Idx
59386 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59387 // there?
59388 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59389 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59390 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59391 SubVec.getOperand(1), N->getOperand(2));
59392
59393 // If this is an insert of an extract, combine to a shuffle. Don't do this
59394 // if the insert or extract can be represented with a subregister operation.
59395 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59396 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59397 (IdxVal != 0 ||
59398 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59399 SDValue ExtSrc = SubVec.getOperand(0);
59400 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59401 // Create a shuffle mask matching the extraction and insertion.
59402 SmallVector<int, 64> Mask(VecNumElts);
59403 std::iota(Mask.begin(), Mask.end(), 0);
59404 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59405 ExtIdxVal + VecNumElts);
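// Mask is now an identity permutation of Vec with the inserted span taken
// from ExtSrc starting at ExtIdxVal.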
59406 if (ExtIdxVal != 0)
59407 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59408 // See if we can use a blend instead of extract/insert pair.
59409 SmallVector<int, 64> BlendMask(VecNumElts);
59410 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59411 std::iota(BlendMask.begin() + IdxVal,
59412 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59413 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59414 VecNumElts == (2 * SubVecNumElts)) {
59415 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59416 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59417 SDValue Blend = DAG.getNode(
59418 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59419 DAG.getBitcast(MVT::v8f32, ExtSrc),
59420 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59421 return DAG.getBitcast(OpVT, Blend);
59422 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59423 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59424 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59425 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59426 SDValue Shuffle =
59427 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59428 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59429 return DAG.getBitcast(OpVT, Shuffle);
59430 }
59431 }
59432 }
59433
59434 // Match concat_vector style patterns.
59435 SmallVector<SDValue, 2> SubVectorOps;
59436 if (collectConcatOps(N, SubVectorOps, DAG)) {
59437 if (SDValue Fold =
59438 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59439 return Fold;
59440
59441 // If we're inserting all zeros into the upper half, change this to
59442 // a concat with zero. We will match this to a move
59443 // with implicit upper bit zeroing during isel.
59444 // We do this here because we don't want combineConcatVectorOps to
59445 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59446 if (SubVectorOps.size() == 2 &&
59447 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59448 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59449 getZeroVector(OpVT, Subtarget, DAG, dl),
59450 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59451
59452 // Attempt to recursively combine to a shuffle.
59453 if (all_of(SubVectorOps, [](SDValue SubOp) {
59455 })) {
59456 SDValue Op(N, 0);
59457 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59458 return Res;
59459 }
59460 }
59461
59462 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59463 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59464 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59465
59466 // If this is a broadcast load inserted into an upper undef, use a larger
59467 // broadcast load.
59468 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59469 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59470 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59471 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59472 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59473 }
59474
59475 // If we're splatting the lower half subvector of a full vector load into the
59476 // upper half, attempt to create a subvector broadcast.
59477 if ((int)IdxVal == (VecNumElts / 2) &&
59478 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59479 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59480 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59481 if (VecLd && SubLd &&
59482 DAG.areNonVolatileConsecutiveLoads(
59483 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59484 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59485 SubVecVT, SubLd, 0, DAG);
59486 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59487 BcastLd, DAG.getVectorIdxConstant(0, dl));
59488 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59489 return BcastLd;
59490 }
59491 }
59492
59493 // Attempt to constant fold (if we're not widening).
59494 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59495 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59496 APInt VecUndefElts, SubUndefElts;
59497 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59498 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59499 VecEltBits) &&
59500 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59501 SubEltBits)) {
59502 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59503 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59504 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59505 }
59506 }
59507
59508 // Attempt to recursively combine to a shuffle.
59511 SDValue Op(N, 0);
59512 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59513 return Res;
59514 }
59515
59516 // Match insertion of subvector load that perfectly aliases a base load.
59517 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59518 ISD::isNormalLoad(SubVec.getNode()) &&
59519 DAG.areNonVolatileConsecutiveLoads(
59520 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59521 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59522 return Vec;
59523
59524 return SDValue();
59525}
59526
59527/// If we are extracting a subvector of a vector select and the select condition
59528/// is composed of concatenated vectors, try to narrow the select width. This
59529/// is a common pattern for AVX1 integer code because 256-bit selects may be
59530/// legal, but there is almost no integer math/logic available for 256-bit.
59531/// This function should only be called with legal types (otherwise, the calls
59532/// to get simple value types will assert).
59533 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59534 SelectionDAG &DAG) {
59535 SDValue Sel = Ext->getOperand(0);
59536 if (Sel.getOpcode() != ISD::VSELECT ||
59537 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59538 return SDValue();
59539
59540 // Note: We assume simple value types because this should only be called with
59541 // legal operations/types.
59542 // TODO: This can be extended to handle extraction to 256-bits.
59543 MVT VT = Ext->getSimpleValueType(0);
59544 if (!VT.is128BitVector())
59545 return SDValue();
59546
59547 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59548 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59549 return SDValue();
59550
59551 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59552 MVT SelVT = Sel.getSimpleValueType();
59553 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59554 "Unexpected vector type with legal operations");
59555
59556 unsigned SelElts = SelVT.getVectorNumElements();
59557 unsigned CastedElts = WideVT.getVectorNumElements();
59558 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59559 if (SelElts % CastedElts == 0) {
59560 // The select has the same or more (narrower) elements than the extract
59561 // operand. The extraction index gets scaled by that factor.
59562 ExtIdx *= (SelElts / CastedElts);
59563 } else if (CastedElts % SelElts == 0) {
59564 // The select has less (wider) elements than the extract operand. Make sure
59565 // that the extraction index can be divided evenly.
59566 unsigned IndexDivisor = CastedElts / SelElts;
59567 if (ExtIdx % IndexDivisor != 0)
59568 return SDValue();
59569 ExtIdx /= IndexDivisor;
59570 } else {
59571 llvm_unreachable("Element count of simple vector types are not divisible?");
59572 }
59573
59574 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59575 unsigned NarrowElts = SelElts / NarrowingFactor;
59576 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59577 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59578 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59579 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59580 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59581 return DAG.getBitcast(VT, NarrowSel);
59582}
59583
59584 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59585 TargetLowering::DAGCombinerInfo &DCI,
59586 const X86Subtarget &Subtarget) {
59587 if (!N->getValueType(0).isSimple())
59588 return SDValue();
59589
59590 MVT VT = N->getSimpleValueType(0);
59591 SDValue InVec = N->getOperand(0);
59592 unsigned IdxVal = N->getConstantOperandVal(1);
59593 EVT InVecVT = InVec.getValueType();
59594 unsigned SizeInBits = VT.getSizeInBits();
59595 unsigned InSizeInBits = InVecVT.getSizeInBits();
59596 unsigned NumSubElts = VT.getVectorNumElements();
59597 unsigned NumInElts = InVecVT.getVectorNumElements();
59598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59599 SDLoc DL(N);
59600
59601 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59602 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59603 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59604 // We let generic combining take over from there to simplify the
59605 // insert/extract and 'not'.
59606 // This pattern emerges during AVX1 legalization. We handle it before lowering
59607 // to avoid complications like splitting constant vector loads.
59608 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59609 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59610 auto isConcatenatedNot = [](SDValue V) {
59611 V = peekThroughBitcasts(V);
59612 if (!isBitwiseNot(V))
59613 return false;
59614 SDValue NotOp = V->getOperand(0);
59615 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59616 };
59617 if (isConcatenatedNot(InVec.getOperand(0)) ||
59618 isConcatenatedNot(InVec.getOperand(1))) {
59619 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59620 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59621 splitVectorIntBinary(InVec, DAG, DL),
59622 N->getOperand(1));
59623 }
59624 }
59625
59626 if (DCI.isBeforeLegalizeOps())
59627 return SDValue();
59628
59629 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59630 return V;
59631
59632 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59633 return getZeroVector(VT, Subtarget, DAG, DL);
59634
59635 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59636 if (VT.getScalarType() == MVT::i1)
59637 return DAG.getConstant(1, DL, VT);
59638 return getOnesVector(VT, DAG, DL);
59639 }
59640
59641 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59642 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59643
59644 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
59645 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59646 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59647 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59648 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59649 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59650 }
59651
59652 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59653 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59654 // iff SUB is entirely contained in the extraction.
59655 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59656 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59657 SDValue Src = InVec.getOperand(0);
59658 SDValue Sub = InVec.getOperand(1);
59659 EVT SubVT = Sub.getValueType();
59660 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59661 if (IdxVal <= InsIdx &&
59662 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59663 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59664 DAG.getVectorIdxConstant(IdxVal, DL));
59665 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59666 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59667 }
59668 }
59669
59670 // If we're extracting an upper subvector, see if we'd get the same elements
59671 // by extracting the lowest subvector instead, which should allow
59672 // SimplifyDemandedVectorElts to do more simplifications.
59673 if (IdxVal != 0) {
59674 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59675 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59676 });
59677 if (AllEquiv)
59678 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59679 }
59680
59681 // Check if we're extracting a whole broadcasted subvector.
59682 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59683 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59684 EVT MemVT = MemIntr->getMemoryVT();
59685 if (MemVT == VT) {
59686 // If this is the only use, we can replace with a regular load (this may
59687 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59688 // memory chain).
59689 if (InVec.hasOneUse()) {
59690 SDValue Ld =
59691 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59692 MemIntr->getMemOperand());
59693 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59694 return Ld;
59695 }
59696 }
59697 }
59698
59699 // Attempt to extract from the source of a shuffle vector.
59700 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59701 SmallVector<int, 32> ShuffleMask;
59702 SmallVector<int, 32> ScaledMask;
59703 SmallVector<SDValue, 2> ShuffleInputs;
59704 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59705 // Decode the shuffle mask and scale it so its shuffling subvectors.
59706 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59707 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59708 unsigned SubVecIdx = IdxVal / NumSubElts;
59709 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59710 return DAG.getUNDEF(VT);
59711 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59712 return getZeroVector(VT, Subtarget, DAG, DL);
59713 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59714 if (Src.getValueSizeInBits() == InSizeInBits) {
59715 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59716 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59717 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59718 DL, SizeInBits);
59719 }
59720 }
59721 }
59722
59723 auto IsExtractFree = [](SDValue V) {
59724 if (V.hasOneUse()) {
59725 V = peekThroughOneUseBitcasts(V);
59726 if (V.getOpcode() == ISD::LOAD)
59727 return true;
59728 }
59729 V = peekThroughBitcasts(V);
59730 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59731 return true;
59732 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
59733 return true;
59734 return V.isUndef();
59735 };
59736
59737 // If we're extracting the lowest subvector and we're the only user,
59738 // we may be able to perform this with a smaller vector width.
59739 unsigned InOpcode = InVec.getOpcode();
59740 if (InVec.hasOneUse()) {
59741 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59742 // v2f64 CVTDQ2PD(v4i32).
59743 if (InOpcode == ISD::SINT_TO_FP &&
59744 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59745 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59746 }
59747 // v2f64 CVTUDQ2PD(v4i32).
59748 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59749 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59750 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59751 }
59752 // v2f64 CVTPS2PD(v4f32).
59753 if (InOpcode == ISD::FP_EXTEND &&
59754 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59755 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59756 }
59757 }
59758 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59759 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59760 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59761 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59762 Subtarget.hasVLX())) &&
59763 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59764 SDValue Src = InVec.getOperand(0);
59765 if (Src.getValueType().getScalarSizeInBits() == 32)
59766 return DAG.getNode(InOpcode, DL, VT,
59767 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59768 }
59769 if (IdxVal == 0 &&
59770 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59771 (SizeInBits == 128 || SizeInBits == 256) &&
59772 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59773 SDValue Ext = InVec.getOperand(0);
59774 if (Ext.getValueSizeInBits() > SizeInBits)
59775 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59776 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59777 return DAG.getNode(ExtOp, DL, VT, Ext);
59778 }
59779 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59780 InVec.getOperand(0).getValueType().is256BitVector() &&
59781 InVec.getOperand(1).getValueType().is256BitVector() &&
59782 InVec.getOperand(2).getValueType().is256BitVector()) {
59783 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59784 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59785 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59786 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59787 }
59788 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59789 (SizeInBits == 128 || SizeInBits == 256)) {
59790 SDValue InVecSrc = InVec.getOperand(0);
59791 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59792 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59793 return DAG.getNode(InOpcode, DL, VT, Ext);
59794 }
59795
59796 if (SizeInBits == 128 || SizeInBits == 256) {
59797 switch (InOpcode) {
59798 case X86ISD::MOVDDUP:
59799 return DAG.getNode(
59800 InOpcode, DL, VT,
59801 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59802 case X86ISD::PSHUFD:
59803 case X86ISD::VPERMILPI:
59804 if (InVec.getOperand(0).hasOneUse()) {
59805 uint64_t M = InVec.getConstantOperandVal(1) & 255;
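// 64-bit element immediates use one bit per element, so shift down to the
// extracted lanes; 32-bit immediates repeat per 128-bit lane and can be
// reused as-is.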
59806 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59807 return DAG.getNode(InOpcode, DL, VT,
59808 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59809 DL, SizeInBits),
59810 DAG.getTargetConstant(M, DL, MVT::i8));
59811 }
59812 break;
59813 case X86ISD::PCMPEQ:
59814 case X86ISD::PCMPGT:
59815 case X86ISD::UNPCKH:
59816 case X86ISD::UNPCKL:
59817 if (IsExtractFree(InVec.getOperand(0)) ||
59818 IsExtractFree(InVec.getOperand(1)))
59819 return DAG.getNode(InOpcode, DL, VT,
59820 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59821 DL, SizeInBits),
59822 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59823 DL, SizeInBits));
59824 break;
59825 case X86ISD::CMPP:
59826 if (IsExtractFree(InVec.getOperand(0)) ||
59827 IsExtractFree(InVec.getOperand(1)))
59828 return DAG.getNode(InOpcode, DL, VT,
59829 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59830 DL, SizeInBits),
59831 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59832 DL, SizeInBits),
59833 InVec.getOperand(2));
59834 break;
59835 case X86ISD::BLENDI:
59836 if (IsExtractFree(InVec.getOperand(0)) ||
59837 IsExtractFree(InVec.getOperand(1))) {
59838 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59839 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59840 return DAG.getNode(InOpcode, DL, VT,
59841 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59842 DL, SizeInBits),
59843 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59844 DL, SizeInBits),
59845 DAG.getTargetConstant(M, DL, MVT::i8));
59846 }
59847 break;
59848 case X86ISD::VPERMV:
59849 if (IdxVal != 0) {
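// Rather than extracting from the VPERMV result, extract the relevant part
// of the mask, widen it, redo the full-width shuffle and then take the low
// subvector.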
59850 SDValue Mask = InVec.getOperand(0);
59851 SDValue Src = InVec.getOperand(1);
59852 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59853 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59854 DL, InSizeInBits);
59855 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59856 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59857 }
59858 break;
59859 case X86ISD::VPERMV3:
59860 if (IdxVal != 0) {
59861 SDValue Src0 = InVec.getOperand(0);
59862 SDValue Mask = InVec.getOperand(1);
59863 SDValue Src1 = InVec.getOperand(2);
59864 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59865 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59866 DL, InSizeInBits);
59867 SDValue Shuffle =
59868 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59869 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59870 }
59871 break;
59872 }
59873 }
59874 }
59875
59876 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
59877 // as this is very likely to fold into a shuffle/truncation.
59878 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
59879 InVecVT.getScalarSizeInBits() == 64 &&
59880 InVec.getConstantOperandAPInt(1) == 32) {
59881 SDValue Ext =
59882 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
59883 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
59884 }
59885
59886 return SDValue();
59887}
59888
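// Combine SCALAR_TO_VECTOR nodes. This peeks through redundant AND/extension
// patterns on the scalar source, reuses an existing VBROADCAST of the same
// value, and turns scalarized shifts back into vector shift nodes.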
59889static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
59890 const X86Subtarget &Subtarget) {
59891 using namespace SDPatternMatch;
59892 EVT VT = N->getValueType(0);
59893 SDValue Src = N->getOperand(0);
59894 SDLoc DL(N);
59895
59896 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
59897 // This occurs frequently in our masked scalar intrinsic code and our
59898 // floating point select lowering with AVX512.
59899 // TODO: SimplifyDemandedBits instead?
59900 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
59901 isOneConstant(Src.getOperand(1)))
59902 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
59903
59904 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
59905 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
59906 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
59907 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
59908 isNullConstant(Src.getOperand(1)))
59909 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
59910 Src.getOperand(1));
59911
59912 // Reduce v2i64 to v4i32 if the upper bits aren't needed or are known zero.
59913 // TODO: Move to DAGCombine/SimplifyDemandedBits?
59914 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
59915 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
59916 if (Op.getValueType() != MVT::i64)
59917 return SDValue();
59918 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
59919 if (Op.getOpcode() == Opc &&
59920 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
59921 return Op.getOperand(0);
59922 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
59923 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
59924 if (Ld->getExtensionType() == Ext &&
59925 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
59926 return Op;
59927 if (IsZeroExt) {
59928 KnownBits Known = DAG.computeKnownBits(Op);
59929 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
59930 return Op;
59931 }
59932 return SDValue();
59933 };
59934
59935 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
59936 return DAG.getBitcast(
59937 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
59938 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
59939
59940 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
59941 return DAG.getBitcast(
59942 VT,
59943 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
59944 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
59945 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
59946 }
59947
59948 if (Src.getOpcode() == ISD::BITCAST) {
59949 SDValue SrcOp = Src.getOperand(0);
59950 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
59951 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
59952 return DAG.getBitcast(
59953 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
59954 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
59955 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
59956 return DAG.getBitcast(
59957 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
59958 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
59959 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
59960 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
59961 }
59962
59963 if (VT == MVT::v4i32) {
59964 SDValue HalfSrc;
59965 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
59966 // to remove XMM->GPR->XMM moves.
59967 if (sd_match(Src, m_AnyExt(m_BitCast(
59968 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
59969 return DAG.getBitcast(
59970 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
59971 }
59972
59973 // See if we're broadcasting the scalar value, in which case just reuse that.
59974 // Ensure the same SDValue from the SDNode use is being used.
59975 if (VT.getScalarType() == Src.getValueType())
59976 for (SDNode *User : Src->users())
59977 if (User->getOpcode() == X86ISD::VBROADCAST &&
59978 Src == User->getOperand(0)) {
59979 unsigned SizeInBits = VT.getFixedSizeInBits();
59980 unsigned BroadcastSizeInBits =
59981 User->getValueSizeInBits(0).getFixedValue();
59982 if (BroadcastSizeInBits == SizeInBits)
59983 return SDValue(User, 0);
59984 if (BroadcastSizeInBits > SizeInBits)
59985 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
59986 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
59987 // coverage.
59988 }
59989
59990 // Check for cases where we've ended up with a scalarized shift, typically
59991 // during type legalization.
59992 switch (Src.getOpcode()) {
59993 case ISD::SHL:
59994 case ISD::SRL:
59995 case ISD::SRA:
59996 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
59997 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
59998 Src.hasOneUse()) {
59999 SDValue SrcVec =
60000 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60001 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60002 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60003 Amt->getZExtValue(), DAG);
60004 }
60005 }
60006 break;
60007 case ISD::FSHL:
60008 case ISD::FSHR:
60009 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60010 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60011 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60012 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60013 Src.hasOneUse()) {
60014 uint64_t AmtVal =
60015 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60016 SDValue SrcVec0 =
60017 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60018 SDValue SrcVec1 =
60019 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60020 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60021 DAG.getConstant(AmtVal, DL, VT));
60022 }
60023 }
60024 break;
60025 }
60026
60027 return SDValue();
60028}
60029
60030// Simplify PMULDQ and PMULUDQ operations.
60031static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60032 TargetLowering::DAGCombinerInfo &DCI,
60033 const X86Subtarget &Subtarget) {
60034 SDValue LHS = N->getOperand(0);
60035 SDValue RHS = N->getOperand(1);
60036
60037 // Canonicalize constant to RHS.
60038 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60039 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60040 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60041
60042 // Multiply by zero.
60043 // Don't return RHS as it may contain UNDEFs.
60044 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60045 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60046
60047 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60048 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60049 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60050 return SDValue(N, 0);
60051
60052 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60053 // convert it to any_extend_invec, due to the LegalOperations check, do the
60054 // conversion directly to a vector shuffle manually. This exposes combine
60055 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60056 // combineX86ShufflesRecursively on SSE4.1 targets.
60057 // FIXME: This is basically a hack around several other issues related to
60058 // ANY_EXTEND_VECTOR_INREG.
60059 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60060 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60061 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60062 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60063 SDLoc dl(N);
60064 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60065 LHS.getOperand(0), { 0, -1, 1, -1 });
60066 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60067 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60068 }
60069 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60070 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60071 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60072 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60073 SDLoc dl(N);
60074 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60075 RHS.getOperand(0), { 0, -1, 1, -1 });
60076 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60077 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60078 }
60079
60080 return SDValue();
60081}
60082
60083// Simplify VPMADDUBSW/VPMADDWD operations.
60084static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60085 TargetLowering::DAGCombinerInfo &DCI) {
60086 MVT VT = N->getSimpleValueType(0);
60087 SDValue LHS = N->getOperand(0);
60088 SDValue RHS = N->getOperand(1);
60089 unsigned Opc = N->getOpcode();
60090 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60092 "Unexpected PMADD opcode");
60093
60094 // Multiply by zero.
60095 // Don't return LHS/RHS as it may contain UNDEFs.
60096 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60097 ISD::isBuildVectorAllZeros(RHS.getNode()))
60098 return DAG.getConstant(0, SDLoc(N), VT);
60099
60100 // Constant folding.
60101 APInt LHSUndefs, RHSUndefs;
60102 SmallVector<APInt> LHSBits, RHSBits;
60103 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60104 unsigned DstEltBits = VT.getScalarSizeInBits();
60105 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60106 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60107 SmallVector<APInt> Result;
60108 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60109 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60110 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60111 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60112 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60113 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60114 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60115 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60116 Result.push_back(Res);
60117 }
60118 return getConstVector(Result, VT, DAG, SDLoc(N));
60119 }
60120
60121 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60122 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60123 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60124 return SDValue(N, 0);
60125
60126 return SDValue();
60127}
60128
60129// Simplify VPMADD52L/VPMADD52H operations.
60130static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60131 TargetLowering::DAGCombinerInfo &DCI) {
60132 MVT VT = N->getSimpleValueType(0);
60133 unsigned NumEltBits = VT.getScalarSizeInBits();
60134 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60135 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60136 DCI))
60137 return SDValue(N, 0);
60138
60139 return SDValue();
60140}
60141
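// Combine ANY/SIGN/ZERO_EXTEND_VECTOR_INREG nodes: fold extends of loads into
// extending loads, collapse nested extends, expand a zero-extend of a
// BUILD_VECTOR into an interleaved BUILD_VECTOR, and try shuffle combining on
// SSE41+ targets.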
60142static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60143 TargetLowering::DAGCombinerInfo &DCI,
60144 const X86Subtarget &Subtarget) {
60145 EVT VT = N->getValueType(0);
60146 SDValue In = N->getOperand(0);
60147 unsigned Opcode = N->getOpcode();
60148 unsigned InOpcode = In.getOpcode();
60149 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60150 SDLoc DL(N);
60151
60152 // Try to merge vector loads and extend_inreg to an extload.
60153 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60154 In.hasOneUse()) {
60155 auto *Ld = cast<LoadSDNode>(In);
60156 if (Ld->isSimple()) {
60157 MVT SVT = In.getSimpleValueType().getVectorElementType();
60158 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60159 ? ISD::SEXTLOAD
60160 : ISD::ZEXTLOAD;
60161 EVT MemVT = VT.changeVectorElementType(SVT);
60162 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60163 SDValue Load = DAG.getExtLoad(
60164 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60165 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60166 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60167 return Load;
60168 }
60169 }
60170 }
60171
60172 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60173 if (Opcode == InOpcode)
60174 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60175
60176 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60177 // -> EXTEND_VECTOR_INREG(X).
60178 // TODO: Handle non-zero subvector indices.
60179 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60180 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60181 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60182 In.getValueSizeInBits())
60183 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60184
60185 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60186 // TODO: Move to DAGCombine?
60187 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60188 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60189 In.getValueSizeInBits() == VT.getSizeInBits()) {
60190 unsigned NumElts = VT.getVectorNumElements();
60191 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60192 EVT EltVT = In.getOperand(0).getValueType();
60193 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60194 for (unsigned I = 0; I != NumElts; ++I)
60195 Elts[I * Scale] = In.getOperand(I);
60196 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60197 }
60198
60199 // Attempt to combine as a shuffle on SSE41+ targets.
60200 if (Subtarget.hasSSE41()) {
60201 SDValue Op(N, 0);
60202 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60203 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60204 return Res;
60205 }
60206
60207 return SDValue();
60208}
60209
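// Combine KSHIFTL/KSHIFTR mask shifts: shifts of a zero mask fold to zero,
// consecutive right shifts are merged, and unused vector elements are
// simplified away.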
60210static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60211 TargetLowering::DAGCombinerInfo &DCI) {
60212 EVT VT = N->getValueType(0);
60213 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60214 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60215 return DAG.getConstant(0, SDLoc(N), VT);
60216
60217 // Fold kshiftr(extract_subvector(X,C1),C2)
60218 // --> extract_subvector(kshiftr(X,C1+C2),0)
60219 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60220 if (N->getOpcode() == X86ISD::KSHIFTR) {
60221 SDLoc DL(N);
60222 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60223 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60224 SDValue Src = N->getOperand(0).getOperand(0);
60225 uint64_t Amt = N->getConstantOperandVal(1) +
60226 N->getOperand(0).getConstantOperandVal(1);
60227 EVT SrcVT = Src.getValueType();
60228 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60229 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60230 DAG.getTargetConstant(Amt, DL, MVT::i8));
60231 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60232 DAG.getVectorIdxConstant(0, DL));
60233 }
60234 }
60235 }
60236
60237 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60238 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60239 return SDValue(N, 0);
60240
60241 return SDValue();
60242}
60243
60244// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60245// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
60246// produce extra instructions between the conversions by going to scalar and back.
60247static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60248 const X86Subtarget &Subtarget) {
60249 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60250 return SDValue();
60251
60252 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60253 return SDValue();
60254
60255 if (N->getValueType(0) != MVT::f32 ||
60256 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60257 return SDValue();
60258
60259 SDLoc dl(N);
60260 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60261 N->getOperand(0).getOperand(0));
60262 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60263 DAG.getTargetConstant(4, dl, MVT::i32));
60264 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60265 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60266 DAG.getVectorIdxConstant(0, dl));
60267}
60268
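// Combine FP_EXTEND: bf16 sources are extended with an integer shift-by-16
// bitcast trick, while f16 vector sources on F16C-only targets are widened
// and converted through (STRICT_)CVTPH2PS.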
60269static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60270 TargetLowering::DAGCombinerInfo &DCI,
60271 const X86Subtarget &Subtarget) {
60272 EVT VT = N->getValueType(0);
60273 bool IsStrict = N->isStrictFPOpcode();
60274 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60275 EVT SrcVT = Src.getValueType();
60276
60277 SDLoc dl(N);
60278 if (SrcVT.getScalarType() == MVT::bf16) {
60279 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60280 !IsStrict && Src.getOperand(0).getValueType() == VT)
60281 return Src.getOperand(0);
60282
60283 if (!SrcVT.isVector())
60284 return SDValue();
60285
60286 assert(!IsStrict && "Strict FP doesn't support BF16");
60287 if (VT.getVectorElementType() == MVT::f64) {
60288 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60289 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60290 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60291 }
60292 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
60293 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60294 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60295 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60296 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60297 return DAG.getBitcast(VT, Src);
60298 }
60299
60300 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60301 return SDValue();
60302
60303 if (Subtarget.hasFP16())
60304 return SDValue();
60305
60306 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60307 return SDValue();
60308
60309 if (VT.getVectorElementType() != MVT::f32 &&
60310 VT.getVectorElementType() != MVT::f64)
60311 return SDValue();
60312
60313 unsigned NumElts = VT.getVectorNumElements();
60314 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60315 return SDValue();
60316
60317 // Convert the input to vXi16.
60318 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60319 Src = DAG.getBitcast(IntVT, Src);
60320
60321 // Widen to at least 8 input elements.
60322 if (NumElts < 8) {
60323 unsigned NumConcats = 8 / NumElts;
60324 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60325 : DAG.getConstant(0, dl, IntVT);
60326 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60327 Ops[0] = Src;
60328 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60329 }
60330
60331 // Destination is vXf32 with at least 4 elements.
60332 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60333 std::max(4U, NumElts));
60334 SDValue Cvt, Chain;
60335 if (IsStrict) {
60336 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60337 {N->getOperand(0), Src});
60338 Chain = Cvt.getValue(1);
60339 } else {
60340 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60341 }
60342
60343 if (NumElts < 4) {
60344 assert(NumElts == 2 && "Unexpected size");
60345 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60346 DAG.getVectorIdxConstant(0, dl));
60347 }
60348
60349 if (IsStrict) {
60350 // Extend to the original VT if necessary.
60351 if (Cvt.getValueType() != VT) {
60352 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60353 {Chain, Cvt});
60354 Chain = Cvt.getValue(1);
60355 }
60356 return DAG.getMergeValues({Cvt, Chain}, dl);
60357 }
60358
60359 // Extend to the original VT if necessary.
60360 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60361}
60362
60363// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60364static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60365 TargetLowering::DAGCombinerInfo &DCI) {
60366 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60367 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60368 "Unknown broadcast load type");
60369
60370 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60371 SDValue Ptr = MemIntrin->getBasePtr();
60372 SDValue Chain = MemIntrin->getChain();
60373 EVT VT = N->getSimpleValueType(0);
60374 EVT MemVT = MemIntrin->getMemoryVT();
60375
60376 // Look at other users of our base pointer and try to find a wider broadcast.
60377 // The input chain and the size of the memory VT must match.
60378 for (SDNode *User : Ptr->users())
60379 if (User != N && User->getOpcode() == N->getOpcode() &&
60380 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60381 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60382 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60383 MemVT.getSizeInBits() &&
60384 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60385 assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60386 MemIntrin->isSimple() && "Illegal broadcast load type");
60388 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60389 VT.getSizeInBits());
60390 Extract = DAG.getBitcast(VT, Extract);
60391 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60392 return Extract;
60393 }
60394
60395 return SDValue();
60396}
60397
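// Combine FP_ROUND to f16: with AVX512-FP16, fold concatenated xint_to_fp
// results into CVTSI2P/CVTUI2P; otherwise, on F16C targets, widen the source
// and convert through (STRICT_)CVTPS2PH.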
60398static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60399 const X86Subtarget &Subtarget) {
60400 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60401 return SDValue();
60402
60403 bool IsStrict = N->isStrictFPOpcode();
60404 EVT VT = N->getValueType(0);
60405 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60406 EVT SrcVT = Src.getValueType();
60407
60408 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60409 SrcVT.getVectorElementType() != MVT::f32)
60410 return SDValue();
60411
60412 SDLoc dl(N);
60413
60414 SDValue Cvt, Chain;
60415 unsigned NumElts = VT.getVectorNumElements();
60416 if (Subtarget.hasFP16()) {
60417 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60418 // v4f32 (xint_to_fp v4i64))))
60419 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60420 // v8f16 (CVTXI2P v4i64)))
60421 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60422 Src.getNumOperands() == 2) {
60423 SDValue Cvt0, Cvt1;
60424 SDValue Op0 = Src.getOperand(0);
60425 SDValue Op1 = Src.getOperand(1);
60426 bool IsOp0Strict = Op0->isStrictFPOpcode();
60427 if (Op0.getOpcode() != Op1.getOpcode() ||
60428 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60429 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60430 return SDValue();
60431 }
60432 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60433 if (IsStrict) {
60434 assert(IsOp0Strict && "Op0 must be strict node");
60435 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60436 ? X86ISD::STRICT_CVTSI2P
60437 : X86ISD::STRICT_CVTUI2P;
60438 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60439 {Op0.getOperand(0), Op0.getOperand(1)});
60440 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60441 {Op1.getOperand(0), Op1.getOperand(1)});
60442 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60443 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60444 }
60445 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60446 : X86ISD::CVTUI2P;
60447 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60448 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60449 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60450 }
60451 return SDValue();
60452 }
60453
60454 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60455 return SDValue();
60456
60457 // Widen to at least 4 input elements.
60458 if (NumElts < 4)
60459 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60460 DAG.getConstantFP(0.0, dl, SrcVT));
60461
60462 // Destination is v8i16 with at least 8 elements.
60463 EVT CvtVT =
60464 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60465 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60466 if (IsStrict) {
60467 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60468 {N->getOperand(0), Src, Rnd});
60469 Chain = Cvt.getValue(1);
60470 } else {
60471 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60472 }
60473
60474 // Extract down to real number of elements.
60475 if (NumElts < 8) {
60476 EVT IntVT = VT.changeVectorElementTypeToInteger();
60477 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60478 DAG.getVectorIdxConstant(0, dl));
60479 }
60480
60481 Cvt = DAG.getBitcast(VT, Cvt);
60482
60483 if (IsStrict)
60484 return DAG.getMergeValues({Cvt, Chain}, dl);
60485
60486 return Cvt;
60487}
60488
60489static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60490 SDValue Src = N->getOperand(0);
60491
60492 // Turn MOVDQ2Q+simple_load into an mmx load.
60493 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60494 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60495
60496 if (LN->isSimple()) {
60497 SDValue NewLd =
60498 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60499 LN->getPointerInfo(), LN->getBaseAlign(),
60500 LN->getMemOperand()->getFlags());
60501 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60502 return NewLd;
60503 }
60504 }
60505
60506 return SDValue();
60507}
60508
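// Simplify PDEP operations through SimplifyDemandedBits on the result.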
60509static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60510 TargetLowering::DAGCombinerInfo &DCI) {
60511 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60512 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60513 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60514 return SDValue(N, 0);
60515
60516 return SDValue();
60517}
60518
60519// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60520// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60521// use x86mmx instead.
60522static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60523 SDLoc dl(N);
60524
60525 bool MadeChange = false, CastReturnVal = false;
60526 SmallVector<SDValue> Args;
60527 for (const SDValue &Arg : N->op_values()) {
60528 if (Arg.getValueType() == MVT::v1i64) {
60529 MadeChange = true;
60530 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60531 } else
60532 Args.push_back(Arg);
60533 }
60534 SDVTList VTs = N->getVTList();
60535 SDVTList NewVTs = VTs;
60536 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60537 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60538 NewVTArr[0] = MVT::x86mmx;
60539 NewVTs = DAG.getVTList(NewVTArr);
60540 MadeChange = true;
60541 CastReturnVal = true;
60542 }
60543
60544 if (MadeChange) {
60545 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60546 if (CastReturnVal) {
60547 SmallVector<SDValue> Returns;
60548 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60549 Returns.push_back(Result.getValue(i));
60550 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60551 return DAG.getMergeValues(Returns, dl);
60552 }
60553 return Result;
60554 }
60555 return SDValue();
60556}
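// The three combineINTRINSIC_* hooks below rewrite INTR_TYPE_CAST_MMX
// intrinsics before legalization so that the v1i64 types created by
// SelectionDAGBuilder become x86mmx.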
60557static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60558 TargetLowering::DAGCombinerInfo &DCI) {
60559 if (!DCI.isBeforeLegalize())
60560 return SDValue();
60561
60562 unsigned IntNo = N->getConstantOperandVal(0);
60563 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60564
60565 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60566 return FixupMMXIntrinsicTypes(N, DAG);
60567
60568 return SDValue();
60569}
60570
60571static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60572 TargetLowering::DAGCombinerInfo &DCI) {
60573 if (!DCI.isBeforeLegalize())
60574 return SDValue();
60575
60576 unsigned IntNo = N->getConstantOperandVal(1);
60577 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60578
60579 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60580 return FixupMMXIntrinsicTypes(N, DAG);
60581
60582 return SDValue();
60583}
60584
60585static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60586 TargetLowering::DAGCombinerInfo &DCI) {
60587 if (!DCI.isBeforeLegalize())
60588 return SDValue();
60589
60590 unsigned IntNo = N->getConstantOperandVal(1);
60591 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60592
60593 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60594 return FixupMMXIntrinsicTypes(N, DAG);
60595
60596 return SDValue();
60597}
60598
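// Central dispatch for target DAG combines: route each node to the matching
// combine helper above.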
60599SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60600 DAGCombinerInfo &DCI) const {
60601 SelectionDAG &DAG = DCI.DAG;
60602 switch (N->getOpcode()) {
60603 // clang-format off
60604 default: break;
60605 case ISD::SCALAR_TO_VECTOR:
60606 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60607 case ISD::EXTRACT_VECTOR_ELT:
60608 case X86ISD::PEXTRW:
60609 case X86ISD::PEXTRB:
60610 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60611 case ISD::CONCAT_VECTORS:
60612 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60613 case ISD::INSERT_SUBVECTOR:
60614 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60615 case ISD::EXTRACT_SUBVECTOR:
60616 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60617 case ISD::VSELECT:
60618 case ISD::SELECT:
60619 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60620 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60621 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60622 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60623 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60624 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60625 case X86ISD::ADD:
60626 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60627 case X86ISD::CLOAD:
60628 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60629 case X86ISD::SBB: return combineSBB(N, DAG);
60630 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60631 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60632 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60633 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60634 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60635 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60636 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60637 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60638 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60639 case ISD::AVGCEILS:
60640 case ISD::AVGCEILU:
60641 case ISD::AVGFLOORS:
60642 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60643 case X86ISD::BEXTR:
60644 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60645 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60646 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60647 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60648 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60649 case X86ISD::VEXTRACT_STORE:
60650 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60651 case ISD::SINT_TO_FP:
60652 case ISD::STRICT_SINT_TO_FP:
60653 return combineSIntToFP(N, DAG, DCI, Subtarget);
60654 case ISD::UINT_TO_FP:
60655 case ISD::STRICT_UINT_TO_FP:
60656 return combineUIntToFP(N, DAG, Subtarget);
60657 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60658 case ISD::LRINT:
60659 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60660 case ISD::FADD:
60661 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60662 case X86ISD::VFCMULC:
60663 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60664 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60665 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60666 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60667 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60668 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60669 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60670 case X86ISD::FXOR:
60671 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60672 case X86ISD::FMIN:
60673 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60674 case ISD::FMINNUM:
60675 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60676 case X86ISD::CVTSI2P:
60677 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60678 case X86ISD::CVTP2SI:
60679 case X86ISD::CVTP2UI:
60680 case X86ISD::STRICT_CVTTP2SI:
60681 case X86ISD::CVTTP2SI:
60682 case X86ISD::STRICT_CVTTP2UI:
60683 case X86ISD::CVTTP2UI:
60684 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60685 case X86ISD::STRICT_CVTPH2PS:
60686 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60687 case X86ISD::BT: return combineBT(N, DAG, DCI);
60688 case ISD::ANY_EXTEND:
60689 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60690 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60691 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60692 case ISD::ANY_EXTEND_VECTOR_INREG:
60693 case ISD::SIGN_EXTEND_VECTOR_INREG:
60694 case ISD::ZERO_EXTEND_VECTOR_INREG:
60695 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60696 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60697 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60698 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60699 case X86ISD::PACKSS:
60700 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60701 case X86ISD::HADD:
60702 case X86ISD::HSUB:
60703 case X86ISD::FHADD:
60704 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60705 case X86ISD::VSHL:
60706 case X86ISD::VSRA:
60707 case X86ISD::VSRL:
60708 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60709 case X86ISD::VSHLI:
60710 case X86ISD::VSRAI:
60711 case X86ISD::VSRLI:
60712 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60713 case ISD::INSERT_VECTOR_ELT:
60714 case X86ISD::PINSRB:
60715 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60716 case X86ISD::SHUFP: // Handle all target specific shuffles
60717 case X86ISD::INSERTPS:
60718 case X86ISD::EXTRQI:
60719 case X86ISD::INSERTQI:
60720 case X86ISD::VALIGN:
60721 case X86ISD::PALIGNR:
60722 case X86ISD::VSHLDQ:
60723 case X86ISD::VSRLDQ:
60724 case X86ISD::BLENDI:
60725 case X86ISD::UNPCKH:
60726 case X86ISD::UNPCKL:
60727 case X86ISD::MOVHLPS:
60728 case X86ISD::MOVLHPS:
60729 case X86ISD::PSHUFB:
60730 case X86ISD::PSHUFD:
60731 case X86ISD::PSHUFHW:
60732 case X86ISD::PSHUFLW:
60733 case X86ISD::MOVSHDUP:
60734 case X86ISD::MOVSLDUP:
60735 case X86ISD::MOVDDUP:
60736 case X86ISD::MOVSS:
60737 case X86ISD::MOVSD:
60738 case X86ISD::MOVSH:
60739 case X86ISD::VBROADCAST:
60740 case X86ISD::VPPERM:
60741 case X86ISD::VPERMI:
60742 case X86ISD::VPERMV:
60743 case X86ISD::VPERMV3:
60744 case X86ISD::VPERMIL2:
60745 case X86ISD::VPERMILPI:
60746 case X86ISD::VPERMILPV:
60747 case X86ISD::VPERM2X128:
60748 case X86ISD::SHUF128:
60749 case X86ISD::VZEXT_MOVL:
60750 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60751 case X86ISD::FMADD_RND:
60752 case X86ISD::FMSUB:
60753 case X86ISD::STRICT_FMSUB:
60754 case X86ISD::FMSUB_RND:
60755 case X86ISD::FNMADD:
60756 case X86ISD::STRICT_FNMADD:
60757 case X86ISD::FNMADD_RND:
60758 case X86ISD::FNMSUB:
60759 case X86ISD::STRICT_FNMSUB:
60760 case X86ISD::FNMSUB_RND:
60761 case ISD::FMA:
60762 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60763 case X86ISD::FMADDSUB_RND:
60764 case X86ISD::FMSUBADD_RND:
60765 case X86ISD::FMADDSUB:
60766 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60767 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60768 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60769 case X86ISD::MGATHER:
60770 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60771 case ISD::MGATHER:
60772 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60773 case X86ISD::PCMPEQ:
60774 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60775 case X86ISD::PMULDQ:
60776 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60777 case X86ISD::VPMADDUBSW:
60778 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60779 case X86ISD::VPMADD52L:
60780 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60781 case X86ISD::KSHIFTL:
60782 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60783 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60784 case ISD::STRICT_FP_EXTEND:
60785 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60786 case ISD::STRICT_FP_ROUND:
60787 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60788 case X86ISD::VBROADCAST_LOAD:
60789 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60790 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60791 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60792 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60793 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60794 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60795 case ISD::FP_TO_SINT_SAT:
60796 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60797 // clang-format on
60798 }
60799
60800 return SDValue();
60801}
60802
60804 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60805}
60806
60807// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60808bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
60809 EVT ExtVT) const {
60810 return Subtarget.hasAVX512() || !VT.isVector();
60811}
60812
60813bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60814 if (!isTypeLegal(VT))
60815 return false;
60816
60817 // There are no vXi8 shifts.
60818 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60819 return false;
60820
60821 // TODO: Almost no 8-bit ops are desirable because they have no actual
60822 // size/speed advantages vs. 32-bit ops, but they do have a major
60823 // potential disadvantage by causing partial register stalls.
60824 //
60825 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60826 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60827 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
60828 // check for a constant operand to the multiply.
60829 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
60830 return false;
60831
60832 // i16 instruction encodings are longer and some i16 instructions are slow,
60833 // so those are not desirable.
60834 if (VT == MVT::i16) {
60835 switch (Opc) {
60836 default:
60837 break;
60838 case ISD::LOAD:
60839 case ISD::SIGN_EXTEND:
60840 case ISD::ZERO_EXTEND:
60841 case ISD::ANY_EXTEND:
60842 case ISD::MUL:
60843 return false;
60844 case ISD::SHL:
60845 case ISD::SRA:
60846 case ISD::SRL:
60847 case ISD::SUB:
60848 case ISD::ADD:
60849 case ISD::AND:
60850 case ISD::OR:
60851 case ISD::XOR:
60852 // NDD instructions never have the "partial register write" issue because
60853 // the destination register's upper bits [63:OSIZE] are zeroed even when
60854 // OSIZE=8/16.
60855 return Subtarget.hasNDD();
60856 }
60857 }
60858
60859 // Any legal type not explicitly accounted for above here is desirable.
60860 return true;
60861}
60862
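// Expand an indirect jump-table branch. With "cf-protection-branch" enabled,
// an NT_BRIND node is emitted so the indirect jump carries a NOTRACK prefix.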
60863SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
60864 SDValue Value, SDValue Addr,
60865 int JTI,
60866 SelectionDAG &DAG) const {
60867 const Module *M = DAG.getMachineFunction().getFunction().getParent();
60868 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
60869 if (IsCFProtectionSupported) {
60870 // In case control-flow branch protection is enabled, we need to add
60871 // notrack prefix to the indirect branch.
60872 // In order to do that we create NT_BRIND SDNode.
60873 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
60874 SDValue Chain = Value;
60875 // Jump table debug info is only needed if CodeView is enabled.
60876 if (DAG.getTarget().getTargetTriple().isOSBinFormatCOFF())
60877 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
60878 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
60879 }
60880
60881 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
60882}
60883
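// Report which AndOrSETCCFoldKind transforms of two SETCCs feeding a logic op
// are considered profitable on X86.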
60884TargetLowering::AndOrSETCCFoldKind
60885X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
60886 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
60887 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
60888 EVT VT = LogicOp->getValueType(0);
60889 EVT OpVT = SETCC0->getOperand(0).getValueType();
60890 if (!VT.isInteger())
60891 return AndOrSETCCFoldKind::None;
60892
60893 if (VT.isVector())
60894 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
60895 (isOperationLegal(ISD::ABS, OpVT)
60896 ? AndOrSETCCFoldKind::ABS
60897 : AndOrSETCCFoldKind::None));
60898
60899 // Don't use `NotAnd` as even though `not` is generally shorter code size than
60900 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
60901 // `NotAnd` applies, `AddAnd` does as well.
60902 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
60903 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
60904 return AndOrSETCCFoldKind::AddAnd;
60905}
60906
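// Decide whether promoting this i16 (or 8-bit multiply-by-constant) operation
// to i32 is worthwhile, avoiding cases that would defeat load folding or RMW
// patterns; PVT is set to i32 when promotion is desirable.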
60907bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
60908 EVT VT = Op.getValueType();
60909 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
60910 isa<ConstantSDNode>(Op.getOperand(1));
60911
60912 // i16 is legal, but undesirable since i16 instruction encodings are longer
60913 // and some i16 instructions are slow.
60914 // 8-bit multiply-by-constant can usually be expanded to something cheaper
60915 // using LEA and/or other ALU ops.
60916 if (VT != MVT::i16 && !Is8BitMulByConstant)
60917 return false;
60918
60919 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
60920 if (!Op.hasOneUse())
60921 return false;
60922 SDNode *User = *Op->user_begin();
60923 if (User->getOpcode() != ISD::STORE)
60924 return false;
60925 auto *Ld = cast<LoadSDNode>(Load);
60926 auto *St = cast<StoreSDNode>(User);
60927 return Ld->getBasePtr() == St->getBasePtr();
60928 };
60929
60930 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
60931 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
60932 return false;
60933 if (!Op.hasOneUse())
60934 return false;
60935 SDNode *User = *Op->user_begin();
60936 if (User->getOpcode() != ISD::ATOMIC_STORE)
60937 return false;
60938 auto *Ld = cast<AtomicSDNode>(Load);
60939 auto *St = cast<AtomicSDNode>(User);
60940 return Ld->getBasePtr() == St->getBasePtr();
60941 };
60942
60943 auto IsFoldableZext = [](SDValue Op) {
60944 if (!Op.hasOneUse())
60945 return false;
60946 SDNode *User = *Op->user_begin();
60947 EVT VT = User->getValueType(0);
60948 return (User->getOpcode() == ISD::ZERO_EXTEND &&
60949 (VT == MVT::i32 || VT == MVT::i64));
60950 };
60951
60952 bool Commute = false;
60953 switch (Op.getOpcode()) {
60954 default: return false;
60955 case ISD::SIGN_EXTEND:
60956 case ISD::ZERO_EXTEND:
60957 case ISD::ANY_EXTEND:
60958 break;
60959 case ISD::SHL:
60960 case ISD::SRA:
60961 case ISD::SRL: {
60962 SDValue N0 = Op.getOperand(0);
60963 // Look out for (store (shl (load), x)).
60964 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
60965 return false;
60966 break;
60967 }
60968 case ISD::MUL:
60969 // When ZU is enabled, we prefer to not promote for MUL by a constant
60970 // when there is an opportunity to fold a zext with imulzu.
60971 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
60972 (isa<ConstantSDNode>(Op.getOperand(0)) ||
60973 isa<ConstantSDNode>(Op.getOperand(1))))
60974 return false;
60975 [[fallthrough]];
60976 case ISD::ADD:
60977 case ISD::AND:
60978 case ISD::OR:
60979 case ISD::XOR:
60980 Commute = true;
60981 [[fallthrough]];
60982 case ISD::SUB: {
60983 SDValue N0 = Op.getOperand(0);
60984 SDValue N1 = Op.getOperand(1);
60985 // Avoid disabling potential load folding opportunities.
60986 if (X86::mayFoldLoad(N1, Subtarget) &&
60987 (!Commute || !isa<ConstantSDNode>(N0) ||
60988 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
60989 return false;
60990 if (X86::mayFoldLoad(N0, Subtarget) &&
60991 ((Commute && !isa<ConstantSDNode>(N1)) ||
60992 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
60993 return false;
60994 if (IsFoldableAtomicRMW(N0, Op) ||
60995 (Commute && IsFoldableAtomicRMW(N1, Op)))
60996 return false;
60997 }
60998 }
60999
61000 PVT = MVT::i32;
61001 return true;
61002}
61003
61004//===----------------------------------------------------------------------===//
61005// X86 Inline Assembly Support
61006//===----------------------------------------------------------------------===//
61007
61008// Helper to match a string separated by whitespace.
61009static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
61010 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
61011
61012 for (StringRef Piece : Pieces) {
61013 if (!S.starts_with(Piece)) // Check if the piece matches.
61014 return false;
61015
61016 S = S.substr(Piece.size());
61017 StringRef::size_type Pos = S.find_first_not_of(" \t");
61018 if (Pos == 0) // We matched a prefix.
61019 return false;
61020
61021 S = S.substr(Pos);
61022 }
61023
61024 return S.empty();
61025}
61026
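// Check whether an inline asm clobber list covers the condition-code and FPSR
// flag registers.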
61027static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
61028
61029 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
61030 if (llvm::is_contained(AsmPieces, "~{cc}") &&
61031 llvm::is_contained(AsmPieces, "~{flags}") &&
61032 llvm::is_contained(AsmPieces, "~{fpsr}")) {
61033
61034 if (AsmPieces.size() == 3)
61035 return true;
61036 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
61037 return true;
61038 }
61039 }
61040 return false;
61041}
61042
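// Recognize inline asm byte-swap idioms (bswap, rorw $8 / rorl $16 sequences)
// and replace them with the equivalent llvm.bswap intrinsic call.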
61043bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
61044 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
61045
61046 StringRef AsmStr = IA->getAsmString();
61047
61048 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
61049 if (!Ty || Ty->getBitWidth() % 16 != 0)
61050 return false;
61051
61052 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
61053 SmallVector<StringRef, 4> AsmPieces;
61054 SplitString(AsmStr, AsmPieces, ";\n");
61055
61056 switch (AsmPieces.size()) {
61057 default: return false;
61058 case 1:
61059 // FIXME: this should verify that we are targeting a 486 or better. If not,
61060 // we will turn this bswap into something that will be lowered to logical
61061 // ops instead of emitting the bswap asm. For now, we don't support 486 or
61062 // lower so don't worry about this.
61063 // bswap $0
61064 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
61065 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
61066 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
61067 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
61068 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
61069 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
61070 // No need to check constraints, nothing other than the equivalent of
61071 // "=r,0" would be valid here.
61072 return IntrinsicLowering::LowerToByteSwap(CI);
61073 }
61074
61075 // rorw $$8, ${0:w} --> llvm.bswap.i16
61076 if (CI->getType()->isIntegerTy(16) &&
61077 IA->getConstraintString().starts_with("=r,0,") &&
61078 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
61079 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
61080 AsmPieces.clear();
61081 StringRef ConstraintsStr = IA->getConstraintString();
61082 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
61083 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
61084 if (clobbersFlagRegisters(AsmPieces))
61085 return IntrinsicLowering::LowerToByteSwap(CI);
61086 }
61087 break;
61088 case 3:
61089 if (CI->getType()->isIntegerTy(32) &&
61090 IA->getConstraintString().starts_with("=r,0,") &&
61091 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
61092 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
61093 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
61094 AsmPieces.clear();
61095 StringRef ConstraintsStr = IA->getConstraintString();
61096 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
61097 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
61098 if (clobbersFlagRegisters(AsmPieces))
61099 return IntrinsicLowering::LowerToByteSwap(CI);
61100 }
61101
61102 if (CI->getType()->isIntegerTy(64)) {
61103 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
61104 if (Constraints.size() >= 2 &&
61105 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
61106 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
61107 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
61108 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
61109 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
61110 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
61111 return IntrinsicLowering::LowerToByteSwap(CI);
61112 }
61113 }
61114 break;
61115 }
61116 return false;
61117}
61118
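// Map "{@cc<cond>}" inline asm constraint strings onto X86 condition codes.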
61119static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
61120 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
61121 .Case("{@cca}", X86::COND_A)
61122 .Case("{@ccae}", X86::COND_AE)
61123 .Case("{@ccb}", X86::COND_B)
61124 .Case("{@ccbe}", X86::COND_BE)
61125 .Case("{@ccc}", X86::COND_B)
61126 .Case("{@cce}", X86::COND_E)
61127 .Case("{@ccz}", X86::COND_E)
61128 .Case("{@ccg}", X86::COND_G)
61129 .Case("{@ccge}", X86::COND_GE)
61130 .Case("{@ccl}", X86::COND_L)
61131 .Case("{@ccle}", X86::COND_LE)
61132 .Case("{@ccna}", X86::COND_BE)
61133 .Case("{@ccnae}", X86::COND_B)
61134 .Case("{@ccnb}", X86::COND_AE)
61135 .Case("{@ccnbe}", X86::COND_A)
61136 .Case("{@ccnc}", X86::COND_AE)
61137 .Case("{@ccne}", X86::COND_NE)
61138 .Case("{@ccnz}", X86::COND_NE)
61139 .Case("{@ccng}", X86::COND_LE)
61140 .Case("{@ccnge}", X86::COND_L)
61141 .Case("{@ccnl}", X86::COND_GE)
61142 .Case("{@ccnle}", X86::COND_G)
61143 .Case("{@ccno}", X86::COND_NO)
61144 .Case("{@ccnp}", X86::COND_NP)
61145 .Case("{@ccns}", X86::COND_NS)
61146 .Case("{@cco}", X86::COND_O)
61147 .Case("{@ccp}", X86::COND_P)
61148 .Case("{@ccs}", X86::COND_S)
61149 .Default(X86::COND_INVALID);
61150 return Cond;
61151}
61152
61153/// Given a constraint letter, return the type of constraint for this target.
61154X86TargetLowering::ConstraintType
61155X86TargetLowering::getConstraintType(StringRef Constraint) const {
61156 if (Constraint.size() == 1) {
61157 switch (Constraint[0]) {
61158 case 'R':
61159 case 'q':
61160 case 'Q':
61161 case 'f':
61162 case 't':
61163 case 'u':
61164 case 'y':
61165 case 'x':
61166 case 'v':
61167 case 'l':
61168 case 'k': // AVX512 masking registers.
61169 return C_RegisterClass;
61170 case 'a':
61171 case 'b':
61172 case 'c':
61173 case 'd':
61174 case 'S':
61175 case 'D':
61176 case 'A':
61177 return C_Register;
61178 case 'I':
61179 case 'J':
61180 case 'K':
61181 case 'N':
61182 case 'G':
61183 case 'L':
61184 case 'M':
61185 return C_Immediate;
61186 case 'C':
61187 case 'e':
61188 case 'Z':
61189 return C_Other;
61190 default:
61191 break;
61192 }
61193 }
61194 else if (Constraint.size() == 2) {
61195 switch (Constraint[0]) {
61196 default:
61197 break;
61198 case 'W':
61199 if (Constraint[1] != 's')
61200 break;
61201 return C_Other;
61202 case 'Y':
61203 switch (Constraint[1]) {
61204 default:
61205 break;
61206 case 'z':
61207 return C_Register;
61208 case 'i':
61209 case 'm':
61210 case 'k':
61211 case 't':
61212 case '2':
61213 return C_RegisterClass;
61214 }
61215 break;
61216 case 'j':
61217 switch (Constraint[1]) {
61218 default:
61219 break;
61220 case 'r':
61221 case 'R':
61222 return C_RegisterClass;
61223 }
61224 }
61225 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61226 return C_Other;
61227 return TargetLowering::getConstraintType(Constraint);
61228}
61229
61230/// Examine constraint type and operand type and determine a weight value.
61231/// This object must already have been set up with the operand type
61232/// and the current alternative constraint selected.
61233TargetLowering::ConstraintWeight
61234X86TargetLowering::getSingleConstraintMatchWeight(
61235 AsmOperandInfo &Info, const char *Constraint) const {
61236 ConstraintWeight Wt = CW_Invalid;
61237 Value *CallOperandVal = Info.CallOperandVal;
61238 // If we don't have a value, we can't do a match,
61239 // but allow it at the lowest weight.
61240 if (!CallOperandVal)
61241 return CW_Default;
61242 Type *Ty = CallOperandVal->getType();
61243 // Look at the constraint type.
61244 switch (*Constraint) {
61245 default:
61246 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
61247 [[fallthrough]];
61248 case 'R':
61249 case 'q':
61250 case 'Q':
61251 case 'a':
61252 case 'b':
61253 case 'c':
61254 case 'd':
61255 case 'S':
61256 case 'D':
61257 case 'A':
61258 if (CallOperandVal->getType()->isIntegerTy())
61259 Wt = CW_SpecificReg;
61260 break;
61261 case 'f':
61262 case 't':
61263 case 'u':
61264 if (Ty->isFloatingPointTy())
61265 Wt = CW_SpecificReg;
61266 break;
61267 case 'y':
61268 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61269 Wt = CW_SpecificReg;
61270 break;
61271 case 'Y':
61272 if (StringRef(Constraint).size() != 2)
61273 break;
61274 switch (Constraint[1]) {
61275 default:
61276 return CW_Invalid;
61277 // XMM0
61278 case 'z':
61279 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61280 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61281 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61282 return CW_SpecificReg;
61283 return CW_Invalid;
61284 // Conditional OpMask regs (AVX512)
61285 case 'k':
61286 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61287 return CW_Register;
61288 return CW_Invalid;
61289 // Any MMX reg
61290 case 'm':
61291 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61292 return CW_SpecificReg;
61293 return CW_Invalid;
61294 // Any SSE reg when ISA >= SSE2, same as 'x'
61295 case 'i':
61296 case 't':
61297 case '2':
61298 if (!Subtarget.hasSSE2())
61299 return CW_Invalid;
61300 break;
61301 }
61302 break;
61303 case 'j':
61304 if (StringRef(Constraint).size() != 2)
61305 break;
61306 switch (Constraint[1]) {
61307 default:
61308 return CW_Invalid;
61309 case 'r':
61310 case 'R':
61311 if (CallOperandVal->getType()->isIntegerTy())
61312 Wt = CW_SpecificReg;
61313 break;
61314 }
61315 break;
61316 case 'v':
61317 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61318 Wt = CW_Register;
61319 [[fallthrough]];
61320 case 'x':
61321 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61322 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61323 Wt = CW_Register;
61324 break;
61325 case 'k':
61326 // Enable conditional vector operations using %k<#> registers.
61327 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61328 Wt = CW_Register;
61329 break;
61330 case 'I':
61331 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61332 if (C->getZExtValue() <= 31)
61333 Wt = CW_Constant;
61334 break;
61335 case 'J':
61336 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61337 if (C->getZExtValue() <= 63)
61338 Wt = CW_Constant;
61339 break;
61340 case 'K':
61341 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61342 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61343 Wt = CW_Constant;
61344 break;
61345 case 'L':
61346 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61347 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61348 Wt = CW_Constant;
61349 break;
61350 case 'M':
61351 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61352 if (C->getZExtValue() <= 3)
61353 Wt = CW_Constant;
61354 break;
61355 case 'N':
61356 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61357 if (C->getZExtValue() <= 0xff)
61358 Wt = CW_Constant;
61359 break;
61360 case 'G':
61361 case 'C':
61362 if (isa<ConstantFP>(CallOperandVal))
61363 Wt = CW_Constant;
61364 break;
61365 case 'e':
61366 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61367 if ((C->getSExtValue() >= -0x80000000LL) &&
61368 (C->getSExtValue() <= 0x7fffffffLL))
61369 Wt = CW_Constant;
61370 break;
61371 case 'Z':
61372 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61373 if (C->getZExtValue() <= 0xffffffff)
61374 Wt = CW_Constant;
61375 break;
61376 }
61377 return Wt;
61378}
61379
61380/// Try to replace an X constraint, which matches anything, with another that
61381/// has more specific requirements based on the type of the corresponding
61382/// operand.
61383const char *X86TargetLowering::
61384LowerXConstraint(EVT ConstraintVT) const {
61385 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61386 // 'f' like normal targets.
61387 if (ConstraintVT.isFloatingPoint()) {
61388 if (Subtarget.hasSSE1())
61389 return "x";
61390 }
61391
61392 return TargetLowering::LowerXConstraint(ConstraintVT);
61393}
61394
61395// Lower @cc targets via setcc.
61396SDValue X86TargetLowering::LowerAsmOutputForConstraint(
61397 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61398 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61399 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61400 if (Cond == X86::COND_INVALID)
61401 return SDValue();
61402 // Check that return type is valid.
61403 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61404 OpInfo.ConstraintVT.getSizeInBits() < 8)
61405 report_fatal_error("Glue output operand is of invalid type");
61406
61407 // Get EFLAGS register. Only update chain when copyfrom is glued.
61408 if (Glue.getNode()) {
61409 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61410 Chain = Glue.getValue(1);
61411 } else
61412 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61413 // Extract CC code.
61414 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61415 // Extend to 32-bits
61416 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61417
61418 return Result;
61419}
61420
61421/// Lower the specified operand into the Ops vector.
61422/// If it is invalid, don't add anything to Ops.
61423void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
61424 StringRef Constraint,
61425 std::vector<SDValue> &Ops,
61426 SelectionDAG &DAG) const {
61427 SDValue Result;
61428 char ConstraintLetter = Constraint[0];
61429 switch (ConstraintLetter) {
61430 default: break;
61431 case 'I':
61432 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61433 if (C->getZExtValue() <= 31) {
61434 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61435 Op.getValueType());
61436 break;
61437 }
61438 }
61439 return;
61440 case 'J':
61441 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61442 if (C->getZExtValue() <= 63) {
61443 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61444 Op.getValueType());
61445 break;
61446 }
61447 }
61448 return;
61449 case 'K':
61450 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61451 if (isInt<8>(C->getSExtValue())) {
61452 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61453 Op.getValueType());
61454 break;
61455 }
61456 }
61457 return;
61458 case 'L':
61459 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61460 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61461 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61462 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61463 Op.getValueType());
61464 break;
61465 }
61466 }
61467 return;
61468 case 'M':
61469 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61470 if (C->getZExtValue() <= 3) {
61471 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61472 Op.getValueType());
61473 break;
61474 }
61475 }
61476 return;
61477 case 'N':
61478 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61479 if (C->getZExtValue() <= 255) {
61480 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61481 Op.getValueType());
61482 break;
61483 }
61484 }
61485 return;
61486 case 'O':
61487 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61488 if (C->getZExtValue() <= 127) {
61489 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61490 Op.getValueType());
61491 break;
61492 }
61493 }
61494 return;
61495 case 'e': {
61496 // 32-bit signed value
61497 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61498 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
61499 C->getSExtValue())) {
61500 // Widen to 64 bits here to get it sign extended.
61501 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61502 break;
61503 }
61504 // FIXME gcc accepts some relocatable values here too, but only in certain
61505 // memory models; it's complicated.
61506 }
61507 return;
61508 }
61509 case 'W': {
61510 assert(Constraint[1] == 's');
61511 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61512 // offset.
61513 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61514 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61515 BA->getValueType(0)));
61516 } else {
61517 int64_t Offset = 0;
61518 if (Op->getOpcode() == ISD::ADD &&
61519 isa<ConstantSDNode>(Op->getOperand(1))) {
61520 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61521 Op = Op->getOperand(0);
61522 }
61523 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61524 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61525 GA->getValueType(0), Offset));
61526 }
61527 return;
61528 }
61529 case 'Z': {
61530 // 32-bit unsigned value
61531 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61532      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
61533                                           C->getZExtValue())) {
61534 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61535 Op.getValueType());
61536 break;
61537 }
61538 }
61539 // FIXME gcc accepts some relocatable values here too, but only in certain
61540 // memory models; it's complicated.
61541 return;
61542 }
61543 case 'i': {
61544 // Literal immediates are always ok.
61545 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61546 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61547 BooleanContent BCont = getBooleanContents(MVT::i64);
61548 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61549                                    : ISD::SIGN_EXTEND;
61550      int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61551 : CST->getSExtValue();
61552 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61553 break;
61554 }
61555
61556 // In any sort of PIC mode addresses need to be computed at runtime by
61557 // adding in a register or some sort of table lookup. These can't
61558 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61559 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61560 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
61561 return;
61562
61563 // If we are in non-pic codegen mode, we allow the address of a global (with
61564 // an optional displacement) to be used with 'i'.
61565 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61566 // If we require an extra load to get this address, as in PIC mode, we
61567 // can't accept it.
61568      if (isGlobalStubReference(
61569              Subtarget.classifyGlobalReference(GA->getGlobal())))
61570 return;
61571 break;
61572 }
61573 }
61574
61575 if (Result.getNode()) {
61576 Ops.push_back(Result);
61577 return;
61578 }
61579 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61580}
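// Illustrative usage sketch (not part of this file): the immediate constraints
// validated above are the GNU inline-asm letters, e.g. "I" (0..31, shift
// counts), "N" (0..255, I/O port numbers) and "e"/"Z" (sign-/zero-extendable
// 32-bit values). Hypothetical example using "N":
static inline unsigned char inb_port_0x60(void) {
  unsigned char value;
  __asm__ volatile("inb %1, %0" : "=a"(value) : "N"(0x60));
  return value; // needs I/O privilege to actually execute
}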
61581
61582/// Check if \p RC is a general purpose register class.
61583/// I.e., GR* or one of their variant.
61584static bool isGRClass(const TargetRegisterClass &RC) {
61585 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61586 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61587 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61588 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61589 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61590}
61591
61592/// Check if \p RC is a vector register class.
61593/// I.e., FR* / VR* or one of their variant.
61594static bool isFRClass(const TargetRegisterClass &RC) {
61595 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61596 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61597 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61598 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61599 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61600 RC.hasSuperClassEq(&X86::VR512RegClass);
61601}
61602
61603/// Check if \p RC is a mask register class.
61604/// I.e., VK* or one of their variant.
61605static bool isVKClass(const TargetRegisterClass &RC) {
61606 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61607 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61608 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61609 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61610 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61611 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61612 RC.hasSuperClassEq(&X86::VK64RegClass);
61613}
61614
61615static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61616 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61617}
61618
61619std::pair<unsigned, const TargetRegisterClass *>
61620X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
61621                                                StringRef Constraint,
61622 MVT VT) const {
61623 // First, see if this is a constraint that directly corresponds to an LLVM
61624 // register class.
61625 if (Constraint.size() == 1) {
61626 // GCC Constraint Letters
61627 switch (Constraint[0]) {
61628 default: break;
61629 // 'A' means [ER]AX + [ER]DX.
61630 case 'A':
61631 if (Subtarget.is64Bit())
61632 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61633 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61634 "Expecting 64, 32 or 16 bit subtarget");
61635 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61636
61637 // TODO: Slight differences here in allocation order and leaving
61638 // RIP in the class. Do they matter any more here than they do
61639 // in the normal allocation?
61640 case 'k':
61641 if (Subtarget.hasAVX512()) {
61642 if (VT == MVT::v1i1 || VT == MVT::i1)
61643 return std::make_pair(0U, &X86::VK1RegClass);
61644 if (VT == MVT::v8i1 || VT == MVT::i8)
61645 return std::make_pair(0U, &X86::VK8RegClass);
61646 if (VT == MVT::v16i1 || VT == MVT::i16)
61647 return std::make_pair(0U, &X86::VK16RegClass);
61648 }
61649 if (Subtarget.hasBWI()) {
61650 if (VT == MVT::v32i1 || VT == MVT::i32)
61651 return std::make_pair(0U, &X86::VK32RegClass);
61652 if (VT == MVT::v64i1 || VT == MVT::i64)
61653 return std::make_pair(0U, &X86::VK64RegClass);
61654 }
61655 break;
61656 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61657 if (Subtarget.is64Bit()) {
61658 if (VT == MVT::i8 || VT == MVT::i1)
61659 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61660 ? &X86::GR8RegClass
61661 : &X86::GR8_NOREX2RegClass);
61662 if (VT == MVT::i16)
61663 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61664 ? &X86::GR16RegClass
61665 : &X86::GR16_NOREX2RegClass);
61666 if (VT == MVT::i32 || VT == MVT::f32)
61667 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61668 ? &X86::GR32RegClass
61669 : &X86::GR32_NOREX2RegClass);
61670 if (VT != MVT::f80 && !VT.isVector())
61671 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61672 ? &X86::GR64RegClass
61673 : &X86::GR64_NOREX2RegClass);
61674 break;
61675 }
61676 [[fallthrough]];
61677 // 32-bit fallthrough
61678 case 'Q': // Q_REGS
61679 if (VT == MVT::i8 || VT == MVT::i1)
61680 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61681 if (VT == MVT::i16)
61682 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61683 if (VT == MVT::i32 || VT == MVT::f32 ||
61684 (!VT.isVector() && !Subtarget.is64Bit()))
61685 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61686 if (VT != MVT::f80 && !VT.isVector())
61687 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61688 break;
61689 case 'r': // GENERAL_REGS
61690 case 'l': // INDEX_REGS
61691 if (VT == MVT::i8 || VT == MVT::i1)
61692 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61693 ? &X86::GR8RegClass
61694 : &X86::GR8_NOREX2RegClass);
61695 if (VT == MVT::i16)
61696 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61697 ? &X86::GR16RegClass
61698 : &X86::GR16_NOREX2RegClass);
61699 if (VT == MVT::i32 || VT == MVT::f32 ||
61700 (!VT.isVector() && !Subtarget.is64Bit()))
61701 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61702 ? &X86::GR32RegClass
61703 : &X86::GR32_NOREX2RegClass);
61704 if (VT != MVT::f80 && !VT.isVector())
61705 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61706 ? &X86::GR64RegClass
61707 : &X86::GR64_NOREX2RegClass);
61708 break;
61709 case 'R': // LEGACY_REGS
61710 if (VT == MVT::i8 || VT == MVT::i1)
61711 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61712 if (VT == MVT::i16)
61713 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61714 if (VT == MVT::i32 || VT == MVT::f32 ||
61715 (!VT.isVector() && !Subtarget.is64Bit()))
61716 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61717 if (VT != MVT::f80 && !VT.isVector())
61718 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61719 break;
61720 case 'f': // FP Stack registers.
61721 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61722 // value to the correct fpstack register class.
61723 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61724 return std::make_pair(0U, &X86::RFP32RegClass);
61725 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61726 return std::make_pair(0U, &X86::RFP64RegClass);
61727 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61728 return std::make_pair(0U, &X86::RFP80RegClass);
61729 break;
61730 case 'y': // MMX_REGS if MMX allowed.
61731 if (!Subtarget.hasMMX()) break;
61732 return std::make_pair(0U, &X86::VR64RegClass);
61733 case 'v':
61734 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61735 if (!Subtarget.hasSSE1()) break;
61736 bool VConstraint = (Constraint[0] == 'v');
61737
61738 switch (VT.SimpleTy) {
61739 default: break;
61740 // Scalar SSE types.
61741 case MVT::f16:
61742 if (VConstraint && Subtarget.hasFP16())
61743 return std::make_pair(0U, &X86::FR16XRegClass);
61744 break;
61745 case MVT::f32:
61746 case MVT::i32:
61747 if (VConstraint && Subtarget.hasVLX())
61748 return std::make_pair(0U, &X86::FR32XRegClass);
61749 return std::make_pair(0U, &X86::FR32RegClass);
61750 case MVT::f64:
61751 case MVT::i64:
61752 if (VConstraint && Subtarget.hasVLX())
61753 return std::make_pair(0U, &X86::FR64XRegClass);
61754 return std::make_pair(0U, &X86::FR64RegClass);
61755 case MVT::i128:
61756 if (Subtarget.is64Bit()) {
61757 if (VConstraint && Subtarget.hasVLX())
61758 return std::make_pair(0U, &X86::VR128XRegClass);
61759 return std::make_pair(0U, &X86::VR128RegClass);
61760 }
61761 break;
61762 // Vector types and fp128.
61763 case MVT::v8f16:
61764 if (!Subtarget.hasFP16())
61765 break;
61766 if (VConstraint)
61767 return std::make_pair(0U, &X86::VR128XRegClass);
61768 return std::make_pair(0U, &X86::VR128RegClass);
61769 case MVT::v8bf16:
61770 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61771 break;
61772 if (VConstraint)
61773 return std::make_pair(0U, &X86::VR128XRegClass);
61774 return std::make_pair(0U, &X86::VR128RegClass);
61775 case MVT::f128:
61776 if (!Subtarget.is64Bit())
61777 break;
61778 [[fallthrough]];
61779 case MVT::v16i8:
61780 case MVT::v8i16:
61781 case MVT::v4i32:
61782 case MVT::v2i64:
61783 case MVT::v4f32:
61784 case MVT::v2f64:
61785 if (VConstraint && Subtarget.hasVLX())
61786 return std::make_pair(0U, &X86::VR128XRegClass);
61787 return std::make_pair(0U, &X86::VR128RegClass);
61788 // AVX types.
61789 case MVT::v16f16:
61790 if (!Subtarget.hasFP16())
61791 break;
61792 if (VConstraint)
61793 return std::make_pair(0U, &X86::VR256XRegClass);
61794 return std::make_pair(0U, &X86::VR256RegClass);
61795 case MVT::v16bf16:
61796 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61797 break;
61798 if (VConstraint)
61799 return std::make_pair(0U, &X86::VR256XRegClass);
61800 return std::make_pair(0U, &X86::VR256RegClass);
61801 case MVT::v32i8:
61802 case MVT::v16i16:
61803 case MVT::v8i32:
61804 case MVT::v4i64:
61805 case MVT::v8f32:
61806 case MVT::v4f64:
61807 if (VConstraint && Subtarget.hasVLX())
61808 return std::make_pair(0U, &X86::VR256XRegClass);
61809 if (Subtarget.hasAVX())
61810 return std::make_pair(0U, &X86::VR256RegClass);
61811 break;
61812 case MVT::v32f16:
61813 if (!Subtarget.hasFP16())
61814 break;
61815 if (VConstraint)
61816 return std::make_pair(0U, &X86::VR512RegClass);
61817 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61818 case MVT::v32bf16:
61819 if (!Subtarget.hasBF16())
61820 break;
61821 if (VConstraint)
61822 return std::make_pair(0U, &X86::VR512RegClass);
61823 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61824 case MVT::v64i8:
61825 case MVT::v32i16:
61826 case MVT::v8f64:
61827 case MVT::v16f32:
61828 case MVT::v16i32:
61829 case MVT::v8i64:
61830 if (!Subtarget.hasAVX512()) break;
61831 if (VConstraint)
61832 return std::make_pair(0U, &X86::VR512RegClass);
61833 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61834 }
61835 break;
61836 }
61837 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61838 switch (Constraint[1]) {
61839 default:
61840 break;
61841 case 'i':
61842 case 't':
61843 case '2':
61844 return getRegForInlineAsmConstraint(TRI, "x", VT);
61845 case 'm':
61846 if (!Subtarget.hasMMX()) break;
61847 return std::make_pair(0U, &X86::VR64RegClass);
61848 case 'z':
61849 if (!Subtarget.hasSSE1()) break;
61850 switch (VT.SimpleTy) {
61851 default: break;
61852 // Scalar SSE types.
61853 case MVT::f16:
61854 if (!Subtarget.hasFP16())
61855 break;
61856 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61857 case MVT::f32:
61858 case MVT::i32:
61859 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61860 case MVT::f64:
61861 case MVT::i64:
61862 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61863 case MVT::v8f16:
61864 if (!Subtarget.hasFP16())
61865 break;
61866 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61867 case MVT::v8bf16:
61868 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61869 break;
61870 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61871 case MVT::f128:
61872 case MVT::v16i8:
61873 case MVT::v8i16:
61874 case MVT::v4i32:
61875 case MVT::v2i64:
61876 case MVT::v4f32:
61877 case MVT::v2f64:
61878 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61879 // AVX types.
61880 case MVT::v16f16:
61881 if (!Subtarget.hasFP16())
61882 break;
61883 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61884 case MVT::v16bf16:
61885 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61886 break;
61887 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61888 case MVT::v32i8:
61889 case MVT::v16i16:
61890 case MVT::v8i32:
61891 case MVT::v4i64:
61892 case MVT::v8f32:
61893 case MVT::v4f64:
61894 if (Subtarget.hasAVX())
61895 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61896 break;
61897 case MVT::v32f16:
61898 if (!Subtarget.hasFP16())
61899 break;
61900 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61901 case MVT::v32bf16:
61902 if (!Subtarget.hasBF16())
61903 break;
61904 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61905 case MVT::v64i8:
61906 case MVT::v32i16:
61907 case MVT::v8f64:
61908 case MVT::v16f32:
61909 case MVT::v16i32:
61910 case MVT::v8i64:
61911 if (Subtarget.hasAVX512())
61912 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61913 break;
61914 }
61915 break;
61916 case 'k':
61917 // This register class doesn't allocate k0 for masked vector operation.
61918 if (Subtarget.hasAVX512()) {
61919 if (VT == MVT::v1i1 || VT == MVT::i1)
61920 return std::make_pair(0U, &X86::VK1WMRegClass);
61921 if (VT == MVT::v8i1 || VT == MVT::i8)
61922 return std::make_pair(0U, &X86::VK8WMRegClass);
61923 if (VT == MVT::v16i1 || VT == MVT::i16)
61924 return std::make_pair(0U, &X86::VK16WMRegClass);
61925 }
61926 if (Subtarget.hasBWI()) {
61927 if (VT == MVT::v32i1 || VT == MVT::i32)
61928 return std::make_pair(0U, &X86::VK32WMRegClass);
61929 if (VT == MVT::v64i1 || VT == MVT::i64)
61930 return std::make_pair(0U, &X86::VK64WMRegClass);
61931 }
61932 break;
61933 }
61934 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61935 switch (Constraint[1]) {
61936 default:
61937 break;
61938 case 'r':
61939 if (VT == MVT::i8 || VT == MVT::i1)
61940 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61941 if (VT == MVT::i16)
61942 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61943 if (VT == MVT::i32 || VT == MVT::f32)
61944 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61945 if (VT != MVT::f80 && !VT.isVector())
61946 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61947 break;
61948 case 'R':
61949 if (VT == MVT::i8 || VT == MVT::i1)
61950 return std::make_pair(0U, &X86::GR8RegClass);
61951 if (VT == MVT::i16)
61952 return std::make_pair(0U, &X86::GR16RegClass);
61953 if (VT == MVT::i32 || VT == MVT::f32)
61954 return std::make_pair(0U, &X86::GR32RegClass);
61955 if (VT != MVT::f80 && !VT.isVector())
61956 return std::make_pair(0U, &X86::GR64RegClass);
61957 break;
61958 }
61959 }
61960
61961 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61962 return std::make_pair(0U, &X86::GR32RegClass);
61963
61964 // Use the default implementation in TargetLowering to convert the register
61965 // constraint into a member of a register class.
61966 std::pair<Register, const TargetRegisterClass*> Res;
61967  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
61968
61969 // Not found as a standard register?
61970 if (!Res.second) {
61971 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
61972 // to/from f80.
61973 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
61974 // Map st(0) -> st(7) -> ST0
61975 if (Constraint.size() == 7 && Constraint[0] == '{' &&
61976 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
61977 Constraint[3] == '(' &&
61978 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
61979 Constraint[5] == ')' && Constraint[6] == '}') {
61980 // st(7) is not allocatable and thus not a member of RFP80. Return
61981 // singleton class in cases where we have a reference to it.
61982 if (Constraint[4] == '7')
61983 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
61984 return std::make_pair(X86::FP0 + Constraint[4] - '0',
61985 &X86::RFP80RegClass);
61986 }
61987
61988 // GCC allows "st(0)" to be called just plain "st".
61989 if (StringRef("{st}").equals_insensitive(Constraint))
61990 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
61991 }
61992
61993 // flags -> EFLAGS
61994 if (StringRef("{flags}").equals_insensitive(Constraint))
61995 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
61996
61997 // dirflag -> DF
61998 // Only allow for clobber.
61999 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62000 VT == MVT::Other)
62001 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62002
62003 // fpsr -> FPSW
62004 // Only allow for clobber.
62005 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62006 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62007
62008 return Res;
62009 }
62010
62011 // Make sure it isn't a register that requires 64-bit mode.
62012 if (!Subtarget.is64Bit() &&
62013 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62014 TRI->getEncodingValue(Res.first) >= 8) {
62015 // Register requires REX prefix, but we're in 32-bit mode.
62016 return std::make_pair(0, nullptr);
62017 }
62018
62019 // Make sure it isn't a register that requires AVX512.
62020 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62021 TRI->getEncodingValue(Res.first) & 0x10) {
62022 // Register requires EVEX prefix.
62023 return std::make_pair(0, nullptr);
62024 }
62025
62026 // Otherwise, check to see if this is a register class of the wrong value
62027  // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
62028 // turn into {ax},{dx}.
62029 // MVT::Other is used to specify clobber names.
62030 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62031 return Res; // Correct type already, nothing to do.
62032
62033  // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
62034  // return "eax". This should even work for things like getting 64-bit integer
62035  // registers when given an f64 type.
62036 const TargetRegisterClass *Class = Res.second;
62037 // The generic code will match the first register class that contains the
62038 // given register. Thus, based on the ordering of the tablegened file,
62039 // the "plain" GR classes might not come first.
62040 // Therefore, use a helper method.
62041 if (isGRClass(*Class)) {
62042 unsigned Size = VT.getSizeInBits();
62043 if (Size == 1) Size = 8;
62044 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62045 return std::make_pair(0, nullptr);
62046 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62047 if (DestReg.isValid()) {
62048 bool is64Bit = Subtarget.is64Bit();
62049 const TargetRegisterClass *RC =
62050 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62051 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62052 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62053 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62054 if (Size == 64 && !is64Bit) {
62055 // Model GCC's behavior here and select a fixed pair of 32-bit
62056 // registers.
62057 switch (DestReg) {
62058 case X86::RAX:
62059 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62060 case X86::RDX:
62061 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62062 case X86::RCX:
62063 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62064 case X86::RBX:
62065 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62066 case X86::RSI:
62067 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62068 case X86::RDI:
62069 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62070 case X86::RBP:
62071 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62072 default:
62073 return std::make_pair(0, nullptr);
62074 }
62075 }
62076 if (RC && RC->contains(DestReg))
62077 return std::make_pair(DestReg, RC);
62078 return Res;
62079 }
62080 // No register found/type mismatch.
62081 return std::make_pair(0, nullptr);
62082 } else if (isFRClass(*Class)) {
62083 // Handle references to XMM physical registers that got mapped into the
62084 // wrong class. This can happen with constraints like {xmm0} where the
62085 // target independent register mapper will just pick the first match it can
62086 // find, ignoring the required type.
62087
62088 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62089 if (VT == MVT::f16)
62090 Res.second = &X86::FR16XRegClass;
62091 else if (VT == MVT::f32 || VT == MVT::i32)
62092 Res.second = &X86::FR32XRegClass;
62093 else if (VT == MVT::f64 || VT == MVT::i64)
62094 Res.second = &X86::FR64XRegClass;
62095 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62096 Res.second = &X86::VR128XRegClass;
62097 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62098 Res.second = &X86::VR256XRegClass;
62099 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62100 Res.second = &X86::VR512RegClass;
62101 else {
62102 // Type mismatch and not a clobber: Return an error;
62103 Res.first = 0;
62104 Res.second = nullptr;
62105 }
62106 } else if (isVKClass(*Class)) {
62107 if (VT == MVT::v1i1 || VT == MVT::i1)
62108 Res.second = &X86::VK1RegClass;
62109 else if (VT == MVT::v8i1 || VT == MVT::i8)
62110 Res.second = &X86::VK8RegClass;
62111 else if (VT == MVT::v16i1 || VT == MVT::i16)
62112 Res.second = &X86::VK16RegClass;
62113 else if (VT == MVT::v32i1 || VT == MVT::i32)
62114 Res.second = &X86::VK32RegClass;
62115 else if (VT == MVT::v64i1 || VT == MVT::i64)
62116 Res.second = &X86::VK64RegClass;
62117 else {
62118 // Type mismatch and not a clobber: Return an error;
62119 Res.first = 0;
62120 Res.second = nullptr;
62121 }
62122 }
62123
62124 return Res;
62125}
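// Illustrative usage sketch (not part of this file): the register constraints
// resolved above map GNU inline-asm letters to X86 register classes, e.g.
// "x"/"v" pick SSE/AVX registers, "Yz" forces %xmm0, "q" requires a
// byte-addressable GPR and "k" an AVX-512 mask register. Hypothetical example:
#include <xmmintrin.h> // assumed available; provides __m128
static inline __m128 addps_in_xmm0(__m128 a, __m128 b) {
  __asm__("addps %1, %0" : "+Yz"(a) : "x"(b)); // result computed in %xmm0
  return a;
}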
62126
62127bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62128  // Integer division on x86 is expensive. However, when aggressively optimizing
62129 // for code size, we prefer to use a div instruction, as it is usually smaller
62130 // than the alternative sequence.
62131 // The exception to this is vector division. Since x86 doesn't have vector
62132 // integer division, leaving the division as-is is a loss even in terms of
62133 // size, because it will have to be scalarized, while the alternative code
62134 // sequence can be performed in vector form.
62135 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62136 return OptSize && !VT.isVector();
62137}
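// Illustrative note (not part of this file): with the MinSize attribute (e.g.
// a function built with clang -Oz), the hook above reports scalar division as
// cheap, so a divide by a constant is typically kept as a single DIV rather
// than expanded into the usual multiply-and-shift sequence. Hypothetical
// example:
unsigned div_by_ten(unsigned x) {
  return x / 10; // -O2: magic-number multiply + shifts; -Oz: a single divl
}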
62138
62139void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62140 if (!Subtarget.is64Bit())
62141 return;
62142
62143 // Update IsSplitCSR in X86MachineFunctionInfo.
62144  X86MachineFunctionInfo *AFI =
62145      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62146 AFI->setIsSplitCSR(true);
62147}
62148
62149void X86TargetLowering::insertCopiesSplitCSR(
62150 MachineBasicBlock *Entry,
62151 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62152 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62153 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62154 if (!IStart)
62155 return;
62156
62157 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62158 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62159 MachineBasicBlock::iterator MBBI = Entry->begin();
62160 for (const MCPhysReg *I = IStart; *I; ++I) {
62161 const TargetRegisterClass *RC = nullptr;
62162 if (X86::GR64RegClass.contains(*I))
62163 RC = &X86::GR64RegClass;
62164 else
62165 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62166
62167 Register NewVR = MRI->createVirtualRegister(RC);
62168 // Create copy from CSR to a virtual register.
62169 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62170 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62171 // nounwind. If we want to generalize this later, we may need to emit
62172 // CFI pseudo-instructions.
62173 assert(
62174 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62175 "Function should be nounwind in insertCopiesSplitCSR!");
62176 Entry->addLiveIn(*I);
62177 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62178 .addReg(*I);
62179
62180 // Insert the copy-back instructions right before the terminator.
62181 for (auto *Exit : Exits)
62182 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62183 TII->get(TargetOpcode::COPY), *I)
62184 .addReg(NewVR);
62185 }
62186}
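// Illustrative note (not part of this file): the split-CSR copies above are
// used for functions with the CXX_FAST_TLS calling convention, such as the
// TLS initialization/wrapper routines a frontend may emit for a dynamically
// initialized thread_local. Hypothetical C++ sketch of such a variable:
struct Counter { Counter(); int value; };
thread_local Counter g_counter; // its access wrapper may use CXX_FAST_TLS on some targets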
62187
62188bool X86TargetLowering::supportSwiftError() const {
62189  return Subtarget.is64Bit();
62190}
62191
62192MachineInstr *
62193X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
62194                                 MachineBasicBlock::instr_iterator &MBBI,
62195                                 const TargetInstrInfo *TII) const {
62196 assert(MBBI->isCall() && MBBI->getCFIType() &&
62197 "Invalid call instruction for a KCFI check");
62198
62199 MachineFunction &MF = *MBB.getParent();
62200 // If the call target is a memory operand, unfold it and use R11 for the
62201 // call, so KCFI_CHECK won't have to recompute the address.
62202 switch (MBBI->getOpcode()) {
62203 case X86::CALL64m:
62204 case X86::CALL64m_NT:
62205 case X86::TAILJMPm64:
62206 case X86::TAILJMPm64_REX: {
62207    MachineBasicBlock::instr_iterator OrigCall = MBBI;
62208    SmallVector<MachineInstr *, 2> NewMIs;
62209    if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62210 /*UnfoldStore=*/false, NewMIs))
62211 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62212 for (auto *NewMI : NewMIs)
62213 MBBI = MBB.insert(OrigCall, NewMI);
62214 assert(MBBI->isCall() &&
62215 "Unexpected instruction after memory operand unfolding");
62216 if (OrigCall->shouldUpdateAdditionalCallInfo())
62217 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62218 MBBI->setCFIType(MF, OrigCall->getCFIType());
62219 OrigCall->eraseFromParent();
62220 break;
62221 }
62222 default:
62223 break;
62224 }
62225
62226 MachineOperand &Target = MBBI->getOperand(0);
62227 Register TargetReg;
62228 switch (MBBI->getOpcode()) {
62229 case X86::CALL64r:
62230 case X86::CALL64r_ImpCall:
62231 case X86::CALL64r_NT:
62232 case X86::TAILJMPr64:
62233 case X86::TAILJMPr64_REX:
62234 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62235 Target.setIsRenamable(false);
62236 TargetReg = Target.getReg();
62237 break;
62238 case X86::CALL64pcrel32:
62239 case X86::TAILJMPd64:
62240 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62241 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62242 // 64-bit indirect thunk calls.
62243 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62244 "Unexpected register for an indirect thunk call");
62245 TargetReg = X86::R11;
62246 break;
62247 default:
62248 llvm_unreachable("Unexpected CFI call opcode");
62249 break;
62250 }
62251
62252 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62253 .addReg(TargetReg)
62254 .addImm(MBBI->getCFIType())
62255 .getInstr();
62256}
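// Illustrative note (not part of this file): the KCFI_CHECK emitted above
// guards indirect calls when code is compiled with -fsanitize=kcfi; a type
// hash is verified before each call through a function pointer, and calls
// through memory operands are first unfolded so the target lands in R11.
// Hypothetical example of a guarded call site:
typedef int (*handler_fn)(int);
static int dispatch(handler_fn handler, int arg) {
  return handler(arg); // indirect call preceded by a KCFI type check
}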
62257
62258/// Returns true if stack probing through a function call is requested.
62259bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
62260  return !getStackProbeSymbolName(MF).empty();
62261}
62262
62263/// Returns true if stack probing through inline assembly is requested.
62264bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
62265
62266  // No inline stack probe for Windows; they have their own mechanism.
62267 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62268 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62269 return false;
62270
62271 // If the function specifically requests inline stack probes, emit them.
62272 if (MF.getFunction().hasFnAttribute("probe-stack"))
62273 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62274 "inline-asm";
62275
62276 return false;
62277}
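// Illustrative note (not part of this file): inline probing is requested via
// the "probe-stack"="inline-asm" function attribute; Clang's
// -fstack-clash-protection is one way such an attribute can end up on a
// function (an assumption here, not taken from this file). Hypothetical
// example of a frame that would be probed page by page when enabled:
void large_local_frame(void) {
  volatile char scratch[1 << 16]; // 64 KiB frame
  scratch[0] = 0;
}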
62278
62279/// Returns the name of the symbol used to emit stack probes or the empty
62280/// string if not applicable.
62281StringRef
62282X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
62283  // Inline stack probes disable the call-based stack probe.
62284 if (hasInlineStackProbe(MF))
62285 return "";
62286
62287 // If the function specifically requests stack probes, emit them.
62288 if (MF.getFunction().hasFnAttribute("probe-stack"))
62289 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62290
62291 // Generally, if we aren't on Windows, the platform ABI does not include
62292 // support for stack probes, so don't emit them.
62293 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62294 Subtarget.isTargetMachO() ||
62295 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62296 return "";
62297
62298 // We need a stack probe to conform to the Windows ABI. Choose the right
62299 // symbol.
62300 if (Subtarget.is64Bit())
62301 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62302 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62303}
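// Illustrative note (not part of this file): on Windows-style targets a frame
// larger than the probe size makes the prologue call the symbol chosen above
// (e.g. __chkstk for 64-bit MSVC targets, ___chkstk_ms for MinGW).
// Hypothetical example:
void windows_big_frame(void) {
  char buffer[8192]; // > 4096 bytes, so the prologue emits a chkstk call
  __asm__ volatile("" : : "r"(buffer) : "memory"); // keep the buffer alive
}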
62304
62305unsigned
62306X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
62307  // The default stack probe size is 4096 if the function has no stackprobesize
62308 // attribute.
62309 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62310 4096);
62311}
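// Illustrative note (not part of this file): the value consulted above comes
// from the "stack-probe-size" function attribute and defaults to 4096 bytes;
// frames larger than it get probed. Clang's -mstack-probe-size=N is one way
// to set the attribute (an assumption here, not taken from this file), e.g.
//   clang --target=x86_64-pc-windows-msvc -mstack-probe-size=8192 -c big.c
void below_probe_threshold(void) {
  char buffer[6000]; // under 8192 with the option above, so no probe call
  buffer[0] = 0;
}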
62312
62313Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
62314  if (ML && ML->isInnermost() &&
62315 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62316    return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
62317  return TargetLowering::getPrefLoopAlignment(ML);
62318}
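// Illustrative note (not part of this file): the innermost-loop alignment can
// be overridden through the ExperimentalPrefInnermostLoopAlignment cl::opt
// consulted above, e.g.
//   llc -x86-experimental-pref-innermost-loop-alignment=6 loops.ll
// which aligns innermost loop headers to 1 << 6 = 64 bytes.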
unsigned const MachineRegisterInfo * MRI
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
Definition: BitTracker.cpp:68
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:298
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Looks at all the uses of the given value Returns the Liveness deduced from the uses of this value Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses If the result is MaybeLiveUses might be modified but its content should be ignored(since it might not be complete). DeadArgumentEliminationPass
uint64_t Addr
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:546
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
#define R2(n)
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization spl...
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
static StringRef substr(StringRef Str, uint64_t Len)
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition: Debug.h:119
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
@ ShiftBit
@ UndefBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::P...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
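For illustration, a minimal scalar sketch of the pattern this fold targets (the function name below is assumed, not part of the combine): with BMI, the right-hand side maps to one BLSMSK plus one ANDN.
#include <cstdint>

// Y & (X ^ -X): since X ^ -X equals ~BLSMSK(X) (all bits strictly above the
// lowest set bit of X), the expression is (NOT BLSMSK(X)) AND Y, which BMI
// encodes as BLSMSK followed by ANDN.
uint32_t andXorNegPattern(uint32_t X, uint32_t Y) {
  return Y & (X ^ (0u - X));
}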
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with a chain that return their value in registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
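As a point of reference, the classic SSE2 way to get horizontal byte sums is PSADBW against zero (a hedged sketch; whether this particular path is used depends on VT and the subtarget, and the helper name is illustrative):
#include <emmintrin.h>

// PSADBW with an all-zero second operand sums each group of 8 bytes into the
// corresponding 64-bit lane, i.e. a horizontal byte sum per i64 element.
__m128i horizontalByteSums(__m128i V) {
  return _mm_sad_epu8(V, _mm_setzero_si128());
}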
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
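The same (NOT X) AND Y computation is exposed directly by the SSE intrinsics; a minimal standalone example (not the combine itself):
#include <xmmintrin.h>

// ANDNPS computes (NOT X) AND Y in one instruction, which is what the
// (and (xor X, -1), Y) pattern above folds into.
__m128 andnotFloats(__m128 X, __m128 Y) {
  return _mm_andnot_ps(X, Y);
}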
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a 256/512-bit VSETCC vector into two new 128/256-bit ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into two half-sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, such as v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if the opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
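A scalar sketch of the clamp-then-truncate shape this detector looks for, assuming an i32 to i8 truncation (the helper name is illustrative only):
#include <algorithm>
#include <cstdint>

// smax/smin clamp the value into the destination's signed range before the
// truncate, which is the signed-saturation shape that maps to PACKSS-style
// instructions.
int8_t signedSatTruncate(int32_t X) {
  int32_t Clamped = std::min(std::max(X, (int32_t)INT8_MIN), (int32_t)INT8_MAX);
  return (int8_t)Clamped;
}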
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
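For reference, a single PACKSS step at the intrinsics level (an assumed standalone example, not this helper): each application halves the element width with signed saturation, so i32 to i8 needs two rounds.
#include <emmintrin.h>

// One step: two v4i32 inputs packed into one v8i16 with signed saturation
// (PACKSSDW). A following PACKSSWB step would reach i8.
__m128i packStep(__m128i LoI32, __m128i HiI32) {
  return _mm_packs_epi32(LoI32, HiI32);
}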
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle where the low element of V2 is placed into a zero or undef vector at element Idx.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
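A worked scalar example of the first rewrite (valid under the nsw assumption that the narrow add cannot overflow; the function name is illustrative):
#include <cstdint>

// With nsw on the 32-bit add, sign-extending after the add equals adding the
// sign-extended constant after extending x, which is the form the combine
// produces (and which folds nicely into LEA addressing).
int64_t promotedForm(int32_t x) {
  return (int64_t)x + 16; // equivalent to (int64_t)(x + 16) when it cannot overflow
}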
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
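That matches the C library's round() tie-breaking; a tiny standalone check:
#include <cmath>
#include <cstdio>

int main() {
  // Ties round away from zero: 0.5 -> 1 and -0.5 -> -1, unlike
  // round-to-nearest-even, which would give 0 for both.
  std::printf("%g %g\n", std::round(0.5), std::round(-0.5)); // prints "1 -1"
  return 0;
}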
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
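A scalar illustration of why the fold is valid (an assumed example): xor-ing a boolean compare result with 1 is the same as testing the inverted condition.
// ((a == b) ? 1 : 0) ^ 1 always equals (a != b), so the setcc condition can
// be inverted and the xor dropped.
bool xorOneOfSetcc(int a, int b) {
  return (((a == b) ? 1 : 0) ^ 1) != 0; // same value as (a != b)
}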
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into two half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
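A small usage sketch (values chosen for illustration): undef (-1) entries in the tested mask act as wildcards, so a partially undef unpack-lo mask still compares equal to the explicit pattern.
// Inside this file's context (using namespace llvm).
SmallVector<int, 4> Mask = {0, -1, 1, 1};
bool MatchesUnpackLo = isShuffleEquivalent(Mask, {0, 0, 1, 1}); // true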
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
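The underlying boolean identity is De Morgan's law applied to the second operand. A tiny standalone check (plain C++, independent of the DAG combine itself) confirms it over all bit combinations:
#include <cassert>
int main() {
  // x & (y | ~z) == x & ~(~y & z) for every single-bit combination.
  for (int x = 0; x <= 1; ++x)
    for (int y = 0; y <= 1; ++y)
      for (int z = 0; z <= 1; ++z)
        assert((x & (y | !z)) == (x & !(!y & z)));
  return 0;
}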
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128 bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating-point cmov for the specific X86 condition code? The current x86 ISA includes the foll...
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
auto IsFreeTruncation
static const unsigned FramePtr
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition: APFloat.cpp:6082
void clearSign()
Definition: APFloat.h:1298
opStatus next(bool nextDown)
Definition: APFloat.h:1254
void changeSign()
Definition: APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1079
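A brief usage illustration of the APFloat helpers listed above (IEEE single precision; variable names are only for the example, and llvm/ADT/APFloat.h provides the class).
APFloat NegZero = APFloat::getZero(APFloat::IEEEsingle(), /*Negative=*/true);
APFloat PosZero = NegZero;
PosZero.changeSign();              // flips -0.0 to +0.0
PosZero.clearSign();               // forces the sign bit off (a no-op here)
bool StillZero = PosZero.isZero(); // true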
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:206
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
APInt abs() const
Get the absolute value.
Definition: APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition: APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1079
int32_t exactLogBase2() const
Definition: APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
unsigned countTrailingZeros() const
Definition: APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1435
unsigned logBase2() const
Definition: APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:471
bool isMask(unsigned numBits) const
Definition: APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
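A quick illustration of the bit-mask factories above, at an 8-bit width (resulting patterns shown in the comments; llvm/ADT/APInt.h provides the class).
APInt Low = APInt::getLowBitsSet(8, 3);    // 0b00000111
APInt High = APInt::getHighBitsSet(8, 2);  // 0b11000000
APInt Mid = APInt::getBitsSet(8, 2, 5);    // 0b00011100, bits [2,5)
bool Overlap = Low.intersects(Mid);        // true, bit 2 is shared
bool Disjoint = !Low.intersects(High);     // true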
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:973
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:206
iterator begin() const
Definition: ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:191
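A small, self-contained example of the ArrayRef views listed above (data and names are illustrative; llvm/ADT/ArrayRef.h provides the class).
int Data[] = {0, 1, 2, 3, 4, 5};
ArrayRef<int> A(Data);                    // non-owning view of all six elements
ArrayRef<int> Tail = A.slice(2, 3);       // {2, 3, 4}
ArrayRef<int> Trimmed = A.drop_back();    // {0, 1, 2, 3, 4}
bool NothingLeft = A.slice(6, 0).empty(); // true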
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:657
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
Definition: Instructions.h:843
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:721
@ Add
*p = old + v
Definition: Instructions.h:725
@ FAdd
*p = old + v
Definition: Instructions.h:746
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:777
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:739
@ Or
*p = old | v
Definition: Instructions.h:733
@ Sub
*p = old - v
Definition: Instructions.h:727
@ And
*p = old & v
Definition: Instructions.h:729
@ Xor
*p = old ^ v
Definition: Instructions.h:735
@ USubSat
*p = usub.sat(old, v); usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:781
@ FSub
*p = old - v
Definition: Instructions.h:749
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:769
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:737
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:743
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:757
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:741
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:753
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:773
@ Nand
*p = ~(old & v)
Definition: Instructions.h:731
Value * getPointerOperand()
Definition: Instructions.h:886
BinOp getOperation() const
Definition: Instructions.h:819
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:877
Value * getValOperand()
Definition: Instructions.h:890
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:863
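A hedged sketch of how a backend might inspect an atomicrmw instruction with the accessors above; the helper name is made up for the example, and llvm/IR/Instructions.h declares AtomicRMWInst.
bool isSeqCstAtomicAdd(const AtomicRMWInst &RMW) {
  // '*p = old + v' with the strongest ordering.
  return RMW.getOperation() == AtomicRMWInst::Add &&
         RMW.getOrdering() == AtomicOrdering::SequentiallyConsistent;
}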
This is an SDNode representing atomic operations.
LLVM_ABI bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:400
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool any() const
any - Returns true if any bit is set.
Definition: BitVector.h:170
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:899
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
Value * getCalledOperand() const
Definition: InstrTypes.h:1340
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1314
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition: Constants.cpp:3005
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1602
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1423
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:403
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:435
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:229
unsigned size() const
Definition: DenseMap.h:108
bool empty() const
Definition: DenseMap.h:107
iterator begin()
Definition: DenseMap.h:78
iterator end()
Definition: DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:214
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
Tagged union holding either a T or a Error.
Definition: Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:128
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1036
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:352
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
Definition: GlobalValue.h:569
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:424
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:273
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
This class is used to form a handle around another node that is persistent and is updated across invo...
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:123
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:171
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:82
Class to represent integer types.
Definition: DerivedTypes.h:42
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:74
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:180
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:245
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:253
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:258
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:247
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:317
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
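To make the MVT queries above concrete, here is a tiny illustration with a 256-bit integer vector (expected results in the comments; the header providing MVT has moved between LLVM versions, so the include is left out here).
MVT V = MVT::v8i32;
bool Is256 = V.is256BitVector();                // true: 8 x 32 bits
MVT Elt = V.getVectorElementType();             // MVT::i32
MVT Half = V.getHalfNumVectorElementsVT();      // MVT::v4i32
MVT AsFP = V.changeVectorElementType(MVT::f32); // MVT::v8f32
MVT Wider = V.getDoubleNumVectorElementsVT();   // MVT::v16i32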
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
unsigned succ_size() const
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:72
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition: Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:352
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
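As a hedged illustration of the SDValue/SDNode accessors above, a helper of the kind this file uses everywhere might look like the following; the name and the exact check are for the example only (LLVM provides its own, more complete predicates for this).
bool isBuildVectorOfZeroConstants(SDValue V) {
  if (V.getOpcode() != ISD::BUILD_VECTOR)
    return false;
  for (unsigned I = 0, E = V.getNumOperands(); I != E; ++I) {
    auto *C = dyn_cast<ConstantSDNode>(V.getOperand(I));
    if (!C || !C->getAPIntValue().isZero())
      return false; // a non-constant or non-zero lane disqualifies it
  }
  return true;
}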
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:372
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:995
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:758
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
Definition: SelectionDAG.h:941
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:500
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:813
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
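A short sketch of building a comparison with this helper, assuming DAG, DL, and two same-typed integer operands LHS/RHS are in scope; the boolean result type is queried from the target rather than hard-coded:
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), LHS.getValueType());
  SDValue IsEq = DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETEQ);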
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
Definition: SelectionDAG.h:963
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
Definition: SelectionDAG.h:956
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
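As the description says, this is just (XOR Val, -1); a minimal sketch, assuming DAG, DL, and an integer-typed SDValue Val are in scope:
  SDValue Not = DAG.getNOT(DL, Val, Val.getValueType());
  // Equivalent spelled out by hand:
  SDValue NotManual =
      DAG.getNode(ISD::XOR, DL, Val.getValueType(), Val,
                  DAG.getAllOnesConstant(DL, Val.getValueType()));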
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:459
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:506
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
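A brief sketch of the constant-building helpers (getConstant, getSignedConstant, getTargetConstant), assuming DAG and DL are in scope; the values and types are arbitrary examples:
  SDValue Zero     = DAG.getConstant(0, DL, MVT::i32);         // unsigned value
  SDValue MinusOne = DAG.getSignedConstant(-1, DL, MVT::i64);   // sign-extended value
  SDValue TgtImm   = DAG.getTargetConstant(8, DL, MVT::i8);     // operand for target-specific nodes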
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.

LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
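A sketch of the usual split-and-recombine pattern, assuming DAG, DL, and a vector-typed SDValue Op are in scope; the half types are obtained from GetSplitDestVTs:
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(Op.getValueType());
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitVector(Op, DL, LoVT, HiVT);
  // ... operate on the halves, then recombine:
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, Op.getValueType(), Lo, Hi);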
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:499
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:808
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:885
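A one-line sketch, assuming DAG, DL, and a scalar SDValue Scl are in scope; the vector type v4i32 is an arbitrary example:
  // Build <Scl, Scl, Scl, Scl> as a BUILD_VECTOR node.
  SDValue Splat = DAG.getSplatBuildVector(MVT::v4i32, DL, Scl);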
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
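A sketch of a typical known-bits query, assuming DAG and an integer-typed SDValue Op are in scope; the mask below is a hypothetical example asking whether the top 16 bits are known zero:
  KnownBits Known = DAG.computeKnownBits(Op);
  APInt HighMask = APInt::getHighBitsSet(Op.getScalarValueSizeInBits(), 16);
  bool TopBitsZero = DAG.MaskedValueIsZero(Op, HighMask);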
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:777
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
Definition: SelectionDAG.h:918
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:979
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
void reserve(size_type NewNumEntries)
Definition: SmallPtrSet.h:117
void insert_range(Range &&R)
Definition: SmallPtrSet.h:490
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
size_type size() const
Definition: SmallSet.h:171
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:705
iterator erase(const_iterator CI)
Definition: SmallVector.h:738
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:579
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:287
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:581
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:269
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:151
size_t size_type
Definition: StringRef.h:61
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:281
static constexpr size_t npos
Definition: StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:180
LLVM_ABI size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C or npos if not found.
Definition: StringRef.cpp:252
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
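A minimal usage sketch for this utility, mapping names to values; the register names and codes here are hypothetical:
  unsigned lookupHypotheticalReg(StringRef Name) {
    return StringSwitch<unsigned>(Name)
        .Case("eax", 0)
        .Case("ebx", 1)
        .Default(~0u);
  }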
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
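A sketch of how a TargetLowering subclass constructor typically calls this hook; the specific opcode/type/action choices below are illustrative, not the actual X86 configuration:
  // Inside a TargetLowering subclass constructor:
  setOperationAction(ISD::CTPOP,  MVT::i32,   Expand); // use the generic expansion
  setOperationAction(ISD::SELECT, MVT::i32,   Custom); // routed to LowerOperation()
  setOperationAction(ISD::ADD,    MVT::v4i32, Legal);  // natively supported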
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math flag is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:720
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition: Triple.h:772
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, DriverKit, XROS, or bridgeOS).
Definition: Triple.h:609
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
LLVM_ABI uint64_t getArrayNumElements() const
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1866
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:35
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
use_iterator use_begin()
Definition: Value.h:364
bool use_empty() const
Definition: Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1101
iterator_range< use_iterator > uses()
Definition: Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:232
bool hasAnyFMA() const
Definition: X86Subtarget.h:199
bool isOSWindows() const
Definition: X86Subtarget.h:325
bool isTargetMachO() const
Definition: X86Subtarget.h:292
bool isUEFI() const
Definition: X86Subtarget.h:323
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:217
bool hasSSE1() const
Definition: X86Subtarget.h:189
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
Definition: X86Subtarget.h:280
bool hasBitScanPassThrough() const
Definition: X86Subtarget.h:265
bool isPICStyleGOT() const
Definition: X86Subtarget.h:333
bool hasSSE42() const
Definition: X86Subtarget.h:194
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:118
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:277
bool canUseCMOV() const
Definition: X86Subtarget.h:188
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:336
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:301
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:181
bool isTargetDarwin() const
Definition: X86Subtarget.h:284
bool isTargetWin64() const
Definition: X86Subtarget.h:329
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:176
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:282
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:122
bool useAVX512Regs() const
Definition: X86Subtarget.h:249
bool hasSSE3() const
Definition: X86Subtarget.h:191
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:342
bool hasAVX512() const
Definition: X86Subtarget.h:197
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:228
bool hasSSE41() const
Definition: X86Subtarget.h:193
bool isTargetELF() const
Definition: X86Subtarget.h:290
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:205
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:182
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:190
bool hasSSSE3() const
Definition: X86Subtarget.h:192
bool hasInt256() const
Definition: X86Subtarget.h:198
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:334
bool isTargetCygMing() const
Definition: X86Subtarget.h:321
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:288
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:132
bool hasAVX() const
Definition: X86Subtarget.h:195
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:313
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:221
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:317
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:124
bool useBWIRegs() const
Definition: X86Subtarget.h:258
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:196
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if the ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y -> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
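A sketch of what an override of this hook could look like for a hypothetical target class (MyTargetLowering is an assumed name, not the actual X86 code):
  bool MyTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
    // Claim a bit-test instruction only for scalar integer operands.
    return X.getValueType().isScalarInteger();
  }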
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicitly zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
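Many of the overrides listed above are reached indirectly: generic DAG combines query SelectionDAG, which dispatches to the X86 hooks (for example computeKnownBitsForTargetNode and ComputeNumSignBitsForTargetNode) when it meets target-specific nodes. A minimal sketch, assuming a combine where DAG, Op, and DL are in scope (names are illustrative, not code from this file):

  // Generic query; for X86ISD nodes this ends up in the overrides above.
  KnownBits Known = DAG.computeKnownBits(Op);
  unsigned SignBits = DAG.ComputeNumSignBits(Op);
  if (Known.isConstant()) // every bit proven zero or one
    return DAG.getConstant(Known.getConstant(), DL, Op.getValueType());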
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
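A minimal sketch of the DenseSet API referenced above; the container and values are illustrative:

  #include "llvm/ADT/DenseSet.h"
  llvm::DenseSet<unsigned> Seen;
  bool Inserted = Seen.insert(42).second; // pair's bool is true on first insertion
  bool Has = Seen.contains(42);           // membership test, no insertion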
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:3009
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:256
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
Definition: ISDOpcodes.h:1236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1232
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:45
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1401
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1379
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1265
@ ConstantFP
Definition: ISDOpcodes.h:87
@ STRICT_FATAN2
Definition: ISDOpcodes.h:441
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1381
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ STRICT_FCEIL
Definition: ISDOpcodes.h:454
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1382
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:140
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1112
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:515
@ STRICT_FTANH
Definition: ISDOpcodes.h:444
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
Definition: ISDOpcodes.h:1098
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:1020
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:167
@ GlobalAddress
Definition: ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1364
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1338
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1343
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:505
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:985
@ STRICT_FLOG2
Definition: ISDOpcodes.h:449
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1377
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1378
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1309
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1574
@ GlobalTLSAddress
Definition: ISDOpcodes.h:89
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1212
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:151
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:957
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ STRICT_FASIN
Definition: ISDOpcodes.h:438
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:117
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1157
@ STRICT_FATAN
Definition: ISDOpcodes.h:440
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:773
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1331
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1090
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:809
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1187
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:347
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1380
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1432
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:663
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1261
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:458
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:952
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:988
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:987
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1375
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:463
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:452
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1321
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:928
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:453
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ STRICT_FSINH
Definition: ISDOpcodes.h:442
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1383
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:130
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1325
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ ConstantPool
Definition: ISDOpcodes.h:92
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ STRICT_FROUND
Definition: ISDOpcodes.h:456
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:477
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1413
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:996
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:455
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:457
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:994
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:110
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1373
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:470
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1374
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1292
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1117
@ STRICT_FCOSH
Definition: ISDOpcodes.h:443
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:997
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:420
@ STRICT_FLOG10
Definition: ISDOpcodes.h:448
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition: ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ STRICT_FEXP2
Definition: ISDOpcodes.h:446
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1372
@ ExternalSymbol
Definition: ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition: ISDOpcodes.h:690
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:434
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:903
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:979
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1256
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1180
@ BlockAddress
Definition: ISDOpcodes.h:94
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:815
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1433
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ AssertZext
Definition: ISDOpcodes.h:63
@ STRICT_FRINT
Definition: ISDOpcodes.h:450
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1122
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:713
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1315
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ STRICT_FACOS
Definition: ISDOpcodes.h:439
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
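The ISD opcodes above are normally created through SelectionDAG::getNode and its convenience wrappers. A hedged sketch, assuming a lowering hook where DAG, DL, and two i32 values A and B are available (illustrative only):

  SDValue ExtA = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, A);
  SDValue ExtB = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, B);
  SDValue Sum  = DAG.getNode(ISD::ADD, DL, MVT::i64, ExtA, ExtB);
  SDValue Cond = DAG.getSetCC(DL, MVT::i1, Sum, ExtA, ISD::SETULT);
  SDValue Sel  = DAG.getSelect(DL, MVT::i64, Cond, Sum, ExtA);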
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1767
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1762
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1578
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1749
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1724
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1691
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1671
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1730
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
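A minimal sketch of how the ISD node predicates above are typically used to guard a DAG combine; N is assumed to be the SDNode* being visited:

  if (ISD::isNormalLoad(N)) {           // non-extending, unindexed load
    auto *Ld = cast<LoadSDNode>(N);
    if (Ld->isSimple())                 // neither volatile nor atomic
      ; // candidate for folding into a memory operand
  }
  APInt Splat;
  if (ISD::isConstantSplatVector(N, Splat) && Splat.isZero())
    ; // all-zeros splat BUILD_VECTOR / SPLAT_VECTOR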
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:664
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition: PatternMatch.h:980
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
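A minimal sketch of the IR-level PatternMatch helpers above: recognize an and-not, "(X ^ -1) & Y", in either operand order and bind the operands. The function name is hypothetical:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  static bool matchAndNot(Value *V, Value *&X, Value *&Y) {
    // m_c_And / m_c_Xor accept their operands in either order.
    return match(V, m_c_And(m_c_Xor(m_Value(X), m_AllOnes()), m_Value(Y)));
  }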
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
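A hedged sketch of the RTLIB helpers above: pick the libcall for an f80 to u64 conversion; a real caller would then emit it, e.g. via TargetLowering::makeLibCall:

  RTLIB::Libcall LC = RTLIB::getFPTOUINT(MVT::f80, MVT::i64);
  if (LC == RTLIB::UNKNOWN_LIBCALL)
    report_fatal_error("no libcall available for this conversion");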
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:58
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:217
@ FS
Definition: X86.h:214
@ PTR64
Definition: X86.h:218
@ PTR32_SPTR
Definition: X86.h:216
@ GS
Definition: X86.h:213
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:411
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:391
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:488
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:450
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:432
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:456
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:438
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:476
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:403
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:363
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:472
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:460
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:425
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:480
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:444
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:419
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:387
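A minimal sketch of how the X86II::MO_* operand flags above are classified with the X86InstrInfo.h helpers that appear later in this index (isGlobalStubReference, isGlobalRelativeToPICBase); the flag value is illustrative:

  unsigned char TF = X86II::MO_GOTPCREL;
  bool ViaStub   = isGlobalStubReference(TF);     // reference goes through a GOT/stub entry
  bool PICOffset = isGlobalRelativeToPICBase(TF); // reference is relative to the PIC base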
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
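A minimal sketch of the llvm::X86 helpers above: invert a condition code and test an operand for the canonical zero pattern. Op is assumed to be an SDValue in scope:

  X86::CondCode CC  = X86::COND_E;
  X86::CondCode Inv = X86::GetOppositeBranchCondition(CC); // X86::COND_NE
  bool IsZero = X86::isZeroNode(Op); // constant integer 0 or FP +0.0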
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
constexpr double e
Definition: MathExtras.h:47
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:121
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:477
@ Length
Definition: DWP.cpp:477
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:139
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:307
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1605
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:355
@ SM_SentinelUndef
@ SM_SentinelZero
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either a integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:270
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
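A minimal sketch of the decoder just listed: bit i of the immediate selects the second source for element i, and second-source lanes are numbered NumElts + i, so a 4-element blend with immediate 0b0101 decodes as below:

  SmallVector<int, 8> Mask;
  DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
  // Mask == {4, 1, 6, 3}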
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
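A hedged sketch of addFrameReference combined with BuildMI: load a 32-bit GPR from a stack slot. MBB, MI, DL, TII, DestReg, and FrameIdx are assumed to be available, e.g. inside a custom inserter:

  addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV32rm), DestReg),
                    FrameIdx);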
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition: STLExtras.h:2095
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1587
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:295
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1796
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:157
unsigned M1(unsigned Val)
Definition: VE.h:377
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:203
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:164
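A minimal sketch of the bit-math helpers above: split a 64-bit immediate into 32-bit halves and, when it is a power of two, recover the exact shift amount. The function is illustrative:

  static void inspectImm(uint64_t Imm) {
    uint32_t HiHalf = llvm::Hi_32(Imm);
    uint32_t LoHalf = llvm::Lo_32(Imm);
    if (llvm::isPowerOf2_64(Imm)) {
      unsigned ShAmt = llvm::Log2_64(Imm); // exact, since Imm has one bit set
      (void)ShAmt;
    }
    (void)HiHalf;
    (void)LoHalf;
  }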
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2013
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1879
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1973
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:376
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1854
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction -- th...
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
const char * toString(DWARFSectionKind Kind)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2127
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ TRUNCATE2_TO_REG
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
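The shuffle decoders listed in this section all follow the same shape: given an element count, a scalar width, and (where relevant) an immediate, they append one int per destination lane to a SmallVectorImpl&lt;int&gt;, with -1 marking undefined lanes. A hedged sketch of driving DecodePSHUFMask for a v4i32 pshufd; the include path is illustrative, since the decoder is declared in the X86 MCTargetDesc library and is normally only used from inside the backend:

#include "llvm/ADT/SmallVector.h"
// Illustrative include; the declaration lives in the X86 MCTargetDesc
// library's X86ShuffleDecode.h.
#include "MCTargetDesc/X86ShuffleDecode.h"
using namespace llvm;

SmallVector<int, 4> decodePshufdImm(unsigned Imm) {
  SmallVector<int, 4> Mask;
  // pshufd operates on v4i32: 4 elements of 32 bits each. Each 2-bit
  // field of Imm selects the source element for one destination lane.
  DecodePSHUFMask(/*NumElts=*/4, /*ScalarBits=*/32, Imm, Mask);
  return Mask; // e.g. Imm = 0x1B (0b00011011) yields {3, 2, 1, 0}
}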
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1629
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset], i.e., one with no scale or index, but with a displacement.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
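The SDValue predicates in this listing (isOneConstant, isAllOnesConstant, isNullConstantOrUndef, isConstOrConstSplat, ...) are the usual guards at the top of a DAG combine. A small illustrative check, not taken from this file, that recognizes a select between all-ones and zero:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Matches "select Cond, -1, 0/undef", a shape that combines commonly
// turn into a sign-extension of the condition (illustrative only).
static bool isSelectOfAllOnesAndZero(SDValue N) {
  return N.getOpcode() == ISD::SELECT &&
         isAllOnesConstant(N.getOperand(1)) &&
         isNullConstantOrUndef(N.getOperand(2));
}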
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
#define EQ(a, b)
Definition: regexec.c:112
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:320
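The APFloat semantics tags and rounding modes above are what constant folding of FP conversions relies on. A self-contained sketch, independent of this file, of a round-trip check through IEEE half:

#include "llvm/ADT/APFloat.h"
using namespace llvm;

// True if D is exactly representable as an IEEE-754 binary16 value.
bool fitsInHalfExactly(double D) {
  APFloat F(D);
  bool LosesInfo = false;
  APFloat::opStatus St =
      F.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  return St == APFloat::opOK && !LosesInfo;
}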
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
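Align (above) and the free function commonAlignment listed earlier combine as follows; a minimal example, not specific to this file:

#include <cstdint>
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Alignment still guaranteed after stepping Offset bytes from a pointer
// that is known to be A-aligned.
Align alignAfterOffset(Align A, uint64_t Offset) {
  return commonAlignment(A, Offset); // commonAlignment(Align(16), 4) == Align(4)
}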
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:299
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition: ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:330
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
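The EVT accessors above form the vocabulary the lowering code uses to reason about value types before legalization. A short sketch using only the public EVT API listed here:

#include <cassert>
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

void evtDemo(LLVMContext &Ctx) {
  // v8f32: an 8-element float vector, 256 bits wide.
  EVT V = EVT::getVectorVT(Ctx, MVT::f32, 8);
  assert(V.is256BitVector() && V.isFloatingPoint());

  // Same shape with integer elements: v8i32.
  EVT VInt = V.changeVectorElementTypeToInteger();

  // The half-width sibling used when splitting: v4i32.
  EVT VHalf = VInt.getHalfNumVectorElementsVT(Ctx);
  assert(VHalf.getVectorNumElements() == 4 && VHalf.isInteger());
  (void)VInt; (void)VHalf;
}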
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:294
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from llvm.sadd.sat(LHS, RHS).
Definition: KnownBits.cpp:764
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:487
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:179
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
Definition: KnownBits.cpp:916
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:101
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:235
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition: KnownBits.h:267
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:154
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:282
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:165
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition: KnownBits.h:104
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:228
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:218
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:289
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:304
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute known bits resulting from addition of LHS and RHS.
Definition: KnownBits.h:340
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:189
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:241
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:138
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:98
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:803
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:525
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:60
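The KnownBits interface listed above is how the target reports and consumes bit-level facts in computeKnownBitsForTargetNode and the related combines. A small sketch, not tied to this file, of composing two of the queries:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// True when the low 16 bits are provably zero.
bool lowHalfKnownZero(const KnownBits &Known) {
  return Known.countMinTrailingZeros() >= 16;
}

void knownBitsDemo() {
  KnownBits A = KnownBits::makeConstant(APInt(32, 0x00FF0000));
  KnownBits B = KnownBits::makeConstant(APInt(32, 0x0F0F0000));
  // Facts valid for both inputs, e.g. for the two arms of a select.
  KnownBits Common = A.intersectWith(B);
  bool Ok = lowHalfKnownZero(Common); // true: both have >= 16 trailing zeros
  (void)Ok;
}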
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
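MachinePointerInfo (above) tags a memory operand with what it points at, and the static helpers build the common cases. An illustrative one-liner, assuming a MachineFunction is already in hand:

#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// Describe "constant-pool base + Offset" for a load's memory operand.
MachinePointerInfo constantPoolSlot(MachineFunction &MF, int64_t Offset) {
  return MachinePointerInfo::getConstantPool(MF).getWithOffset(Offset);
}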
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.